Feat: added classification with number tokens

- added analysis for overall statistics
This commit is contained in:
Richard Wong 2025-01-09 23:13:24 +09:00
parent 1b6659a600
commit 1b9c4323c3
70 changed files with 1394 additions and 342 deletions

View File

@@ -13,6 +13,10 @@ full_df
# %%
mdm_list
# %%
mdm_list = sorted(set(full_df['thing'] + full_df['property']))
# %%
mdm_list
# %%
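# inspect the rows that share a single pattern, e.g. 'GE#Flow FGMassFlow'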
mask = full_df['pattern'] == 'GE#Flow FGMassFlow'
full_df[mask]

View File

@@ -1,13 +0,0 @@
# %%
import pandas as pd
# %%
data_path = '../../data_import/exports/raw_data.csv'
df = pd.read_csv(data_path)
# %%
df
# %%
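# count the number of unique ships in the raw import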
len(set(df['ships_idx']))
# %%

View File

@@ -0,0 +1,58 @@
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %%
# data_path = '../../data_import/exports/raw_data.csv'
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
df = pd.read_csv(data_path)
# %%
df = df[df['MDM']].reset_index(drop=True)
# %%
# we want to compute the string lengths
# and print their summary statistics
def summary_stats(arr):
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }
# %%
ship_domain_data = df['tag_description'] + df['unit'].fillna('')
ship_domain_array = np.array([len(item) for item in ship_domain_data])
stats = summary_stats(ship_domain_array)
for key, value in stats.items():
    print(f"{key}: {value}")
# %%
plt.hist(ship_domain_array, bins=50)
# %%
platform_domain_data = df['thing'] + df['property']
platform_domain_array = np.array([len(item) for item in platform_domain_data])
stats = summary_stats(platform_domain_array)
for key, value in stats.items():
    print(f"{key}: {value}")
# %%

analysis/delta_analysis/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
exports

View File

@@ -0,0 +1,62 @@
# %%
import pandas as pd
import numpy as np
# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(set(full_df['thing'] + full_df['property']))
# %%
fold = 5
file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
df_bert = pd.read_csv(file_path)
# %%
file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
# file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
df_t5 = pd.read_csv(file_path)
df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True)
df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
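# a t5 prediction is considered in-vocab only if its concatenated
# thing+property string is an existing MDM class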
df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)
# %%
df_t5['bert_prediction'] = df_bert['class_prediction']
df_bert['t5_prediction'] = df_t5['class_prediction']
# %%
bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
# %%
t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])
# %%
sum(t5_correct)/len(t5_correct)
# %%
# replace t5 not in vocab with bert values
t5_correct_modified = t5_correct.copy()
condition = ~df_t5['in_vocab']
t5_correct_modified[condition] = np.array(bert_correct[condition])
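# e.g. if t5 generates an out-of-vocab string, that row's correctness flag is
# taken from bert instead, which is equivalent to backing off to the bert
# prediction for that row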
# %%
# new replacement correctness
sum(t5_correct_modified)/len(t5_correct_modified)
# %%
# when bert is correct and t5 is wrong
cond_mask = bert_correct & (~t5_correct)
print(sum(cond_mask))
print(df_t5[cond_mask].to_string())
# %%
# when bert is wrong and t5 is correct
cond_mask = (~bert_correct) & (t5_correct)
print(sum(cond_mask))
print(df_bert[cond_mask].to_string())
# %%
# when both are wrong
cond_mask = (~bert_correct) & (~t5_correct)
print(sum(cond_mask))
# %%

View File

@@ -0,0 +1,72 @@
# %%
import pandas as pd
import numpy as np
# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(set(full_df['thing'] + full_df['property']))
# %%
def run_mdm(fold):
    file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
    df_bert = pd.read_csv(file_path)
    df_bert = df_bert[df_bert['MDM']].reset_index(drop=True)
    file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    # file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df_t5 = pd.read_csv(file_path)
    df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True)
    df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
    df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)
    df_t5['bert_prediction'] = df_bert['class_prediction']
    df_bert['t5_prediction'] = df_t5['class_prediction']
    bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
    t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])
    t5_original_accuracy = sum(t5_correct)/len(t5_correct)
    # replace t5 not in vocab with bert values
    t5_correct_modified = t5_correct.copy()
    condition = ~df_t5['in_vocab']
    t5_correct_modified[condition] = np.array(bert_correct[condition])
    pd.Series(t5_correct_modified).to_csv(f'exports/result_group_{fold}.csv')
    t5_new_accuracy = sum(t5_correct_modified)/len(t5_correct_modified)
    print('original accuracy', t5_original_accuracy)
    print('new accuracy', t5_new_accuracy)
# %%
# this does replacement for the full prediction
def run_full(fold):
    file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
    df_bert = pd.read_csv(file_path)
    file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    # file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df_t5 = pd.read_csv(file_path)
    df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
    df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)
    df_t5['bert_prediction'] = df_bert['class_prediction']
    df_bert['t5_prediction'] = df_t5['class_prediction']
    bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
    t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])
    # replace t5 not in vocab with bert values
    t5_correct_modified = t5_correct.copy()
    condition = ~df_t5['in_vocab']
    t5_correct_modified[condition] = np.array(bert_correct[condition])
    pd.Series(t5_correct_modified, name='grounded_pred').to_csv(f'exports/result_group_{fold}.csv')
# %%
for fold in [1,2,3,4,5]:
    run_mdm(fold)
    run_full(fold)
# %%

View File

@@ -0,0 +1,67 @@
,thing,property,ships_idx,tag_name,tag_description,signal_type,min,max,unit,data_type,thing_pattern,property_pattern,pattern,MDM,class_prediction
6,SB1Flow,FOMassFlowTotal,1003,FM6_XI001_Y,AUXILIARY BOILER FUEL OIL TOTAL FLOW RATE,AI,0,0,FLOW,1304.0,SB#Flow,FOMassFlowTotal,SB#Flow FOMassFlowTotal,True,SB1FlowFOMassFlowIn
38,ShipBoiler3,RunningState,1030,BC330,COMPOSITE BOILER FIRING,DI,0,0,NOVALUE,1301.0,ShipBoiler#,RunningState,ShipBoiler# RunningState,True,ShipBoiler1RunningState
61,GeneratorEngine5,CBNonClosed,1003,PMS_5ACBNCL_Y,NO5 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine5RunningState
72,CargoHandling,BoostPp_Port_Current,1018,IT_1400_Y,MP1400 BOOSTER PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,BoostPp_Port_Current,CargoHandling BoostPp_Port_Current,True,CargoHandlingBoostPp_Stbd_Current
81,Navigation,MidPDraft,1018,TL_200002_Y,MID DRAFTP_LV,A,0,0,NOVALUE,1310.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft
86,ShipBoiler1,FOInletTemp,1018,AB_000001_Y,BOILER FUEL OIL IN BURNER_TEMP,A,0,0,NOVALUE,1310.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
140,Navigation,MidPDraft,1003,DCM_P3_Y,DRAUGHT MID PS (DRAFT SENSOR),AI,0,0,m ,1304.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft
174,ShipBoiler1,FOInletPress,1051,MB.YO.IAS.Q3.40224,BOILER FUEL OIL IN BURNER_PRESS,Analog,0,4,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress
200,GeneratorEngine3,VoltageB,1050,MB.KM.IAS.Q3.A40193,NO3 GENERATOR_ENGINE(B) GEN VOLTAGE,AO,0,655,VOLTAGE,1300.0,GeneratorEngine#,VoltageB,GeneratorEngine# VoltageB,True,GeneratorEngine3Voltage
342,EngineRoom,AirTemp,1018,MA_TT8612_Y,MAIN_ENGINE AMBIENT_TEMP,A,0,0,NOVALUE,1310.0,EngineRoom,AirTemp,EngineRoom AirTemp,True,GeneratorEngine1CBTrip
395,GeneratorEngine3,SAPress,1036,MB.KM.IAS.Q2.400121,NO3 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine3WindingTempR
396,MainEngine1,RPM,1051,MB.YO.IAS.Q1.40006,M/E_RPM,Analog,-120,120,RPM,1304.0,MainEngine#,RPM,MainEngine# RPM,True,Shaft1RPM
653,ShipBoiler1,FOInletTemp,1033,CB014,COMPOSITE BOILER FUEL OIL TEMPERATURE,AI,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
731,GeneratorEngine4,CBNonClosed,1003,PMS_4ACBNCL_Y,NO4 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine4CBClosed
745,ShipBoiler1,FOInletPress,1018,AB_000002_Y,BOILER FUEL OIL IN BURNER PRESSURE,A,0,0,PRESSURE,1310.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress
783,GeneratorEngine1,LOFilterInletPress,1030,GA069,NO1 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine1LOInletPress
786,GeneratorEngine1,FOFilterInletPress,1030,GA085,NO1 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine1FOInletPress
812,GE1Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400031,GENERATOR_ENGINE FUEL OIL VISCOSITY INDICATION,AO,0,2346,VOLUME FLOW,1304.0,GE#Flow,FOViscosity,GE#Flow FOViscosity,True,GE1FlowFOVolumeFlowIn
813,ME2Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400025,MAIN_ENGINE(P) FUEL OIL VISCOSITY INDICATION,AO,0,2285,VOLUME FLOW,1304.0,ME#Flow,FOViscosity,ME#Flow FOViscosity,True,ME2FlowFOVolumeFlowIn
840,GeneratorEngine1,SAPress,1036,MB.KM.IAS.Q1.400051,NO1 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine1WindingTempR
891,GE1Flow,FOMassFlowIn,1051,MB.YO.IAS.Q2.40103,GENERATOR_ENGINE HFO_FLOW,Analog,0,1800,MASS FLOW,1304.0,GE#Flow,FOMassFlowIn,GE#Flow FOMassFlowIn,True,GE1FlowFGMassFlow
935,ShipBoiler1,FOInletTemp,1051,MB.YO.IAS.Q3.40223,BOILER FUEL OIL IN BURNER_TEMP,Analog,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
951,MainEngine2,CFWInletTemp,1020,MB.YO.IAS.Q1.A400388,MAIN_ENGINE(P) CYLINDER COOL WATER TEMPERATURE INLET,AO,-50,130,TEMPERATURE,1304.0,MainEngine#,CFWInletTemp,MainEngine# CFWInletTemp,True,MainEngine2Cy3CWTemp
1005,GeneratorEngine1,HFOUse,1051,MB.YO.IAS.Q1.10096,G/E_HFUEL OIL USE,Digital,0,1,-,1301.0,GeneratorEngine#,HFOUse,GeneratorEngine# HFOUse,True,MainEngine1HFOUse
1075,ME1Flow,FGMassFlow,1004,MB.YO.IAS.Q2.A400121,LP LPG FUEL P/P FLOW,AI,0,3500,MASS FLOW,1304.0,ME#Flow,FGMassFlow,ME#Flow FGMassFlow,True,ME2FlowFGMassFlow
1116,CargoHandling,LPGComp1MotorCurrent,1004,MB.YO.IAS.Q3.A400281,MP-2100 COMPRESSOR (P) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT3_DWPump_Port_Current
1117,CargoHandling,LPGComp2MotorCurrent,1004,MB.YO.IAS.Q3.A400282,MP-2200 COMPRESSOR (C) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT2_DWPump_Stbd_Current
1118,CargoHandling,LPGComp3MotorCurrent,1004,MB.YO.IAS.Q3.A400283,MP-2300 COMPRESSOR (S) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingBoostPp_Stbd_Current
1174,FuelOilSystem,LFOVolumeSettleTK,1003,LC_XI001_Y,NO2 LIGHT FUEL OIL SETTLING TANK VOLUME,AI,0,999999,VOLUME,1304.0,FuelOilSystem,LFOVolumeSettleTK,FuelOilSystem LFOVolumeSettleTK,True,FuelOilSystemLFOVolumeStorageTK2P
1198,GeneratorEngine4,BearingNDETemp1,1003,GE4_TIAH6_Y,NO4 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine4WindingTempT
1199,GeneratorEngine5,BearingNDETemp1,1003,GE5_TIAH6_Y,NO5 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine5WindingTempT
1200,MainEngine1,LoadPercent,1018,EG_0000005_Y,M/E_LOAD,D,0,0,%,1301.0,MainEngine#,LoadPercent,MainEngine# LoadPercent,True,GeneratorEngine2LoadPercent
1214,GE1TurboCharger1,ExhGasOutletTemp,1003,GE1_TE27_Y,NO1 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger1ExhGasOutletTemp
1226,GE2TurboCharger1,ExhGasOutletTemp,1003,GE2_TE27_Y,NO2 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger2ExhGasOutletTemp
1237,GE3TurboCharger1,ExhGasOutletTemp,1003,GE3_TE27_Y,NO3 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger3ExhGasOutletTemp
1246,GeneratorEngine3,BearingDETemp8,1003,GE3_TE698_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5
1247,GeneratorEngine3,BearingDETemp9,1003,GE3_TE699_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
1273,GeneratorEngine4,BearingDETemp8,1003,GE4_TE698_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
1274,GeneratorEngine4,BearingDETemp9,1003,GE4_TE699_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
1280,GeneratorEngine5,BearingDETemp2,1003,GE5_TE692_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP2,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2BearingDETemp6
1281,GeneratorEngine5,BearingDETemp3,1003,GE5_TE693_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP3,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5
1282,GeneratorEngine5,BearingDETemp4,1003,GE5_TE694_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP4,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp4
1283,GeneratorEngine5,BearingDETemp5,1003,GE5_TE695_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP5,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5
1284,GeneratorEngine5,BearingDETemp6,1003,GE5_TE696_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP6,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
1285,GeneratorEngine5,BearingDETemp7,1003,GE5_TE697_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP7,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
1286,GeneratorEngine5,BearingDETemp8,1003,GE5_TE698_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2Cy8KnockIntensity
1287,GeneratorEngine5,BearingDETemp9,1003,GE5_TE699_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
1298,ME1TurboCharger1,ExhGasInletTemp,1003,AMSI_TT3721A_Y,EXHAUST GAS TEMPERATURE BEFORE TURBOCHARGER 1,AI,0,600,TEMPERATURE,1304.0,ME#TurboCharger#,ExhGasInletTemp,ME#TurboCharger# ExhGasInletTemp,True,ME1TurboCharger1ExhGasOutletTemp
1309,GeneratorEngine2,LOFilterInletPress,1030,GB069,NO2 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine2LOInletPress
1472,GeneratorEngine3,VoltageA,1050,MB.KM.IAS.Q3.A40189,NO3 GENERATOR_ENGINE(A) GEN VOLTAGE,AO,0,654,VOLTAGE,1300.0,GeneratorEngine#,VoltageA,GeneratorEngine# VoltageA,True,GeneratorEngine3Voltage
1524,GeneratorEngine2,FOFilterInletPress,1030,GB085,NO2 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine2FOInletPress
1536,ShipBoiler1,FOInletTemp,1028,MB.KM.IAS.Q2.A400184,OIL TEMPERATURE (4-20MA),AI,0,200,°C,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,GeneratorEngine4WindingTempT
1537,ShipBoiler1,FOInletPress,1028,MB.KM.IAS.Q2.A400185,FUEL OIL PRESSURE (4-20MA),AI,0,40,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,GeneratorEngine4FOInletPress
1594,GeneratorEngine3,LOFilterInletPress,1030,GC069,NO3 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine3LOInletPress
1597,GeneratorEngine3,FOFilterInletPress,1030,GC085,NO3 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine3FOInletPress
1679,GeneratorEngine3,busBarVoltage,1003,PMS_3BUSVOLA_Y,BUS VOLTAGE,AI,0,10000,VOLTAGE,1304.0,GeneratorEngine#,busBarVoltage,GeneratorEngine# busBarVoltage,True,GeneratorEngine1busBarVoltage
1727,GeneratorEngine2,SAPress,1036,MB.KM.IAS.Q1.400086,NO2 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine2WindingTempR
1763,GeneratorEngine5,BearingDETemp1,1003,GE5_TE691_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP1,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5
1873,GeneratorEngine5,CBClosed,1003,PMS_5VCBCLED_Y,NO5 GENERATOR_ENGINE MVSB VCB CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBClosed,GeneratorEngine# CBClosed,True,GeneratorEngine5StopState
2034,CargoHandling,CT1_DWPump_Stbd_Current,1018,IT_1101_Y,MP1100 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current
2035,CargoHandling,CT2_DWPump_Port_Current,1018,IT_1200_Y,MP1200 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
2037,CargoHandling,CT3_DWPump_Stbd_Current,1018,IT_1501_Y,MP1501 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current
2038,CargoHandling,CT4_DWPump_Port_Current,1018,IT_1700_Y,MP1700 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
2048,GeneratorEngine5,RunningHour,1003,PMS_5GENWHRS_Y,NO5 GENERATOR_ENGINE WORKING HOURS,AI,0,10000,NOVALUE,1304.0,GeneratorEngine#,RunningHour,GeneratorEngine# RunningHour,True,GeneratorEngine4RunningHour
2057,CargoHandling,CT1_DWPump_Port_Current,1018,IT_1100_Y,MP1100 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
2079,ShipBoiler1,ExhGasOutletTemp,1003,EG_G02_Y,EXHAUST GAS ECONOMIZER EXHAUST GAS OUTLET TEMPERATURE,AI,0,600,TEMPERATURE,1304.0,ShipBoiler#,ExhGasOutletTemp,ShipBoiler# ExhGasOutletTemp,True,MainEngine1Cy1ExhGasOutletTemp

View File

@@ -0,0 +1,27 @@
type,fold,accuracy
1layer,1,0.8968291528632276
1layer,2,0.8859813084112149
1layer,3,0.9382530120481928
1layer,4,0.9586108468125595
1layer,5,0.8827301878149336
2layer,1,0.9318504495977283
2layer,2,0.8859813084112149
2layer,3,0.9678714859437751
2layer,4,0.9738344433872502
2layer,5,0.9015116811726981
4layer,1,0.9503076194983436
4layer,2,0.9135514018691588
4layer,3,0.9698795180722891
4layer,4,0.9790675547098002
4layer,5,0.907924874026569
6layer,1,0.9522006625650734
6layer,2,0.9093457943925234
6layer,3,0.9678714859437751
6layer,4,0.9814462416745956
6layer,5,0.890975721484196
8layer,1,0.9441552295314718
8layer,2,0.9121495327102803
8layer,3,0.963855421686747
8layer,4,0.9752616555661275
8layer,5,0.907924874026569

View File

@@ -0,0 +1,12 @@
type,fold,accuracy
normal,1,0.9522006625650734
normal,2,0.9093457943925234
normal,3,0.9678714859437751
normal,4,0.9814462416745956
normal,5,0.890975721484196
frozen,1,0.9342167534311405
frozen,2,0.883177570093458
frozen,3,0.963855421686747
frozen,4,0.9705042816365367
frozen,5,0.9051763628034815

View File

@@ -0,0 +1,199 @@
# %%
import pandas as pd
import numpy as np
####################################################################################
# stage 1
# %%
# stage 1a: binary classification
df_stage1a = pd.read_csv('stage1a.csv')
# %%
# desc only
mask = df_stage1a['type'] == 'desc'
df_stage1a[mask].describe().loc[['mean', 'std']]
# %%
# desc and unit
mask = df_stage1a['type'] == 'desc_unit'
df_stage1a[mask].describe().loc[['mean', 'std']]
# %%
# stage 1b: similarity-based classification
df_stage1b = pd.read_csv('stage1b.csv')
# %%
# desc only
mask = df_stage1b['type'] == 'desc'
df_stage1b[mask].describe().loc[['mean', 'std']]
# %%
# desc and unit
mask = df_stage1b['type'] == 'desc_unit'
df_stage1b[mask].describe().loc[['mean', 'std']]
# %%
#################################################################################
# stage 2: mapping model
# %%
# stage 2a: mapping by classification
df_stage2a = pd.read_csv('stage2a.csv')
# %%
# desc only
mask = df_stage2a['type'] == 'desc'
df_stage2a[mask].describe().loc[['mean', 'std']]
# %%
# desc and unit
mask = df_stage2a['type'] == 'desc_unit'
df_stage2a[mask].describe().loc[['mean', 'std']]
# %%
# stage 2b: mapping by seq2seq
df_stage2b = pd.read_csv('stage2b.csv')
# %%
# desc only
mask = df_stage2b['type'] == 'desc'
df_stage2b[mask].describe().loc[['mean', 'std']]
# %%
# desc and unit
mask = df_stage2b['type'] == 'desc_unit'
df_stage2b[mask].describe().loc[['mean', 'std']]
############################
# frozen encoder
# %%
df = pd.read_csv('frozen_encoder.csv')
# %%
# normal
mask = df['type'] == 'normal'
df[mask].describe().loc[['mean', 'std']]
# %%
# frozen
mask = df['type'] == 'frozen'
df[mask].describe().loc[['mean', 'std']]
# %%
############################
# decoder scaling
# %%
df = pd.read_csv('decoder_scaling.csv')
# %%
# 1 layer
mask = df['type'] == '1layer'
df[mask].describe().loc[['mean', 'std']]
# %%
# 2 layer
mask = df['type'] == '2layer'
df[mask].describe().loc[['mean', 'std']]
# %%
# 4 layer
mask = df['type'] == '4layer'
df[mask].describe().loc[['mean', 'std']]
# %%
# 6 layer
mask = df['type'] == '6layer'
df[mask].describe().loc[['mean', 'std']]
# %%
# 8 layer
mask = df['type'] == '8layer'
df[mask].describe().loc[['mean', 'std']]
# %%
#########################
# compute overall result
# $\frac{1808}{2113} = 0.856$ & $\frac{10692}{10961} = 0.975$ & $\frac{12500}{13074} = 0.956$ \\
# $\frac{1932}{2140} = 0.903$ & $\frac{8304}{8582} = 0.968$ & $\frac{10236}{10722} = 0.955$ \\
# $\frac{1789}{1992} = 0.898$ & $\frac{7613}{7863} = 0.968$ & $\frac{9402}{9855} = 0.954$ \\
# $\frac{1967}{2102} = 0.936$ & $\frac{12929}{13349} = 0.969$ & $\frac{14896}{15451} = 0.964$ \\
# $\frac{1915}{2183} = 0.877$ & $\frac{10381}{10786} = 0.962$ & $\frac{12296}{12969} = 0.948$ \\
# %%
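# matrix columns: relevant correct, relevant total, non-relevant correct,
# non-relevant total, overall total (values taken from the rows above)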
matrix = np.array([
    [1808, 2113, 10692, 10961, 13074],
    [1932, 2140, 8304, 8582, 10722],
    [1789, 1992, 7613, 7863, 9855],
    [1967, 2102, 12929, 13349, 15451],
    [1915, 2183, 10381, 10786, 12969]
])
# %%
relevant_class = matrix[:,0]/matrix[:,1]
print(relevant_class)
print(np.std(relevant_class))
# %%
non_relevant_class = matrix[:,2]/matrix[:,3]
print(non_relevant_class)
print(np.std(non_relevant_class))
# %%
numerator = (matrix[:,0] + matrix[:,2])
denominator = (matrix[:,1] + matrix[:,3])
print(numerator)
print(denominator) # same as last column
overall = numerator/denominator
print(overall)
print(np.std(overall))
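# sanity check (illustrative): the overall accuracy must equal the
# count-weighted average of the relevant and non-relevant class accuracies
weighted = (matrix[:,1]*relevant_class + matrix[:,3]*non_relevant_class) / (matrix[:,1] + matrix[:,3])
assert np.allclose(weighted, overall)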
######################
# compute mapping result
# %%
# $\frac{1761}{1808} = 0.974$ \\
# $\frac{1802}{1932} = 0.933$ \\
# $\frac{1760}{1789} = 0.984$ \\
# $\frac{1945}{1967} = 0.989$ \\
# $\frac{1837}{1915} = 0.959$ \\
matrix = np.array([
    [1761, 1808],
    [1802, 1932],
    [1760, 1789],
    [1945, 1967],
    [1837, 1915]
])
# %%
result = matrix[:,0]/matrix[:,1]
print(result)
print(np.mean(result))
print(np.std(result))
# %%
####################################
# compute overall result
# & 1761 & 10692 & $\frac{1761 + 10692}{13074} = 0.953$ \\
# & 1802 & 8304 & $\frac{1802 + 8304}{10722} = 0.943$ \\
# & 1760 & 7613 & $\frac{1760 + 7613}{9855} = 0.951$ \\
# & 1945 & 12929 & $\frac{1945 + 12929}{15451} = 0.963$ \\
# & 1837 & 10381 & $\frac{1837 + 10381}{12969} = 0.942$ \\
matrix = np.array([
    [1761, 10692, 13074],
    [1802, 8304, 10722],
    [1760, 7613, 9855],
    [1945, 12929, 15451],
    [1837, 10381, 12969]
])
# %%
overall = (matrix[:,0] + matrix[:,1])/matrix[:,2]
print(overall)
print(np.mean(overall))
print(np.std(overall))
# %%

View File

@@ -0,0 +1,11 @@
type,fold,accuracy,f1_score,precision,recall
desc,1,0.92588,0.74001,0.85440,0.65263
desc,2,0.88733,0.64239,0.87641,0.50701
desc,3,0.90583,0.71429,0.92357,0.58233
desc,4,0.93114,0.70929,0.83312,0.61751
desc,5,0.91171,0.67683,0.88162,0.54924
desc_unit,1,0.95610,0.86301,0.87049,0.85566
desc_unit,2,0.95467,0.88828,0.87421,0.90280
desc_unit,3,0.95403,0.88762,0.87739,0.89809
desc_unit,4,0.96408,0.87636,0.82405,0.93578
desc_unit,5,0.94811,0.85054,0.82543,0.87723

View File

@@ -0,0 +1,11 @@
type,fold,accuracy,f1_score,precision,recall
desc,1,0.93162,0.79580,0.76909,0.82442
desc,2,0.92884,0.82440,0.81224,0.83692
desc,3,0.93201,0.83375,0.82434,0.84337
desc,4,0.94259,0.80937,0.73814,0.89581
desc,5,0.92228,0.78397,0.73661,0.83784
desc_unit,1,0.93353,0.79945,0.78018,0.81969
desc_unit,2,0.92184,0.81006,0.78653,0.83505
desc_unit,3,0.91821,0.80513,0.77659,0.83584
desc_unit,4,0.93334,0.78675,0.69648,0.90390
desc_unit,5,0.93084,0.80445,0.76747,0.84517

View File

@@ -0,0 +1,11 @@
type,fold,accuracy
desc,1,0.93706
desc,2,0.88785
desc,3,0.96285
desc,4,0.95861
desc,5,0.89601
desc_unit,1,0.94226
desc_unit,2,0.90561
desc_unit,3,0.96436
desc_unit,4,0.96955
desc_unit,5,0.90289

View File

@@ -0,0 +1,16 @@
type,fold,accuracy
desc,1,0.9427354472314246
desc,2,0.8981308411214953
desc,3,0.9588353413654619
desc,4,0.9633682207421503
desc,5,0.8928080622995878
desc_unit,1,0.9578797917652626
desc_unit,2,0.9088785046728972
desc_unit,3,0.9673694779116466
desc_unit,4,0.9785918173168411
desc_unit,5,0.8918918918918919

View File

@@ -0,0 +1 @@
*.png

View File

@@ -41,13 +41,26 @@ distance_array
# %%
plt.rcParams.update({'font.size': 14}) # Adjust the size as needed
plt.figure(figsize=(8, 6))
plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
# Add arrow for increasing dissimilarity
plt.annotate(
    "Decreasing Similarity",  # Text label
    xy=(0.7, 500),            # Arrow end (near the end of x-axis)
    xytext=(0.4, 500),        # Arrow start (near the middle of x-axis)
    arrowprops=dict(arrowstyle="->", lw=2, color="black"),  # Arrow style
    va='center',              # needed to make arrow centered
    fontsize=14,              # Font size for the text
    color="black"             # Text color
)
# Add arrows and text
plt.savefig("input_output_similarity.png", dpi=300)
#
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):

View File

@@ -58,12 +58,24 @@ score_list
# %%
# plt.hist(score_list, bins=50)
plt.rcParams.update({'font.size': 14}) # Adjust the size as needed
plt.figure(figsize=(8, 6))
plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Platform Domain Class Count")
# Add arrow for increasing dissimilarity
plt.annotate(
    "Decreasing Similarity",  # Text label
    xy=(0.7, 70),             # Arrow end (near the end of x-axis)
    xytext=(0.2, 70),         # Arrow start (near the middle of x-axis)
    arrowprops=dict(arrowstyle="->", lw=2, color="black"),  # Arrow style
    va='center',              # needed to make arrow centered
    fontsize=14,              # Font size for the text
    color="black"             # Text color
)
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
plt.savefig("within_class_similarity.png", dpi=300)
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):

View File

@@ -0,0 +1,26 @@
# %%
import pandas as pd
# %%
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
# %%
df_in = full_df[full_df['MDM']].reset_index(drop=True)
# %%
df_out = full_df[~full_df['MDM']].reset_index(drop=True)
# %%
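# compare the unit distributions inside and outside the MDM subset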
label_counts_in = df_in['unit'].value_counts()
print(label_counts_in.to_string())
# %%
label_counts_out = df_out['unit'].value_counts()
print(label_counts_out.to_string())
# %%
label_counts_out['NOVALUE']/len(df_out)
# %%
label_counts_in['NOVALUE']/len(df_in)
# %%

View File

@@ -9,14 +9,19 @@ def run(fold):
    df = pd.read_csv(data_path, skipinitialspace=True)
    p_mdm = df['p_mdm']
    # data_path = f'../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv'
    data_path = f'../train/modified_t5_decoder_4_layers/mapping_prediction/exports/result_group_{fold}.csv'
    data_path = f'../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df = pd.read_csv(data_path, skipinitialspace=True)
    actual_mdm = df['MDM']
    thing_correctness = df['thing'] == df['p_thing']
    property_correctness = df['property'] == df['p_property']
    answer = thing_correctness & property_correctness
    # grounded labels: 'grounded_pred' is the boolean correctness series
    # exported by analysis/delta_analysis
    data_path = f'../analysis/delta_analysis/exports/result_group_{fold}.csv'
    df_grounded = pd.read_csv(data_path, skipinitialspace=True)
    answer = df_grounded['grounded_pred']
    # original labels
    # thing_correctness = df['thing'] == df['p_thing']
    # property_correctness = df['property'] == df['p_property']
    # answer = thing_correctness & property_correctness
    ##############
    # evaluate relevant-class prediction performance
@@ -53,6 +58,13 @@ def run(fold):
    print(mapping_rate)
    print('size', correct_positive_mdm_and_map, '/', sum(p_mdm & actual_mdm))
    # evaluate relevant mappings
    correct_positive_mdm_and_map = sum(p_mdm & actual_mdm & answer)
    mapping_rate = correct_positive_mdm_and_map / sum(actual_mdm)
    print('relevant data mapping rate')
    print(mapping_rate)
    print('size', correct_positive_mdm_and_map, '/', sum(actual_mdm))
    ##############
    # evaluate overall pipeline result
@@ -76,3 +88,5 @@ for fold in [1,2,3,4,5]:
    print('*' * 40)
    run(fold)
# %%

View File

@@ -179,8 +179,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,

View File

@@ -180,8 +180,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,

train/class_number_tokens/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log

View File

@@ -0,0 +1 @@
exports

View File

@@ -0,0 +1,31 @@
********************************************************************************
Fold: 1
Accuracy: 0.94510
F1 Score: 0.94087
Precision: 0.94623
Recall: 0.94510
********************************************************************************
Fold: 2
Accuracy: 0.91682
F1 Score: 0.91698
Precision: 0.92824
Recall: 0.91682
********************************************************************************
Fold: 3
Accuracy: 0.96185
F1 Score: 0.95743
Precision: 0.96001
Recall: 0.96185
********************************************************************************
Fold: 4
Accuracy: 0.97479
F1 Score: 0.97074
Precision: 0.97072
Recall: 0.97479
********************************************************************************
Fold: 5
Accuracy: 0.90563
F1 Score: 0.89532
Precision: 0.90040
Recall: 0.90563

View File

@@ -0,0 +1,289 @@
# %%
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate
import re
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from tqdm import tqdm
torch.set_float32_matmul_precision('high')
BATCH_SIZE = 128
# %%
# we need to create the mdm_list
# import the full mdm-only file
data_path = '../../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
# mdm_list = sorted(list((set(full_df['pattern']))))
thing_property = full_df['thing'] + full_df['property']
thing_property = thing_property.to_list()
mdm_list = sorted(list(set(thing_property)))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(mdm_list):
    id2label[idx] = val
    label2id[val] = idx
# %%
def substitute_and_append_digits(s):
    """
    Finds all digit groups in a string, substitutes them with a <DIGIT> placeholder,
    and appends the extracted digit groups at the end of the string, each flanked by <DIGIT> markers.

    Args:
        s (str): The input string.

    Returns:
        str: The transformed string.
    """
    # Find all digit groups in the string
    digit_groups = re.findall(r'\d+', s)
    # Substitute digit groups with the <DIGIT> placeholder
    substituted_string = re.sub(r'\d+', '<DIGIT>', s)
    # Append extracted digit groups to the end of the string
    appended_digits = ''.join([f'<DIGIT>{group}<DIGIT>' for group in digit_groups])
    return substituted_string + appended_digits
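# illustrative usage, with an input drawn from the dataset above:
# substitute_and_append_digits("NO5 GENERATOR_ENGINE MAIN BRG TEMP5")
# -> "NO<DIGIT> GENERATOR_ENGINE MAIN BRG TEMP<DIGIT><DIGIT>5<DIGIT><DIGIT>5<DIGIT>"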
# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
        processed_desc = substitute_and_append_digits(row['tag_description'])
        desc = f"<DESC>{processed_desc}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        pattern = f"{row['thing'] + row['property']}"
        try:
            index = mdm_list.index(pattern)
        except ValueError:
            index = -1
        element = {
            'text' : f"{desc}{unit}",
            'label': index,
        }
        output_list.append(element)
    return output_list
def create_dataset(fold, mdm_list):
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    test_df = pd.read_csv(data_path, skipinitialspace=True)
    # uncomment for mdm
    # we only use the mdm subset
    test_df = test_df[test_df['MDM']].reset_index(drop=True)
    test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))
    return test_dataset
# %%
# function to perform evaluation for a given fold
def test(fold):
test_dataset = create_dataset(fold, mdm_list)
# prepare tokenizer
checkpoint_directory = f'../checkpoint_fold_{fold}'
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "<DIGIT>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# %%
# compute max token length
max_length = 0
for sample in test_dataset['text']:
# Tokenize the sample and get the length
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
length = len(input_ids)
# Update max_length if this sample is longer
if length > max_length:
max_length = length
print(max_length)
# %%
max_length = 128
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
# truncation=True,
padding='max_length'
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
datasets = test_dataset.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# %% temp
# tokenized_datasets['train'].rename_columns()
# %%
# create data collator
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
# %%
# compute metrics
# metric = evaluate.load("accuracy")
#
#
# def compute_metrics(eval_preds):
# preds, labels = eval_preds
# preds = np.argmax(preds, axis=1)
# return metric.compute(predictions=preds, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(mdm_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
model = model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
pred_labels = []
actual_labels = []
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
for batch in tqdm(dataloader):
# Inference in batches
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
# save labels too
actual_labels.extend(batch['label'])
# Move to GPU if available
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
# Perform inference
with torch.no_grad():
logits = model(
input_ids,
attention_mask).logits
predicted_class_ids = logits.argmax(dim=1).to("cpu")
pred_labels.extend(predicted_class_ids)
pred_labels = [tensor.item() for tensor in pred_labels]
# %%
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
y_true = actual_labels
y_pred = pred_labels
# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
average_parameter = 'weighted'
zero_division_parameter = 0
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
with open("output.txt", "a") as f:
print('*' * 80, file=f)
print(f'Fold: {fold}', file=f)
# Print the results
print(f'Accuracy: {accuracy:.5f}', file=f)
print(f'F1 Score: {f1:.5f}', file=f)
print(f'Precision: {precision:.5f}', file=f)
print(f'Recall: {recall:.5f}', file=f)
# export result
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# uncomment if you want to predict for all
# df = df[df['MDM']].reset_index(drop=True)
label_list = [id2label[id] for id in pred_labels]
df_out = pd.DataFrame({
'class_prediction': pd.Series(label_list)
})
df = pd.concat([df, df_out], axis=1)
# we can save the t5 generation output here
df.to_csv(f"exports/result_group_{fold}.csv", index=False)
# %%
# reset file before writing to it
with open("output.txt", "w") as f:
print('', file=f)
for fold in [1,2,3,4,5]:
test(fold)
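# %%
# illustrative check (a sketch; run after test(1) has exported its results):
# reload the export to eyeball predictions next to the ground truth
check = pd.read_csv("exports/result_group_1.csv")
print(check[['thing', 'property', 'class_prediction']].head())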

View File

@ -0,0 +1,241 @@
# %%
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
Trainer,
EarlyStoppingCallback,
TrainingArguments
)
import evaluate
import re
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# we need to create the mdm_list
# import the full mdm-only file
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
# mdm_list = sorted(list((set(full_df['pattern']))))
thing_property = full_df['thing'] + full_df['property']
thing_property = thing_property.to_list()
mdm_list = sorted(list(set(thing_property)))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(mdm_list):
    id2label[idx] = val
    label2id[val] = idx
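# %%
# illustrative sanity check: the two maps are mutual inverses over all class ids
assert all(label2id[id2label[i]] == i for i in range(len(mdm_list)))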
# %%
def substitute_and_append_digits(s):
    """
    Finds all digit groups in a string, substitutes them with a <DIGIT>
    placeholder, and appends the extracted digit groups at the end of the
    string, each flanked by <DIGIT> markers.

    Args:
        s (str): The input string.

    Returns:
        str: The transformed string.
    """
    # find all digit groups in the string
    digit_groups = re.findall(r'\d+', s)
    # substitute digit groups with the <DIGIT> placeholder
    substituted_string = re.sub(r'\d+', '<DIGIT>', s)
    # append the extracted digit groups to the end of the string
    appended_digits = ''.join([f'<DIGIT>{group}<DIGIT>' for group in digit_groups])
    return substituted_string + appended_digits
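# %%
# illustrative sketch with a made-up tag string:
# substitute_and_append_digits('FUEL PUMP 1 OUTLET PRESS 2')
# -> 'FUEL PUMP <DIGIT> OUTLET PRESS <DIGIT><DIGIT>1<DIGIT><DIGIT>2<DIGIT>'
print(substitute_and_append_digits('FUEL PUMP 1 OUTLET PRESS 2'))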
# processes the dataframe into a list of dictionaries, where each element
# maps an input (tag_description + unit) to an output (class label)
def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
        processed_desc = substitute_and_append_digits(row['tag_description'])
        desc = f"<DESC>{processed_desc}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        pattern = f"{row['thing'] + row['property']}"
        try:
            index = mdm_list.index(pattern)
        except ValueError:
            print("Error: value not found in MDM list")
            index = -1
        element = {
            'text' : f"{desc}{unit}",
            'label': index,
        }
        output_list.append(element)
    return output_list
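# %%
# illustrative check (a sketch; assumes data_mapping_mdm.csv also carries the
# 'tag_description' and 'unit' columns used above): inspect one element
print(process_df_to_dict(full_df.head(1), mdm_list)[0])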
def create_split_dataset(fold, mdm_list):
    # train
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    # valid
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
    validation_df = pd.read_csv(data_path, skipinitialspace=True)

    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)),
        'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)),
    })
    return combined_data
# %%
# function to perform training for a given fold
def train(fold):
    save_path = f'checkpoint_fold_{fold}'
    split_datasets = create_split_dataset(fold, mdm_list)

    # prepare tokenizer
    # model_checkpoint = "distilbert/distilbert-base-uncased"
    model_checkpoint = 'google-bert/bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "<DIGIT>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # truncate to max_length and pad to the longest sequence in the
        # batch; the data collator re-pads per batch during training
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map applies preprocess_function to every "row" of the dataset,
    # i.e. the data in the immediate nesting
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # compute metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(mdm_list),
        id2label=id2label,
        label2id=label2id)
    # important! resize the embeddings after extending the tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)

    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-4,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=80,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to resume training from a checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()

# execute training
for fold in [1,2,3,4,5]:
    print(fold)
    train(fold)

# %%
# %%
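An illustrative launch command, mirroring the pattern used in this commit's run.sh (the micromamba environment name is assumed):

micromamba run -n hug accelerate launch train.py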

View File

@ -0,0 +1 @@
exports

View File

@ -1,31 +1,31 @@
********************************************************************************
Fold: 1
Accuracy: 0.78277
F1 Score: 0.73629
Precision: 0.71419
Recall: 0.78277
Accuracy: 0.93706
F1 Score: 0.93286
Precision: 0.93920
Recall: 0.93706
********************************************************************************
Fold: 2
Accuracy: 0.78598
F1 Score: 0.73708
Precision: 0.71578
Recall: 0.78598
Accuracy: 0.88785
F1 Score: 0.88726
Precision: 0.90566
Recall: 0.88785
********************************************************************************
Fold: 3
Accuracy: 0.79819
F1 Score: 0.74411
Precision: 0.71749
Recall: 0.79819
Accuracy: 0.96285
F1 Score: 0.95930
Precision: 0.96310
Recall: 0.96285
********************************************************************************
Fold: 4
Accuracy: 0.79543
F1 Score: 0.73902
Precision: 0.71094
Recall: 0.79543
Accuracy: 0.95861
F1 Score: 0.95320
Precision: 0.95615
Recall: 0.95861
********************************************************************************
Fold: 5
Accuracy: 0.77279
F1 Score: 0.72098
Precision: 0.69817
Recall: 0.77279
Accuracy: 0.89601
F1 Score: 0.88613
Precision: 0.89038
Recall: 0.89601

View File

@ -235,6 +235,24 @@ def test(fold):
print(f'Precision: {precision:.5f}', file=f)
print(f'Recall: {recall:.5f}', file=f)
# export result
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
df = df[df['MDM']].reset_index(drop=True)
label_list = [id2label[id] for id in pred_labels]
df_out = pd.DataFrame({
'class_prediction': pd.Series(label_list)
})
df = pd.concat([df, df_out], axis=1)
# save the classification predictions for this fold
df.to_csv(f"exports/result_group_{fold}.csv", index=False)
# %%
# reset file before writing to it

View File

@ -176,7 +176,7 @@ def train(fold):
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,

View File

@ -0,0 +1 @@
exports

View File

@ -1,31 +1,31 @@
********************************************************************************
Fold: 1
Accuracy: 0.78940
F1 Score: 0.73284
Precision: 0.70389
Recall: 0.78940
Accuracy: 0.15229
F1 Score: 0.07923
Precision: 0.05929
Recall: 0.15229
********************************************************************************
Fold: 2
Accuracy: 0.78411
F1 Score: 0.73695
Precision: 0.71914
Recall: 0.78411
Accuracy: 0.18075
F1 Score: 0.09625
Precision: 0.07243
Recall: 0.18075
********************************************************************************
Fold: 3
Accuracy: 0.80522
F1 Score: 0.75406
Precision: 0.72847
Recall: 0.80522
Accuracy: 0.19493
F1 Score: 0.10903
Precision: 0.08332
Recall: 0.19493
********************************************************************************
Fold: 4
Accuracy: 0.80780
F1 Score: 0.75361
Precision: 0.72432
Recall: 0.80780
Accuracy: 0.13190
F1 Score: 0.05761
Precision: 0.04173
Recall: 0.13190
********************************************************************************
Fold: 5
Accuracy: 0.76958
F1 Score: 0.71912
Precision: 0.69965
Recall: 0.76958
Accuracy: 0.15198
F1 Score: 0.07383
Precision: 0.05411
Recall: 0.15198

View File

@ -80,8 +80,9 @@ def process_df_to_dict(df, mdm_list):
def create_dataset(fold, mdm_list):
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
test_df = pd.read_csv(data_path, skipinitialspace=True)
# uncomment for mdm
# we only use the mdm subset
test_df = test_df[test_df['MDM']].reset_index(drop=True)
# test_df = test_df[test_df['MDM']].reset_index(drop=True)
test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))
@ -237,6 +238,22 @@ def test(fold):
print(f'Precision: {precision:.5f}', file=f)
print(f'Recall: {recall:.5f}', file=f)
# export result
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# keep the next line commented out to predict for all rows
# (uncomment it to restrict predictions to the MDM subset)
# df = df[df['MDM']].reset_index(drop=True)
label_list = [id2label[id] for id in pred_labels]
df_out = pd.DataFrame({
'class_prediction': pd.Series(label_list)
})
df = pd.concat([df, df_out], axis=1)
# save the classification predictions for this fold
df.to_csv(f"exports/result_group_{fold}.csv", index=False)
# %%
# reset file before writing to it

View File

@ -177,7 +177,7 @@ def train(fold):
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-5,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,

View File

@ -202,7 +202,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -76,7 +76,7 @@ class Inference():
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
padding='max_length',
truncation=True,
)
return model_inputs
@ -100,7 +100,7 @@ class Inference():
def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128
pred_generations = []

View File

@ -0,0 +1,6 @@
Accuracy for fold 1: 0.9536204448651207
Accuracy for fold 2: 0.8845794392523364
Accuracy for fold 3: 0.9618473895582329
Accuracy for fold 4: 0.9576593720266413
Accuracy for fold 5: 0.8928080622995878

View File

@ -0,0 +1,6 @@
Accuracy for fold 1: 0.9588263132986276
Accuracy for fold 2: 0.9182242990654206
Accuracy for fold 3: 0.9633534136546185
Accuracy for fold 4: 0.9809705042816366
Accuracy for fold 5: 0.8891433806688044

View File

@ -26,7 +26,7 @@ def infer_and_select(fold):
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
@ -70,5 +70,6 @@ def infer_and_select(fold):
with open("output.txt", "w") as f:
print('', file=f)
# for fold in [1,2,3,4,5]:
for fold in [1,2,3,4,5]:
infer_and_select(fold)

View File

@ -2,7 +2,6 @@
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from safetensors.torch import load_file
from transformers.models.t5.modeling_t5 import T5Block
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
@ -35,23 +27,13 @@ from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# model_checkpoint = "t5-small"
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.config
# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}{unit}",
'input' : f"{desc}",
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
}
output_list.append(element)
@ -77,11 +59,12 @@ def create_split_dataset(fold):
# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold)
# prepare tokenizer
model_checkpoint = "t5-small"
model_checkpoint = "t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@ -101,7 +84,7 @@ def train(fold):
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
padding=True
)
return model_inputs
@ -119,52 +102,10 @@ def train(fold):
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
# directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# # Use glob to find matching paths
# # path is usually checkpoint_fold_1/checkpoint-<step number>
# # we are guaranteed to save only 1 checkpoint from training
# pattern = 'checkpoint-*'
# prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# # t5_classify = T5Model.from_pretrained(prev_checkpoint)
# # Load the checkpoint
# checkpoint_path = f"{prev_checkpoint}/model.safetensors"
# checkpoint = load_file(checkpoint_path)
# # Filter out weights related to the classification head
# # given name format: t5.encoder.embed_tokens.weight
# # we want: encoder.embed.tokens.weight
# t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# Access the decoder stack
# config = T5Config("t5-small")
config = pretrained_model.config
config.num_layers = 6
config.num_decoder_layers = 3 # set new decoder layer count
model = T5ForConditionalGeneration(config)
model.shared = pretrained_model.shared
model.encoder = pretrained_model.encoder
pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]):
model.decoder.block[i].load_state_dict(layer) # Load pretrained weights
# print number of decoder blocks
print(f'Number of decoder blocks: {len(model.decoder.block)}')
print(f'num_layers: {model.config.num_layers}')
print(f'num_decoder_layers: {model.config.num_decoder_layers}')
# change the token embedding size to match the shape
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
@ -199,7 +140,7 @@ def train(fold):
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
gen_config.max_length = 64
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
@ -222,7 +163,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/

View File

@ -76,7 +76,7 @@ class Inference():
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
padding='max_length',
truncation=True,
)
return model_inputs
@ -100,7 +100,7 @@ class Inference():
def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128
pred_generations = []

View File

@ -0,0 +1,6 @@
Accuracy for fold 1: 0.9697113109323237
Accuracy for fold 2: 0.9
Accuracy for fold 3: 0.9613453815261044
Accuracy for fold 4: 0.9686013320647003
Accuracy for fold 5: 0.8932661475034357

View File

@ -6,13 +6,14 @@ from inference import Inference
checkpoint_directory = '../'
BATCH_SIZE = 512
BATCH_SIZE = 128
def infer_and_select(fold):
print(f"Inference for fold {fold}")
# import test data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# keep only the MDM rows
df = df[df['MDM']].reset_index(drop=True)
# get target data
@ -26,7 +27,7 @@ def infer_and_select(fold):
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training

View File

@ -2,7 +2,6 @@
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from safetensors.torch import load_file
from transformers.models.t5.modeling_t5 import T5Block
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
@ -35,15 +27,6 @@ from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# model_checkpoint = "t5-small"
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.config
# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
@ -77,11 +60,12 @@ def create_split_dataset(fold):
# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold)
# prepare tokenizer
model_checkpoint = "t5-small"
model_checkpoint = "t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@ -101,7 +85,7 @@ def train(fold):
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
padding=True
)
return model_inputs
@ -119,52 +103,10 @@ def train(fold):
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
# directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# # Use glob to find matching paths
# # path is usually checkpoint_fold_1/checkpoint-<step number>
# # we are guaranteed to save only 1 checkpoint from training
# pattern = 'checkpoint-*'
# prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# # t5_classify = T5Model.from_pretrained(prev_checkpoint)
# # Load the checkpoint
# checkpoint_path = f"{prev_checkpoint}/model.safetensors"
# checkpoint = load_file(checkpoint_path)
# # Filter out weights related to the classification head
# # given name format: t5.encoder.embed_tokens.weight
# # we want: encoder.embed.tokens.weight
# t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# Access the decoder stack
# config = T5Config("t5-small")
config = pretrained_model.config
config.num_layers = 6
config.num_decoder_layers = 12 # set new decoder layer count
model = T5ForConditionalGeneration(config)
model.shared = pretrained_model.shared
model.encoder = pretrained_model.encoder
pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
for i, layer in enumerate(pretrained_decoder_weights):
model.decoder.block[i].load_state_dict(layer) # Load pretrained weights
# print number of decoder blocks
print(f'Number of decoder blocks: {len(model.decoder.block)}')
print(f'num_layers: {model.config.num_layers}')
print(f'num_decoder_layers: {model.config.num_decoder_layers}')
# change the token embedding size to match the shape
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
@ -199,10 +141,11 @@ def train(fold):
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
gen_config.max_length = 64
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
# model = torch.compile(model)
# Trainer
@ -210,10 +153,10 @@ def train(fold):
args = Seq2SeqTrainingArguments(
f"{save_path}",
# eval_strategy="epoch",
save_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=64,
@ -222,12 +165,13 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,
generation_config=gen_config,
remove_unused_columns=False,
warmup_steps=400
)

train/mapping_t5_1e4/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/

View File

@ -76,7 +76,7 @@ class Inference():
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
padding='max_length',
truncation=True,
)
return model_inputs
@ -100,7 +100,7 @@ class Inference():
def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128
pred_generations = []

View File

@ -0,0 +1,6 @@
Accuracy for fold 1: 0.934690014197823
Accuracy for fold 2: 0.9023364485981309
Accuracy for fold 3: 0.9643574297188755
Accuracy for fold 4: 0.9700285442435775
Accuracy for fold 5: 0.8941823179111315

View File

@ -26,7 +26,7 @@ def infer_and_select(fold):
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training

View File

@ -2,7 +2,6 @@
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from safetensors.torch import load_file
from transformers.models.t5.modeling_t5 import T5Block
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
@ -35,15 +27,6 @@ from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# model_checkpoint = "t5-small"
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.config
# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
@ -77,10 +60,11 @@ def create_split_dataset(fold):
# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold)
# prepare tokenizer
model_checkpoint = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
@ -101,7 +85,7 @@ def train(fold):
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
padding=True
)
return model_inputs
@ -119,52 +103,10 @@ def train(fold):
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
# directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# # Use glob to find matching paths
# # path is usually checkpoint_fold_1/checkpoint-<step number>
# # we are guaranteed to save only 1 checkpoint from training
# pattern = 'checkpoint-*'
# prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# # t5_classify = T5Model.from_pretrained(prev_checkpoint)
# # Load the checkpoint
# checkpoint_path = f"{prev_checkpoint}/model.safetensors"
# checkpoint = load_file(checkpoint_path)
# # Filter out weights related to the classification head
# # given name format: t5.encoder.embed_tokens.weight
# # we want: encoder.embed.tokens.weight
# t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# Access the decoder stack
# config = T5Config("t5-small")
config = pretrained_model.config
config.num_layers = 6
config.num_decoder_layers = 9 # set new decoder layer count
model = T5ForConditionalGeneration(config)
model.shared = pretrained_model.shared
model.encoder = pretrained_model.encoder
pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
for i, layer in enumerate(pretrained_decoder_weights):
model.decoder.block[i].load_state_dict(layer) # Load pretrained weights
# print number of decoder blocks
print(f'Number of decoder blocks: {len(model.decoder.block)}')
print(f'num_layers: {model.config.num_layers}')
print(f'num_decoder_layers: {model.config.num_decoder_layers}')
# change the token embedding size to match the shape
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
@ -199,7 +141,7 @@ def train(fold):
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
gen_config.max_length = 64
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
@ -215,14 +157,14 @@ def train(fold):
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,6 +1,6 @@
Accuracy for fold 1: 0.9455750118315192
Accuracy for fold 2: 0.8864485981308411
Accuracy for fold 3: 0.9558232931726908
Accuracy for fold 4: 0.9686013320647003
Accuracy for fold 5: 0.896930829134219
Accuracy for fold 1: 0.9427354472314246
Accuracy for fold 2: 0.8981308411214953
Accuracy for fold 3: 0.9588353413654619
Accuracy for fold 4: 0.9633682207421503
Accuracy for fold 5: 0.8928080622995878

View File

@ -157,13 +157,13 @@ def train(fold):
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=128,
per_device_eval_batch_size=128,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,6 +1,6 @@
Accuracy for fold 1: 0.9522006625650734
Accuracy for fold 2: 0.9093457943925234
Accuracy for fold 3: 0.9678714859437751
Accuracy for fold 4: 0.9814462416745956
Accuracy for fold 5: 0.890975721484196
Accuracy for fold 1: 0.9578797917652626
Accuracy for fold 2: 0.9088785046728972
Accuracy for fold 3: 0.9673694779116466
Accuracy for fold 4: 0.9785918173168411
Accuracy for fold 5: 0.8918918918918919

View File

@ -13,7 +13,8 @@ def infer_and_select(fold):
# import test data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
df = df[df['MDM']].reset_index(drop=True)
# note: kept commented out so that predictions cover all rows (overall evaluation)
# df = df[df['MDM']].reset_index(drop=True)
# get target data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"

View File

@ -164,7 +164,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,6 +0,0 @@
Accuracy for fold 1: 0.9403691433980123
Accuracy for fold 2: 0.9046728971962616
Accuracy for fold 3: 0.9678714859437751
Accuracy for fold 4: 0.9695528068506185
Accuracy for fold 5: 0.902427851580394

View File

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,2 +0,0 @@
checkpoint*
tensorboard-log

View File

@ -1,6 +0,0 @@
Accuracy for fold 1: 0.9427354472314246
Accuracy for fold 2: 0.9098130841121496
Accuracy for fold 3: 0.964859437751004
Accuracy for fold 4: 0.9719314938154139
Accuracy for fold 5: 0.9070087036188731

View File

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,2 +0,0 @@
checkpoint*
tensorboard-log

View File

@ -1,6 +0,0 @@
Accuracy for fold 1: 0.9441552295314718
Accuracy for fold 2: 0.9121495327102803
Accuracy for fold 3: 0.963855421686747
Accuracy for fold 4: 0.9752616555661275
Accuracy for fold 5: 0.907924874026569

View File

@ -1,28 +1,14 @@
#!/bin/bash
cd hybrid_t5_complete_desc_unit
micromamba run -n hug accelerate launch train_encoder.py
micromamba run -n hug accelerate launch train_decoder.py
cd mapping_t5-base_desc
micromamba run -n hug accelerate launch train.py
cd ..
cd hybrid_t5_pattern_desc_unit
micromamba run -n hug accelerate launch train_encoder.py
micromamba run -n hug accelerate launch train_decoder.py
cd mapping_t5-base_desc_unit
micromamba run -n hug accelerate launch train.py
cd ..
# cd classification_bert_complete_desc
# micromamba run -n hug accelerate launch train.py
# cd ..
# cd classification_bert_complete_desc_unit
# micromamba run -n hug accelerate launch train.py
# cd ..
# cd classification_bert_complete_desc_unit_name
# micromamba run -n hug accelerate launch train.py
# cd ..
# cd mapping_t5_complete_desc
# micromamba run -n hug accelerate launch train.py
# cd ..
@ -31,6 +17,31 @@ cd ..
# micromamba run -n hug accelerate launch train.py
# cd ..
#
# cd mapping_t5_complete_name_desc_unit
# cd frozen_t5_encoder
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_1_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_2_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_4_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_8_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd classification_bert_complete_desc
# micromamba run -n hug accelerate launch train.py
# cd ..
#
# cd classification_bert_complete_desc_unit
# micromamba run -n hug accelerate launch train.py
# cd ..