Feat: added classification with number tokens

- added analysis for overall statistics
This commit is contained in:
Richard Wong 2025-01-09 23:13:24 +09:00
parent 1b6659a600
commit 1b9c4323c3
70 changed files with 1394 additions and 342 deletions

View File

@@ -13,6 +13,10 @@ full_df
# %%
mdm_list
# %%
mdm_list = sorted(set(full_df['thing'] + full_df['property']))
# %%
mdm_list
# %%
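# inspect the rows that share a single pattern, e.g. 'GE#Flow FGMassFlow'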
mask = full_df['pattern'] == 'GE#Flow FGMassFlow'
full_df[mask]

View File

@@ -1,13 +0,0 @@
# %%
import pandas as pd
# %%
data_path = '../../data_import/exports/raw_data.csv'
df = pd.read_csv(data_path)
# %%
df
# %%
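# count the number of unique ships in the raw import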
len(set(df['ships_idx']))
# %%

View File

@@ -0,0 +1,58 @@
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %%
# data_path = '../../data_import/exports/raw_data.csv'
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
df = pd.read_csv(data_path)
# %%
df = df[df['MDM']].reset_index(drop=True)
# %%
# we want to compute the string lengths
# and print their summary statistics
def summary_stats(arr):
    return {
        "Mean": np.mean(arr),
        "Median": np.median(arr),
        "Standard Deviation": np.std(arr),
        "Variance": np.var(arr),
        "Min": np.min(arr),
        "Max": np.max(arr),
        "Range": np.ptp(arr),
        "25th Percentile": np.percentile(arr, 25),
        "75th Percentile": np.percentile(arr, 75),
        "Sum": np.sum(arr),
    }
# %%
ship_domain_data = df['tag_description'] + df['unit'].fillna('')
ship_domain_array = np.array([len(item) for item in ship_domain_data])
stats = summary_stats(ship_domain_array)
for key, value in stats.items():
    print(f"{key}: {value}")
# %%
plt.hist(ship_domain_array, bins=50)
# %%
platform_domain_data = df['thing'] + df['property']
platform_domain_array = np.array([len(item) for item in platform_domain_data])
stats = summary_stats(platform_domain_array)
for key, value in stats.items():
    print(f"{key}: {value}")
# %%

analysis/delta_analysis/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
exports

View File

@@ -0,0 +1,62 @@
# %%
import pandas as pd
import numpy as np
# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(set(full_df['thing'] + full_df['property']))
# %%
fold = 5
file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
df_bert = pd.read_csv(file_path)
# %%
file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
# file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
df_t5 = pd.read_csv(file_path)
df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True)
df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
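# a t5 prediction is considered in-vocab only if its concatenated
# thing+property string is an existing MDM class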
df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)
# %%
df_t5['bert_prediction'] = df_bert['class_prediction']
df_bert['t5_prediction'] = df_t5['class_prediction']
# %%
bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
# %%
t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])
# %%
sum(t5_correct)/len(t5_correct)
# %%
# replace t5 not in vocab with bert values
t5_correct_modified = t5_correct.copy()
condition = ~df_t5['in_vocab']
t5_correct_modified[condition] = np.array(bert_correct[condition])
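# e.g. if t5 generates an out-of-vocab string, that row's correctness flag is
# taken from bert instead, which is equivalent to backing off to the bert
# prediction for that row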
# %%
# new replacement correctness
sum(t5_correct_modified)/len(t5_correct_modified)
# %%
# when bert is correct and t5 is wrong
cond_mask = bert_correct & (~t5_correct)
print(sum(cond_mask))
print(df_t5[cond_mask].to_string())
# %%
# when bert is wrong and t5 is correct
cond_mask = (~bert_correct) & (t5_correct)
print(sum(cond_mask))
print(df_bert[cond_mask].to_string())
# %%
# when both are wrong
cond_mask = (~bert_correct) & (~t5_correct)
print(sum(cond_mask))
# %%

View File

@@ -0,0 +1,72 @@
# %%
import pandas as pd
import numpy as np
# %%
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
mdm_list = sorted(set(full_df['thing'] + full_df['property']))
# %%
def run_mdm(fold):
    file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
    df_bert = pd.read_csv(file_path)
    df_bert = df_bert[df_bert['MDM']].reset_index(drop=True)
    file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    # file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df_t5 = pd.read_csv(file_path)
    df_t5 = df_t5[df_t5['MDM']].reset_index(drop=True)
    df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
    df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)
    df_t5['bert_prediction'] = df_bert['class_prediction']
    df_bert['t5_prediction'] = df_t5['class_prediction']
    bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
    t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])
    t5_original_accuracy = sum(t5_correct)/len(t5_correct)
    # replace t5 not in vocab with bert values
    t5_correct_modified = t5_correct.copy()
    condition = ~df_t5['in_vocab']
    t5_correct_modified[condition] = np.array(bert_correct[condition])
    pd.Series(t5_correct_modified).to_csv(f'exports/result_group_{fold}.csv')
    t5_new_accuracy = sum(t5_correct_modified)/len(t5_correct_modified)
    print('original accuracy', t5_original_accuracy)
    print('new accuracy', t5_new_accuracy)
# %%
# this does replacement for the full prediction
def run_full(fold):
    file_path = f'../../train/classification_bert_complete_desc_unit/classification_prediction/exports/result_group_{fold}.csv'
    df_bert = pd.read_csv(file_path)
    file_path = f'../../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    # file_path = f'../../train/mapping_t5-base_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df_t5 = pd.read_csv(file_path)
    df_t5['class_prediction'] = (df_t5['p_thing'] + df_t5['p_property'])
    df_t5['in_vocab'] = df_t5['class_prediction'].isin(mdm_list)
    df_t5['bert_prediction'] = df_bert['class_prediction']
    df_bert['t5_prediction'] = df_t5['class_prediction']
    bert_correct = (df_bert['thing'] + df_bert['property']) == df_bert['class_prediction']
    t5_correct = (df_t5['thing'] + df_t5['property']) == (df_t5['p_thing'] + df_t5['p_property'])
    # replace t5 not in vocab with bert values
    t5_correct_modified = t5_correct.copy()
    condition = ~df_t5['in_vocab']
    t5_correct_modified[condition] = np.array(bert_correct[condition])
    pd.Series(t5_correct_modified, name='grounded_pred').to_csv(f'exports/result_group_{fold}.csv')
# %%
for fold in [1,2,3,4,5]:
    run_mdm(fold)
    run_full(fold)
# %%

View File

@@ -0,0 +1,67 @@
,thing,property,ships_idx,tag_name,tag_description,signal_type,min,max,unit,data_type,thing_pattern,property_pattern,pattern,MDM,class_prediction
6,SB1Flow,FOMassFlowTotal,1003,FM6_XI001_Y,AUXILIARY BOILER FUEL OIL TOTAL FLOW RATE,AI,0,0,FLOW,1304.0,SB#Flow,FOMassFlowTotal,SB#Flow FOMassFlowTotal,True,SB1FlowFOMassFlowIn
38,ShipBoiler3,RunningState,1030,BC330,COMPOSITE BOILER FIRING,DI,0,0,NOVALUE,1301.0,ShipBoiler#,RunningState,ShipBoiler# RunningState,True,ShipBoiler1RunningState
61,GeneratorEngine5,CBNonClosed,1003,PMS_5ACBNCL_Y,NO5 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine5RunningState
72,CargoHandling,BoostPp_Port_Current,1018,IT_1400_Y,MP1400 BOOSTER PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,BoostPp_Port_Current,CargoHandling BoostPp_Port_Current,True,CargoHandlingBoostPp_Stbd_Current
81,Navigation,MidPDraft,1018,TL_200002_Y,MID DRAFTP_LV,A,0,0,NOVALUE,1310.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft
86,ShipBoiler1,FOInletTemp,1018,AB_000001_Y,BOILER FUEL OIL IN BURNER_TEMP,A,0,0,NOVALUE,1310.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
140,Navigation,MidPDraft,1003,DCM_P3_Y,DRAUGHT MID PS (DRAFT SENSOR),AI,0,0,m ,1304.0,Navigation,MidPDraft,Navigation MidPDraft,True,NavigationMidSDraft
174,ShipBoiler1,FOInletPress,1051,MB.YO.IAS.Q3.40224,BOILER FUEL OIL IN BURNER_PRESS,Analog,0,4,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress
200,GeneratorEngine3,VoltageB,1050,MB.KM.IAS.Q3.A40193,NO3 GENERATOR_ENGINE(B) GEN VOLTAGE,AO,0,655,VOLTAGE,1300.0,GeneratorEngine#,VoltageB,GeneratorEngine# VoltageB,True,GeneratorEngine3Voltage
342,EngineRoom,AirTemp,1018,MA_TT8612_Y,MAIN_ENGINE AMBIENT_TEMP,A,0,0,NOVALUE,1310.0,EngineRoom,AirTemp,EngineRoom AirTemp,True,GeneratorEngine1CBTrip
395,GeneratorEngine3,SAPress,1036,MB.KM.IAS.Q2.400121,NO3 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine3WindingTempR
396,MainEngine1,RPM,1051,MB.YO.IAS.Q1.40006,M/E_RPM,Analog,-120,120,RPM,1304.0,MainEngine#,RPM,MainEngine# RPM,True,Shaft1RPM
653,ShipBoiler1,FOInletTemp,1033,CB014,COMPOSITE BOILER FUEL OIL TEMPERATURE,AI,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
731,GeneratorEngine4,CBNonClosed,1003,PMS_4ACBNCL_Y,NO4 GENERATOR_ENGINE ACB NON CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBNonClosed,GeneratorEngine# CBNonClosed,True,GeneratorEngine4CBClosed
745,ShipBoiler1,FOInletPress,1018,AB_000002_Y,BOILER FUEL OIL IN BURNER PRESSURE,A,0,0,PRESSURE,1310.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,ShipBoiler3FOInletPress
783,GeneratorEngine1,LOFilterInletPress,1030,GA069,NO1 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine1LOInletPress
786,GeneratorEngine1,FOFilterInletPress,1030,GA085,NO1 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine1FOInletPress
812,GE1Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400031,GENERATOR_ENGINE FUEL OIL VISCOSITY INDICATION,AO,0,2346,VOLUME FLOW,1304.0,GE#Flow,FOViscosity,GE#Flow FOViscosity,True,GE1FlowFOVolumeFlowIn
813,ME2Flow,FOViscosity,1020,MB.YO.IAS.Q1.A400025,MAIN_ENGINE(P) FUEL OIL VISCOSITY INDICATION,AO,0,2285,VOLUME FLOW,1304.0,ME#Flow,FOViscosity,ME#Flow FOViscosity,True,ME2FlowFOVolumeFlowIn
840,GeneratorEngine1,SAPress,1036,MB.KM.IAS.Q1.400051,NO1 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine1WindingTempR
891,GE1Flow,FOMassFlowIn,1051,MB.YO.IAS.Q2.40103,GENERATOR_ENGINE HFO_FLOW,Analog,0,1800,MASS FLOW,1304.0,GE#Flow,FOMassFlowIn,GE#Flow FOMassFlowIn,True,GE1FlowFGMassFlow
935,ShipBoiler1,FOInletTemp,1051,MB.YO.IAS.Q3.40223,BOILER FUEL OIL IN BURNER_TEMP,Analog,0,200,TEMPERATURE,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,ShipBoiler3FOInletTemp
951,MainEngine2,CFWInletTemp,1020,MB.YO.IAS.Q1.A400388,MAIN_ENGINE(P) CYLINDER COOL WATER TEMPERATURE INLET,AO,-50,130,TEMPERATURE,1304.0,MainEngine#,CFWInletTemp,MainEngine# CFWInletTemp,True,MainEngine2Cy3CWTemp
1005,GeneratorEngine1,HFOUse,1051,MB.YO.IAS.Q1.10096,G/E_HFUEL OIL USE,Digital,0,1,-,1301.0,GeneratorEngine#,HFOUse,GeneratorEngine# HFOUse,True,MainEngine1HFOUse
1075,ME1Flow,FGMassFlow,1004,MB.YO.IAS.Q2.A400121,LP LPG FUEL P/P FLOW,AI,0,3500,MASS FLOW,1304.0,ME#Flow,FGMassFlow,ME#Flow FGMassFlow,True,ME2FlowFGMassFlow
1116,CargoHandling,LPGComp1MotorCurrent,1004,MB.YO.IAS.Q3.A400281,MP-2100 COMPRESSOR (P) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT3_DWPump_Port_Current
1117,CargoHandling,LPGComp2MotorCurrent,1004,MB.YO.IAS.Q3.A400282,MP-2200 COMPRESSOR (C) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingCT2_DWPump_Stbd_Current
1118,CargoHandling,LPGComp3MotorCurrent,1004,MB.YO.IAS.Q3.A400283,MP-2300 COMPRESSOR (S) CURRENT,AI,0,1200,CURRENT,1304.0,CargoHandling,LPGComp#MotorCurrent,CargoHandling LPGComp#MotorCurrent,True,CargoHandlingBoostPp_Stbd_Current
1174,FuelOilSystem,LFOVolumeSettleTK,1003,LC_XI001_Y,NO2 LIGHT FUEL OIL SETTLING TANK VOLUME,AI,0,999999,VOLUME,1304.0,FuelOilSystem,LFOVolumeSettleTK,FuelOilSystem LFOVolumeSettleTK,True,FuelOilSystemLFOVolumeStorageTK2P
1198,GeneratorEngine4,BearingNDETemp1,1003,GE4_TIAH6_Y,NO4 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine4WindingTempT
1199,GeneratorEngine5,BearingNDETemp1,1003,GE5_TIAH6_Y,NO5 GENERATOR_ENGINE BEARING TEMPERATURE(NDE),AI,0,200,℃,1304.0,GeneratorEngine#,BearingNDETemp#,GeneratorEngine# BearingNDETemp#,True,GeneratorEngine5WindingTempT
1200,MainEngine1,LoadPercent,1018,EG_0000005_Y,M/E_LOAD,D,0,0,%,1301.0,MainEngine#,LoadPercent,MainEngine# LoadPercent,True,GeneratorEngine2LoadPercent
1214,GE1TurboCharger1,ExhGasOutletTemp,1003,GE1_TE27_Y,NO1 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger1ExhGasOutletTemp
1226,GE2TurboCharger1,ExhGasOutletTemp,1003,GE2_TE27_Y,NO2 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger2ExhGasOutletTemp
1237,GE3TurboCharger1,ExhGasOutletTemp,1003,GE3_TE27_Y,NO3 GENERATOR_ENGINE EXHAUST GAS TEMPERATURE(OUTLET A TURBOCHARGER),AI,0,800,°C,1304.0,GE#TurboCharger#,ExhGasOutletTemp,GE#TurboCharger# ExhGasOutletTemp,True,GE3TurboCharger3ExhGasOutletTemp
1246,GeneratorEngine3,BearingDETemp8,1003,GE3_TE698_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5
1247,GeneratorEngine3,BearingDETemp9,1003,GE3_TE699_Y,NO3 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
1273,GeneratorEngine4,BearingDETemp8,1003,GE4_TE698_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
1274,GeneratorEngine4,BearingDETemp9,1003,GE4_TE699_Y,NO4 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
1280,GeneratorEngine5,BearingDETemp2,1003,GE5_TE692_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP2,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2BearingDETemp6
1281,GeneratorEngine5,BearingDETemp3,1003,GE5_TE693_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP3,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp5
1282,GeneratorEngine5,BearingDETemp4,1003,GE5_TE694_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP4,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp4
1283,GeneratorEngine5,BearingDETemp5,1003,GE5_TE695_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP5,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5
1284,GeneratorEngine5,BearingDETemp6,1003,GE5_TE696_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP6,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
1285,GeneratorEngine5,BearingDETemp7,1003,GE5_TE697_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP7,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine3BearingDETemp6
1286,GeneratorEngine5,BearingDETemp8,1003,GE5_TE698_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP8,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine2Cy8KnockIntensity
1287,GeneratorEngine5,BearingDETemp9,1003,GE5_TE699_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP9,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine4BearingDETemp6
1298,ME1TurboCharger1,ExhGasInletTemp,1003,AMSI_TT3721A_Y,EXHAUST GAS TEMPERATURE BEFORE TURBOCHARGER 1,AI,0,600,TEMPERATURE,1304.0,ME#TurboCharger#,ExhGasInletTemp,ME#TurboCharger# ExhGasInletTemp,True,ME1TurboCharger1ExhGasOutletTemp
1309,GeneratorEngine2,LOFilterInletPress,1030,GB069,NO2 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine2LOInletPress
1472,GeneratorEngine3,VoltageA,1050,MB.KM.IAS.Q3.A40189,NO3 GENERATOR_ENGINE(A) GEN VOLTAGE,AO,0,654,VOLTAGE,1300.0,GeneratorEngine#,VoltageA,GeneratorEngine# VoltageA,True,GeneratorEngine3Voltage
1524,GeneratorEngine2,FOFilterInletPress,1030,GB085,NO2 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine2FOInletPress
1536,ShipBoiler1,FOInletTemp,1028,MB.KM.IAS.Q2.A400184,OIL TEMPERATURE (4-20MA),AI,0,200,°C,1304.0,ShipBoiler#,FOInletTemp,ShipBoiler# FOInletTemp,True,GeneratorEngine4WindingTempT
1537,ShipBoiler1,FOInletPress,1028,MB.KM.IAS.Q2.A400185,FUEL OIL PRESSURE (4-20MA),AI,0,40,PRESSURE,1304.0,ShipBoiler#,FOInletPress,ShipBoiler# FOInletPress,True,GeneratorEngine4FOInletPress
1594,GeneratorEngine3,LOFilterInletPress,1030,GC069,NO3 GENERATOR_ENGINE LUB OIL PRESSURE FLT IN,AI,0,10,PRESSURE,1304.0,GeneratorEngine#,LOFilterInletPress,GeneratorEngine# LOFilterInletPress,True,GeneratorEngine3LOInletPress
1597,GeneratorEngine3,FOFilterInletPress,1030,GC085,NO3 GENERATOR_ENGINE FUEL OIL PRESSURE FLT IN,AI,0,16,PRESSURE,1304.0,GeneratorEngine#,FOFilterInletPress,GeneratorEngine# FOFilterInletPress,True,GeneratorEngine3FOInletPress
1679,GeneratorEngine3,busBarVoltage,1003,PMS_3BUSVOLA_Y,BUS VOLTAGE,AI,0,10000,VOLTAGE,1304.0,GeneratorEngine#,busBarVoltage,GeneratorEngine# busBarVoltage,True,GeneratorEngine1busBarVoltage
1727,GeneratorEngine2,SAPress,1036,MB.KM.IAS.Q1.400086,NO2 GENERATOR_ENGINE STARTING AIR ENGINE INLET,Analog,0,16,kgf/㎠,1304.0,GeneratorEngine#,SAPress,GeneratorEngine# SAPress,True,GeneratorEngine2WindingTempR
1763,GeneratorEngine5,BearingDETemp1,1003,GE5_TE691_Y,NO5 GENERATOR_ENGINE MAIN BRG TEMP1,AI,0,200,°C,1304.0,GeneratorEngine#,BearingDETemp#,GeneratorEngine# BearingDETemp#,True,GeneratorEngine1BearingDETemp5
1873,GeneratorEngine5,CBClosed,1003,PMS_5VCBCLED_Y,NO5 GENERATOR_ENGINE MVSB VCB CLOSED,DI,0,0,NOVALUE,1301.0,GeneratorEngine#,CBClosed,GeneratorEngine# CBClosed,True,GeneratorEngine5StopState
2034,CargoHandling,CT1_DWPump_Stbd_Current,1018,IT_1101_Y,MP1100 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current
2035,CargoHandling,CT2_DWPump_Port_Current,1018,IT_1200_Y,MP1200 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
2037,CargoHandling,CT3_DWPump_Stbd_Current,1018,IT_1501_Y,MP1501 DEEPWELL PUMP STBD CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Stbd_Current,CargoHandling CT#_DWPump_Stbd_Current,True,CargoHandlingCT2_DWPump_Stbd_Current
2038,CargoHandling,CT4_DWPump_Port_Current,1018,IT_1700_Y,MP1700 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
2048,GeneratorEngine5,RunningHour,1003,PMS_5GENWHRS_Y,NO5 GENERATOR_ENGINE WORKING HOURS,AI,0,10000,NOVALUE,1304.0,GeneratorEngine#,RunningHour,GeneratorEngine# RunningHour,True,GeneratorEngine4RunningHour
2057,CargoHandling,CT1_DWPump_Port_Current,1018,IT_1100_Y,MP1100 DEEPWELL PUMP PORT CURRENT TX,A,0,0,NOVALUE,1310.0,CargoHandling,CT#_DWPump_Port_Current,CargoHandling CT#_DWPump_Port_Current,True,CargoHandlingCT3_DWPump_Port_Current
2079,ShipBoiler1,ExhGasOutletTemp,1003,EG_G02_Y,EXHAUST GAS ECONOMIZER EXHAUST GAS OUTLET TEMPERATURE,AI,0,600,TEMPERATURE,1304.0,ShipBoiler#,ExhGasOutletTemp,ShipBoiler# ExhGasOutletTemp,True,MainEngine1Cy1ExhGasOutletTemp

View File

@@ -0,0 +1,27 @@
type,fold,accuracy
1layer,1,0.8968291528632276
1layer,2,0.8859813084112149
1layer,3,0.9382530120481928
1layer,4,0.9586108468125595
1layer,5,0.8827301878149336
2layer,1,0.9318504495977283
2layer,2,0.8859813084112149
2layer,3,0.9678714859437751
2layer,4,0.9738344433872502
2layer,5,0.9015116811726981
4layer,1,0.9503076194983436
4layer,2,0.9135514018691588
4layer,3,0.9698795180722891
4layer,4,0.9790675547098002
4layer,5,0.907924874026569
6layer,1,0.9522006625650734
6layer,2,0.9093457943925234
6layer,3,0.9678714859437751
6layer,4,0.9814462416745956
6layer,5,0.890975721484196
8layer,1,0.9441552295314718
8layer,2,0.9121495327102803
8layer,3,0.963855421686747
8layer,4,0.9752616555661275
8layer,5,0.907924874026569

View File

@@ -0,0 +1,12 @@
type,fold,accuracy
normal,1,0.9522006625650734
normal,2,0.9093457943925234
normal,3,0.9678714859437751
normal,4,0.9814462416745956
normal,5,0.890975721484196
frozen,1,0.9342167534311405
frozen,2,0.883177570093458
frozen,3,0.963855421686747
frozen,4,0.9705042816365367
frozen,5,0.9051763628034815

View File

@@ -0,0 +1,199 @@
# %%
import pandas as pd
import numpy as np
####################################################################################
# stage 1
# %%
# stage 1a: binary classification
df_stage1a = pd.read_csv('stage1a.csv')
# %%
# desc only
mask = df_stage1a['type'] == 'desc'
df_stage1a[mask].describe().loc[['mean', 'std']]
# %%
# desc and unit
mask = df_stage1a['type'] == 'desc_unit'
df_stage1a[mask].describe().loc[['mean', 'std']]
# %%
# stage 1b: similarity-based classification
df_stage1b = pd.read_csv('stage1b.csv')
# %%
# desc only
mask = df_stage1b['type'] == 'desc'
df_stage1b[mask].describe().loc[['mean', 'std']]
# %%
# desc and unit
mask = df_stage1b['type'] == 'desc_unit'
df_stage1b[mask].describe().loc[['mean', 'std']]
# %%
#################################################################################
# stage 2: mapping model
# %%
# stage 2a: mapping by classification
df_stage2a = pd.read_csv('stage2a.csv')
# %%
# desc only
mask = df_stage2a['type'] == 'desc'
df_stage2a[mask].describe().loc[['mean', 'std']]
# %%
# desc and unit
mask = df_stage2a['type'] == 'desc_unit'
df_stage2a[mask].describe().loc[['mean', 'std']]
# %%
# stage 2b: mapping by seq2seq
df_stage2b = pd.read_csv('stage2b.csv')
# %%
# desc only
mask = df_stage2b['type'] == 'desc'
df_stage2b[mask].describe().loc[['mean', 'std']]
# %%
# desc and unit
mask = df_stage2b['type'] == 'desc_unit'
df_stage2b[mask].describe().loc[['mean', 'std']]
############################
# frozen encoder
# %%
df = pd.read_csv('frozen_encoder.csv')
# %%
# normal
mask = df['type'] == 'normal'
df[mask].describe().loc[['mean', 'std']]
# %%
# frozen
mask = df['type'] == 'frozen'
df[mask].describe().loc[['mean', 'std']]
# %%
############################
# decoder scaling
# %%
df = pd.read_csv('decoder_scaling.csv')
# %%
# 1 layer
mask = df['type'] == '1layer'
df[mask].describe().loc[['mean', 'std']]
# %%
# 2 layer
mask = df['type'] == '2layer'
df[mask].describe().loc[['mean', 'std']]
# %%
# 4 layer
mask = df['type'] == '4layer'
df[mask].describe().loc[['mean', 'std']]
# %%
# 6 layer
mask = df['type'] == '6layer'
df[mask].describe().loc[['mean', 'std']]
# %%
# 8 layer
mask = df['type'] == '8layer'
df[mask].describe().loc[['mean', 'std']]
# %%
#########################
# compute overall result
# $\frac{1808}{2113} = 0.856$ & $\frac{10692}{10961} = 0.975$ & $\frac{12500}{13074} = 0.956$ \\
# $\frac{1932}{2140} = 0.903$ & $\frac{8304}{8582} = 0.968$ & $\frac{10236}{10722} = 0.955$ \\
# $\frac{1789}{1992} = 0.898$ & $\frac{7613}{7863} = 0.968$ & $\frac{9402}{9855} = 0.954$ \\
# $\frac{1967}{2102} = 0.936$ & $\frac{12929}{13349} = 0.969$ & $\frac{14896}{15451} = 0.964$ \\
# $\frac{1915}{2183} = 0.877$ & $\frac{10381}{10786} = 0.962$ & $\frac{12296}{12969} = 0.948$ \\
# %%
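# matrix columns: relevant correct, relevant total, non-relevant correct,
# non-relevant total, overall total (values taken from the rows above)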
matrix = np.array([
    [1808, 2113, 10692, 10961, 13074],
    [1932, 2140, 8304, 8582, 10722],
    [1789, 1992, 7613, 7863, 9855],
    [1967, 2102, 12929, 13349, 15451],
    [1915, 2183, 10381, 10786, 12969]
])
# %%
relevant_class = matrix[:,0]/matrix[:,1]
print(relevant_class)
print(np.std(relevant_class))
# %%
non_relevant_class = matrix[:,2]/matrix[:,3]
print(non_relevant_class)
print(np.std(non_relevant_class))
# %%
numerator = (matrix[:,0] + matrix[:,2])
denominator = (matrix[:,1] + matrix[:,3])
print(numerator)
print(denominator) # same as last column
overall = numerator/denominator
print(overall)
print(np.std(overall))
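# sanity check (illustrative): the overall accuracy must equal the
# count-weighted average of the relevant and non-relevant class accuracies
weighted = (matrix[:,1]*relevant_class + matrix[:,3]*non_relevant_class) / (matrix[:,1] + matrix[:,3])
assert np.allclose(weighted, overall)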
######################
# compute mapping result
# %%
# $\frac{1761}{1808} = 0.974$ \\
# $\frac{1802}{1932} = 0.933$ \\
# $\frac{1760}{1789} = 0.984$ \\
# $\frac{1945}{1967} = 0.989$ \\
# $\frac{1837}{1915} = 0.959$ \\
matrix = np.array([
    [1761, 1808],
    [1802, 1932],
    [1760, 1789],
    [1945, 1967],
    [1837, 1915]
])
# %%
result = matrix[:,0]/matrix[:,1]
print(result)
print(np.mean(result))
print(np.std(result))
# %%
####################################
# compute overall result
# & 1761 & 10692 & $\frac{1761 + 10692}{13074} = 0.953$ \\
# & 1802 & 8304 & $\frac{1802 + 8304}{10722} = 0.943$ \\
# & 1760 & 7613 & $\frac{1760 + 7613}{9855} = 0.951$ \\
# & 1945 & 12929 & $\frac{1945 + 12929}{15451} = 0.963$ \\
# & 1837 & 10381 & $\frac{1837 + 10381}{12969} = 0.942$ \\
matrix = np.array([
    [1761, 10692, 13074],
    [1802, 8304, 10722],
    [1760, 7613, 9855],
    [1945, 12929, 15451],
    [1837, 10381, 12969]
])
# %%
overall = (matrix[:,0] + matrix[:,1])/matrix[:,2]
print(overall)
print(np.mean(overall))
print(np.std(overall))
# %%

View File

@@ -0,0 +1,11 @@
type,fold,accuracy,f1_score,precision,recall
desc,1,0.92588,0.74001,0.85440,0.65263
desc,2,0.88733,0.64239,0.87641,0.50701
desc,3,0.90583,0.71429,0.92357,0.58233
desc,4,0.93114,0.70929,0.83312,0.61751
desc,5,0.91171,0.67683,0.88162,0.54924
desc_unit,1,0.95610,0.86301,0.87049,0.85566
desc_unit,2,0.95467,0.88828,0.87421,0.90280
desc_unit,3,0.95403,0.88762,0.87739,0.89809
desc_unit,4,0.96408,0.87636,0.82405,0.93578
desc_unit,5,0.94811,0.85054,0.82543,0.87723

View File

@@ -0,0 +1,11 @@
type,fold,accuracy,f1_score,precision,recall
desc,1,0.93162,0.79580,0.76909,0.82442
desc,2,0.92884,0.82440,0.81224,0.83692
desc,3,0.93201,0.83375,0.82434,0.84337
desc,4,0.94259,0.80937,0.73814,0.89581
desc,5,0.92228,0.78397,0.73661,0.83784
desc_unit,1,0.93353,0.79945,0.78018,0.81969
desc_unit,2,0.92184,0.81006,0.78653,0.83505
desc_unit,3,0.91821,0.80513,0.77659,0.83584
desc_unit,4,0.93334,0.78675,0.69648,0.90390
desc_unit,5,0.93084,0.80445,0.76747,0.84517

View File

@@ -0,0 +1,11 @@
type,fold,accuracy
desc,1,0.93706
desc,2,0.88785
desc,3,0.96285
desc,4,0.95861
desc,5,0.89601
desc_unit,1,0.94226
desc_unit,2,0.90561
desc_unit,3,0.96436
desc_unit,4,0.96955
desc_unit,5,0.90289

View File

@@ -0,0 +1,16 @@
type,fold,accuracy
desc,1,0.9427354472314246
desc,2,0.8981308411214953
desc,3,0.9588353413654619
desc,4,0.9633682207421503
desc,5,0.8928080622995878
desc_unit,1,0.9578797917652626
desc_unit,2,0.9088785046728972
desc_unit,3,0.9673694779116466
desc_unit,4,0.9785918173168411
desc_unit,5,0.8918918918918919

View File

@@ -0,0 +1 @@
*.png

View File

@@ -41,13 +41,26 @@ distance_array
# %%
plt.rcParams.update({'font.size': 14}) # Adjust the size as needed
plt.figure(figsize=(8, 6))
plt.hist(distance_array, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
# Add arrow for increasing dissimilarity
plt.annotate(
    "Decreasing Similarity",  # Text label
    xy=(0.7, 500),            # Arrow end (near the end of x-axis)
    xytext=(0.4, 500),        # Arrow start (near the middle of x-axis)
    arrowprops=dict(arrowstyle="->", lw=2, color="black"),  # Arrow style
    va='center',              # needed to make arrow centered
    fontsize=14,              # Font size for the text
    color="black"             # Text color
)
# Add arrows and text
plt.savefig("input_output_similarity.png", dpi=300)
#
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):

View File

@@ -58,12 +58,24 @@ score_list
# %%
# plt.hist(score_list, bins=50)
plt.rcParams.update({'font.size': 14}) # Adjust the size as needed
plt.figure(figsize=(8, 6))
plt.hist(score_list, bins=30, color='steelblue', edgecolor='black', alpha=0.7)
plt.xlabel("Normalized Levenshtein Distance")
plt.ylabel("Platform Domain Class Count")
# Add arrow for increasing dissimilarity
plt.annotate(
    "Decreasing Similarity",  # Text label
    xy=(0.7, 70),             # Arrow end (near the end of x-axis)
    xytext=(0.2, 70),         # Arrow start (near the middle of x-axis)
    arrowprops=dict(arrowstyle="->", lw=2, color="black"),  # Arrow style
    va='center',              # needed to make arrow centered
    fontsize=14,              # Font size for the text
    color="black"             # Text color
)
plt.tight_layout()
plt.savefig("histogram.png", dpi=300)
plt.savefig("within_class_similarity.png", dpi=300)
# %%
# summary statistics of computed levenshtein distance
def summary_stats(arr):

View File

@@ -0,0 +1,26 @@
# %%
import pandas as pd
# %%
data_path = '../../data_preprocess/exports/preprocessed_data.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
# %%
df_in = full_df[full_df['MDM']].reset_index(drop=True)
# %%
df_out = full_df[~full_df['MDM']].reset_index(drop=True)
# %%
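# compare the unit distributions inside and outside the MDM subset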
label_counts_in = df_in['unit'].value_counts()
print(label_counts_in.to_string())
# %%
label_counts_out = df_out['unit'].value_counts()
print(label_counts_out.to_string())
# %%
label_counts_out['NOVALUE']/len(df_out)
# %%
label_counts_in['NOVALUE']/len(df_in)
# %%

View File

@@ -9,14 +9,19 @@ def run(fold):
    df = pd.read_csv(data_path, skipinitialspace=True)
    p_mdm = df['p_mdm']
    # data_path = f'../train/mapping_t5_complete_desc_unit_name/mapping_prediction/exports/result_group_{fold}.csv'
    data_path = f'../train/modified_t5_decoder_4_layers/mapping_prediction/exports/result_group_{fold}.csv'
    data_path = f'../train/mapping_t5_complete_desc_unit/mapping_prediction/exports/result_group_{fold}.csv'
    df = pd.read_csv(data_path, skipinitialspace=True)
    actual_mdm = df['MDM']
    thing_correctness = df['thing'] == df['p_thing']
    property_correctness = df['property'] == df['p_property']
    answer = thing_correctness & property_correctness
    # grounded labels: 'grounded_pred' is the boolean correctness series
    # exported by analysis/delta_analysis
    data_path = f'../analysis/delta_analysis/exports/result_group_{fold}.csv'
    df_grounded = pd.read_csv(data_path, skipinitialspace=True)
    answer = df_grounded['grounded_pred']
    # original labels
    # thing_correctness = df['thing'] == df['p_thing']
    # property_correctness = df['property'] == df['p_property']
    # answer = thing_correctness & property_correctness
    ##############
    # evaluate relevant-class prediction performance
@@ -53,6 +58,13 @@ def run(fold):
    print(mapping_rate)
    print('size', correct_positive_mdm_and_map, '/', sum(p_mdm & actual_mdm))
    # evaluate relevant mappings
    correct_positive_mdm_and_map = sum(p_mdm & actual_mdm & answer)
    mapping_rate = correct_positive_mdm_and_map / sum(actual_mdm)
    print('relevant data mapping rate')
    print(mapping_rate)
    print('size', correct_positive_mdm_and_map, '/', sum(actual_mdm))
    ##############
    # evaluate overall pipeline result
@@ -76,3 +88,5 @@ for fold in [1,2,3,4,5]:
    print('*' * 40)
    run(fold)
# %%

View File

@@ -179,8 +179,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,

View File

@@ -180,8 +180,8 @@ def train(fold):
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-5,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,

train/class_number_tokens/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
checkpoint*
tensorboard-log

View File

@@ -0,0 +1 @@
exports

View File

@@ -0,0 +1,31 @@
********************************************************************************
Fold: 1
Accuracy: 0.94510
F1 Score: 0.94087
Precision: 0.94623
Recall: 0.94510
********************************************************************************
Fold: 2
Accuracy: 0.91682
F1 Score: 0.91698
Precision: 0.92824
Recall: 0.91682
********************************************************************************
Fold: 3
Accuracy: 0.96185
F1 Score: 0.95743
Precision: 0.96001
Recall: 0.96185
********************************************************************************
Fold: 4
Accuracy: 0.97479
F1 Score: 0.97074
Precision: 0.97072
Recall: 0.97479
********************************************************************************
Fold: 5
Accuracy: 0.90563
F1 Score: 0.89532
Precision: 0.90040
Recall: 0.90563

View File

@@ -0,0 +1,289 @@
# %%
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
)
import evaluate
import re
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
from tqdm import tqdm
torch.set_float32_matmul_precision('high')
BATCH_SIZE = 128
# %%
# we need to create the mdm_list
# import the full mdm-only file
data_path = '../../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
# mdm_list = sorted(list((set(full_df['pattern']))))
thing_property = full_df['thing'] + full_df['property']
thing_property = thing_property.to_list()
mdm_list = sorted(list(set(thing_property)))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(mdm_list):
    id2label[idx] = val
    label2id[val] = idx
# %%
def substitute_and_append_digits(s):
    """
    Finds all digit groups in a string, substitutes them with a <DIGIT> placeholder,
    and appends the extracted digit groups at the end of the string, each flanked by <DIGIT> markers.

    Args:
        s (str): The input string.

    Returns:
        str: The transformed string.
    """
    # Find all digit groups in the string
    digit_groups = re.findall(r'\d+', s)
    # Substitute digit groups with the <DIGIT> placeholder
    substituted_string = re.sub(r'\d+', '<DIGIT>', s)
    # Append extracted digit groups to the end of the string
    appended_digits = ''.join([f'<DIGIT>{group}<DIGIT>' for group in digit_groups])
    return substituted_string + appended_digits
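# illustrative usage, with an input drawn from the dataset above:
# substitute_and_append_digits("NO5 GENERATOR_ENGINE MAIN BRG TEMP5")
# -> "NO<DIGIT> GENERATOR_ENGINE MAIN BRG TEMP<DIGIT><DIGIT>5<DIGIT><DIGIT>5<DIGIT>"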
# outputs a list of dictionaries
# processes dataframe into lists of dictionaries
# each element maps input to output
# input: tag_description
# output: class label
def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
        processed_desc = substitute_and_append_digits(row['tag_description'])
        desc = f"<DESC>{processed_desc}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        pattern = f"{row['thing'] + row['property']}"
        try:
            index = mdm_list.index(pattern)
        except ValueError:
            index = -1
        element = {
            'text' : f"{desc}{unit}",
            'label': index,
        }
        output_list.append(element)
    return output_list
def create_dataset(fold, mdm_list):
    data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
    test_df = pd.read_csv(data_path, skipinitialspace=True)
    # uncomment for mdm
    # we only use the mdm subset
    test_df = test_df[test_df['MDM']].reset_index(drop=True)
    test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))
    return test_dataset
# %%
# function to perform evaluation for a given fold
def test(fold):
test_dataset = create_dataset(fold, mdm_list)
# prepare tokenizer
checkpoint_directory = f'../checkpoint_fold_{fold}'
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
pattern = 'checkpoint-*'
model_checkpoint = glob.glob(os.path.join(checkpoint_directory, pattern))[0]
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "<DIGIT>"]
# Add the additional special tokens to the tokenizer
tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})
# %%
# compute max token length
max_length = 0
for sample in test_dataset['text']:
# Tokenize the sample and get the length
input_ids = tokenizer(sample, truncation=False, add_special_tokens=True)["input_ids"]
length = len(input_ids)
# Update max_length if this sample is longer
if length > max_length:
max_length = length
print(max_length)
# %%
max_length = 128
# given a dataset entry, run it through the tokenizer
def preprocess_function(example):
input = example['text']
# text_target sets the corresponding label to inputs
# there is no need to create a separate 'labels'
model_inputs = tokenizer(
input,
max_length=max_length,
# truncation=True,
padding='max_length'
)
return model_inputs
# map maps function to each "row" in the dataset
# aka the data in the immediate nesting
datasets = test_dataset.map(
preprocess_function,
batched=True,
num_proc=8,
remove_columns="text",
)
datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# %% temp
# tokenized_datasets['train'].rename_columns()
# %%
# create data collator
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="max_length")
# %%
# compute metrics
# metric = evaluate.load("accuracy")
#
#
# def compute_metrics(eval_preds):
# preds, labels = eval_preds
# preds = np.argmax(preds, axis=1)
# return metric.compute(predictions=preds, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
model_checkpoint,
num_labels=len(mdm_list),
id2label=id2label,
label2id=label2id)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
model = model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
pred_labels = []
actual_labels = []
dataloader = DataLoader(datasets, batch_size=BATCH_SIZE, shuffle=False)
for batch in tqdm(dataloader):
# Inference in batches
input_ids = batch['input_ids']
attention_mask = batch['attention_mask']
# save labels too
actual_labels.extend(batch['label'])
# Move to GPU if available
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
# Perform inference
with torch.no_grad():
logits = model(
input_ids,
attention_mask).logits
predicted_class_ids = logits.argmax(dim=1).to("cpu")
pred_labels.extend(predicted_class_ids)
pred_labels = [tensor.item() for tensor in pred_labels]
# %%
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
y_true = actual_labels
y_pred = pred_labels
# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
average_parameter = 'weighted'
zero_division_parameter = 0
f1 = f1_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
precision = precision_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
recall = recall_score(y_true, y_pred, average=average_parameter, zero_division=zero_division_parameter)
with open("output.txt", "a") as f:
print('*' * 80, file=f)
print(f'Fold: {fold}', file=f)
# Print the results
print(f'Accuracy: {accuracy:.5f}', file=f)
print(f'F1 Score: {f1:.5f}', file=f)
print(f'Precision: {precision:.5f}', file=f)
print(f'Recall: {recall:.5f}', file=f)
# export result
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# uncomment if you want to predict for all
# df = df[df['MDM']].reset_index(drop=True)
label_list = [id2label[id] for id in pred_labels]
df_out = pd.DataFrame({
'class_prediction': pd.Series(label_list)
})
df = pd.concat([df, df_out], axis=1)
# we can save the t5 generation output here
df.to_csv(f"exports/result_group_{fold}.csv", index=False)
# %%
# reset file before writing to it
with open("output.txt", "w") as f:
print('', file=f)
for fold in [1,2,3,4,5]:
test(fold)
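# %%
# illustrative check (a sketch; run after test(1) has exported its results):
# reload the export to eyeball predictions next to the ground truth
check = pd.read_csv("exports/result_group_1.csv")
print(check[['thing', 'property', 'class_prediction']].head())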

View File

@ -0,0 +1,241 @@
# %%
# from datasets import load_from_disk
import os
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from transformers import (
AutoTokenizer,
AutoModelForSequenceClassification,
DataCollatorWithPadding,
Trainer,
EarlyStoppingCallback,
TrainingArguments
)
import evaluate
import re
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# we need to create the mdm_list
# import the full mdm-only file
data_path = '../../data_import/exports/data_mapping_mdm.csv'
full_df = pd.read_csv(data_path, skipinitialspace=True)
# rather than use pattern, we use the real thing and property
# mdm_list = sorted(list((set(full_df['pattern']))))
thing_property = full_df['thing'] + full_df['property']
thing_property = thing_property.to_list()
mdm_list = sorted(list(set(thing_property)))
# %%
id2label = {}
label2id = {}
for idx, val in enumerate(mdm_list):
    id2label[idx] = val
    label2id[val] = idx
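# %%
# illustrative sanity check: the two maps are mutual inverses over all class ids
assert all(label2id[id2label[i]] == i for i in range(len(mdm_list)))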
# %%
def substitute_and_append_digits(s):
    """
    Finds all digit groups in a string, substitutes them with a <DIGIT>
    placeholder, and appends the extracted digit groups at the end of the
    string, each flanked by <DIGIT> markers.

    Args:
        s (str): The input string.

    Returns:
        str: The transformed string.
    """
    # find all digit groups in the string
    digit_groups = re.findall(r'\d+', s)
    # substitute digit groups with the <DIGIT> placeholder
    substituted_string = re.sub(r'\d+', '<DIGIT>', s)
    # append the extracted digit groups to the end of the string
    appended_digits = ''.join([f'<DIGIT>{group}<DIGIT>' for group in digit_groups])
    return substituted_string + appended_digits
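# %%
# illustrative sketch with a made-up tag string:
# substitute_and_append_digits('FUEL PUMP 1 OUTLET PRESS 2')
# -> 'FUEL PUMP <DIGIT> OUTLET PRESS <DIGIT><DIGIT>1<DIGIT><DIGIT>2<DIGIT>'
print(substitute_and_append_digits('FUEL PUMP 1 OUTLET PRESS 2'))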
# processes the dataframe into a list of dictionaries, where each element
# maps an input (tag_description + unit) to an output (class label)
def process_df_to_dict(df, mdm_list):
    output_list = []
    for _, row in df.iterrows():
        processed_desc = substitute_and_append_digits(row['tag_description'])
        desc = f"<DESC>{processed_desc}<DESC>"
        unit = f"<UNIT>{row['unit']}<UNIT>"
        pattern = f"{row['thing'] + row['property']}"
        try:
            index = mdm_list.index(pattern)
        except ValueError:
            print("Error: value not found in MDM list")
            index = -1
        element = {
            'text' : f"{desc}{unit}",
            'label': index,
        }
        output_list.append(element)
    return output_list
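# %%
# illustrative check (a sketch; assumes data_mapping_mdm.csv also carries the
# 'tag_description' and 'unit' columns used above): inspect one element
print(process_df_to_dict(full_df.head(1), mdm_list)[0])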
def create_split_dataset(fold, mdm_list):
    # train
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"
    train_df = pd.read_csv(data_path, skipinitialspace=True)

    # valid
    data_path = f"../../data_preprocess/exports/dataset/group_{fold}/valid.csv"
    validation_df = pd.read_csv(data_path, skipinitialspace=True)

    combined_data = DatasetDict({
        'train': Dataset.from_list(process_df_to_dict(train_df, mdm_list)),
        'validation' : Dataset.from_list(process_df_to_dict(validation_df, mdm_list)),
    })
    return combined_data
# %%
# function to perform training for a given fold
def train(fold):
    save_path = f'checkpoint_fold_{fold}'
    split_datasets = create_split_dataset(fold, mdm_list)

    # prepare tokenizer
    # model_checkpoint = "distilbert/distilbert-base-uncased"
    model_checkpoint = 'google-bert/bert-base-cased'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
    # Define additional special tokens
    additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<DESC>", "<UNIT>", "<DIGIT>"]
    # Add the additional special tokens to the tokenizer
    tokenizer.add_special_tokens({"additional_special_tokens": additional_special_tokens})

    max_length = 120

    # given a dataset entry, run it through the tokenizer
    def preprocess_function(example):
        input = example['text']
        # truncate to max_length and pad to the longest sequence in the
        # batch; the data collator re-pads per batch during training
        model_inputs = tokenizer(
            input,
            max_length=max_length,
            truncation=True,
            padding=True
        )
        return model_inputs

    # map applies preprocess_function to every "row" of the dataset,
    # i.e. the data in the immediate nesting
    tokenized_datasets = split_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=8,
        remove_columns="text",
    )

    # create data collator
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # compute metrics
    metric = evaluate.load("accuracy")

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=len(mdm_list),
        id2label=id2label,
        label2id=label2id)
    # important! resize the embeddings after extending the tokenizer vocabulary
    model.resize_token_embeddings(len(tokenizer))

    # model = torch.compile(model, backend="inductor", dynamic=True)

    # Trainer
    training_args = TrainingArguments(
        output_dir=f"{save_path}",
        # eval_strategy="epoch",
        eval_strategy="no",
        logging_dir="tensorboard-log",
        logging_strategy="epoch",
        # save_strategy="epoch",
        load_best_model_at_end=False,
        learning_rate=1e-4,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        auto_find_batch_size=False,
        ddp_find_unused_parameters=False,
        weight_decay=0.01,
        save_total_limit=1,
        num_train_epochs=80,
        bf16=True,
        push_to_hub=False,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    # uncomment to resume training from a checkpoint
    # checkpoint_path = 'default_40_1/checkpoint-5600'
    # trainer.train(resume_from_checkpoint=checkpoint_path)
    trainer.train()

# execute training
for fold in [1,2,3,4,5]:
    print(fold)
    train(fold)

# %%
# %%
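An illustrative launch command, mirroring the pattern used in this commit's run.sh (the micromamba environment name is assumed):

micromamba run -n hug accelerate launch train.py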

View File

@ -0,0 +1 @@
exports

View File

@ -1,31 +1,31 @@
********************************************************************************
Fold: 1
Accuracy: 0.78277
F1 Score: 0.73629
Precision: 0.71419
Recall: 0.78277
Accuracy: 0.93706
F1 Score: 0.93286
Precision: 0.93920
Recall: 0.93706
********************************************************************************
Fold: 2
Accuracy: 0.78598
F1 Score: 0.73708
Precision: 0.71578
Recall: 0.78598
Accuracy: 0.88785
F1 Score: 0.88726
Precision: 0.90566
Recall: 0.88785
********************************************************************************
Fold: 3
Accuracy: 0.79819
F1 Score: 0.74411
Precision: 0.71749
Recall: 0.79819
Accuracy: 0.96285
F1 Score: 0.95930
Precision: 0.96310
Recall: 0.96285
********************************************************************************
Fold: 4
Accuracy: 0.79543
F1 Score: 0.73902
Precision: 0.71094
Recall: 0.79543
Accuracy: 0.95861
F1 Score: 0.95320
Precision: 0.95615
Recall: 0.95861
********************************************************************************
Fold: 5
Accuracy: 0.77279
F1 Score: 0.72098
Precision: 0.69817
Recall: 0.77279
Accuracy: 0.89601
F1 Score: 0.88613
Precision: 0.89038
Recall: 0.89601

View File

@ -235,6 +235,24 @@ def test(fold):
print(f'Precision: {precision:.5f}', file=f)
print(f'Recall: {recall:.5f}', file=f)
# export result
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
df = df[df['MDM']].reset_index(drop=True)
label_list = [id2label[id] for id in pred_labels]
df_out = pd.DataFrame({
'class_prediction': pd.Series(label_list)
})
df = pd.concat([df, df_out], axis=1)
# save the classification predictions for this fold
df.to_csv(f"exports/result_group_{fold}.csv", index=False)
# %%
# reset file before writing to it

View File

@ -176,7 +176,7 @@ def train(fold):
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,

View File

@ -0,0 +1 @@
exports

View File

@ -1,31 +1,31 @@
********************************************************************************
Fold: 1
Accuracy: 0.78940
F1 Score: 0.73284
Precision: 0.70389
Recall: 0.78940
Accuracy: 0.15229
F1 Score: 0.07923
Precision: 0.05929
Recall: 0.15229
********************************************************************************
Fold: 2
Accuracy: 0.78411
F1 Score: 0.73695
Precision: 0.71914
Recall: 0.78411
Accuracy: 0.18075
F1 Score: 0.09625
Precision: 0.07243
Recall: 0.18075
********************************************************************************
Fold: 3
Accuracy: 0.80522
F1 Score: 0.75406
Precision: 0.72847
Recall: 0.80522
Accuracy: 0.19493
F1 Score: 0.10903
Precision: 0.08332
Recall: 0.19493
********************************************************************************
Fold: 4
Accuracy: 0.80780
F1 Score: 0.75361
Precision: 0.72432
Recall: 0.80780
Accuracy: 0.13190
F1 Score: 0.05761
Precision: 0.04173
Recall: 0.13190
********************************************************************************
Fold: 5
Accuracy: 0.76958
F1 Score: 0.71912
Precision: 0.69965
Recall: 0.76958
Accuracy: 0.15198
F1 Score: 0.07383
Precision: 0.05411
Recall: 0.15198

View File

@ -80,8 +80,9 @@ def process_df_to_dict(df, mdm_list):
def create_dataset(fold, mdm_list):
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
test_df = pd.read_csv(data_path, skipinitialspace=True)
# uncomment for mdm
# we only use the mdm subset
test_df = test_df[test_df['MDM']].reset_index(drop=True)
# test_df = test_df[test_df['MDM']].reset_index(drop=True)
test_dataset = Dataset.from_list(process_df_to_dict(test_df, mdm_list))
@ -237,6 +238,22 @@ def test(fold):
print(f'Precision: {precision:.5f}', file=f)
print(f'Recall: {recall:.5f}', file=f)
# export result
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# keep the next line commented out to predict for all rows
# (uncomment it to restrict predictions to the MDM subset)
# df = df[df['MDM']].reset_index(drop=True)
label_list = [id2label[id] for id in pred_labels]
df_out = pd.DataFrame({
'class_prediction': pd.Series(label_list)
})
df = pd.concat([df, df_out], axis=1)
# save the classification predictions for this fold
df.to_csv(f"exports/result_group_{fold}.csv", index=False)
# %%
# reset file before writing to it

View File

@ -177,7 +177,7 @@ def train(fold):
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-5,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,

View File

@ -202,7 +202,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -76,7 +76,7 @@ class Inference():
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
padding='max_length',
truncation=True,
)
return model_inputs
@ -100,7 +100,7 @@ class Inference():
def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128
pred_generations = []

View File

@ -0,0 +1,6 @@
Accuracy for fold 1: 0.9536204448651207
Accuracy for fold 2: 0.8845794392523364
Accuracy for fold 3: 0.9618473895582329
Accuracy for fold 4: 0.9576593720266413
Accuracy for fold 5: 0.8928080622995878

View File

@ -0,0 +1,6 @@
Accuracy for fold 1: 0.9588263132986276
Accuracy for fold 2: 0.9182242990654206
Accuracy for fold 3: 0.9633534136546185
Accuracy for fold 4: 0.9809705042816366
Accuracy for fold 5: 0.8891433806688044

View File

@ -26,7 +26,7 @@ def infer_and_select(fold):
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training
@ -70,5 +70,6 @@ def infer_and_select(fold):
with open("output.txt", "w") as f:
print('', file=f)
# for fold in [1,2,3,4,5]:
for fold in [1,2,3,4,5]:
infer_and_select(fold)

View File

@ -2,7 +2,6 @@
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from safetensors.torch import load_file
from transformers.models.t5.modeling_t5 import T5Block
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
@ -35,23 +27,13 @@ from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# model_checkpoint = "t5-small"
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.config
# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
for _, row in df.iterrows():
desc = f"<DESC>{row['tag_description']}<DESC>"
unit = f"<UNIT>{row['unit']}<UNIT>"
element = {
'input' : f"{desc}{unit}",
'input' : f"{desc}",
'output': f"<THING_START>{row['thing']}<THING_END><PROPERTY_START>{row['property']}<PROPERTY_END>",
}
output_list.append(element)
@ -77,11 +59,12 @@ def create_split_dataset(fold):
# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold)
# prepare tokenizer
model_checkpoint = "t5-small"
model_checkpoint = "t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@ -101,7 +84,7 @@ def train(fold):
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
padding=True
)
return model_inputs
@ -119,52 +102,10 @@ def train(fold):
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
# directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# # Use glob to find matching paths
# # path is usually checkpoint_fold_1/checkpoint-<step number>
# # we are guaranteed to save only 1 checkpoint from training
# pattern = 'checkpoint-*'
# prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# # t5_classify = T5Model.from_pretrained(prev_checkpoint)
# # Load the checkpoint
# checkpoint_path = f"{prev_checkpoint}/model.safetensors"
# checkpoint = load_file(checkpoint_path)
# # Filter out weights related to the classification head
# # given name format: t5.encoder.embed_tokens.weight
# # we want: encoder.embed.tokens.weight
# t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# Access the decoder stack
# config = T5Config("t5-small")
config = pretrained_model.config
config.num_layers = 6
config.num_decoder_layers = 3 # set new decoder layer count
model = T5ForConditionalGeneration(config)
model.shared = pretrained_model.shared
model.encoder = pretrained_model.encoder
pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
for i, layer in enumerate(pretrained_decoder_weights[:config.num_decoder_layers]):
model.decoder.block[i].load_state_dict(layer) # Load pretrained weights
# print number of decoder blocks
print(f'Number of decoder blocks: {len(model.decoder.block)}')
print(f'num_layers: {model.config.num_layers}')
print(f'num_decoder_layers: {model.config.num_decoder_layers}')
# change the token embedding size to match the shape
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
@ -199,7 +140,7 @@ def train(fold):
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
gen_config.max_length = 64
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
@ -222,7 +163,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/

View File

@ -76,7 +76,7 @@ class Inference():
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
padding='max_length',
truncation=True,
)
return model_inputs
@ -100,7 +100,7 @@ class Inference():
def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128
pred_generations = []

View File

@ -0,0 +1,6 @@
Accuracy for fold 1: 0.9697113109323237
Accuracy for fold 2: 0.9
Accuracy for fold 3: 0.9613453815261044
Accuracy for fold 4: 0.9686013320647003
Accuracy for fold 5: 0.8932661475034357

View File

@ -6,13 +6,14 @@ from inference import Inference
checkpoint_directory = '../'
BATCH_SIZE = 512
BATCH_SIZE = 128
def infer_and_select(fold):
print(f"Inference for fold {fold}")
# import test data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
# keep only the MDM rows
df = df[df['MDM']].reset_index(drop=True)
# get target data
@ -26,7 +27,7 @@ def infer_and_select(fold):
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training

View File

@ -2,7 +2,6 @@
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from safetensors.torch import load_file
from transformers.models.t5.modeling_t5 import T5Block
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
@ -35,15 +27,6 @@ from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# model_checkpoint = "t5-small"
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.config
# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
@ -77,11 +60,12 @@ def create_split_dataset(fold):
# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold)
# prepare tokenizer
model_checkpoint = "t5-small"
model_checkpoint = "t5-base"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
additional_special_tokens = ["<THING_START>", "<THING_END>", "<PROPERTY_START>", "<PROPERTY_END>", "<NAME>", "<DESC>", "<SIG>", "<UNIT>", "<DATA_TYPE>"]
@ -101,7 +85,7 @@ def train(fold):
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
padding=True
)
return model_inputs
@ -119,52 +103,10 @@ def train(fold):
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
# directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# # Use glob to find matching paths
# # path is usually checkpoint_fold_1/checkpoint-<step number>
# # we are guaranteed to save only 1 checkpoint from training
# pattern = 'checkpoint-*'
# prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# # t5_classify = T5Model.from_pretrained(prev_checkpoint)
# # Load the checkpoint
# checkpoint_path = f"{prev_checkpoint}/model.safetensors"
# checkpoint = load_file(checkpoint_path)
# # Filter out weights related to the classification head
# # given name format: t5.encoder.embed_tokens.weight
# # we want: encoder.embed.tokens.weight
# t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# Access the decoder stack
# config = T5Config("t5-small")
config = pretrained_model.config
config.num_layers = 6
config.num_decoder_layers = 12 # set new decoder layer count
model = T5ForConditionalGeneration(config)
model.shared = pretrained_model.shared
model.encoder = pretrained_model.encoder
pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
for i, layer in enumerate(pretrained_decoder_weights):
model.decoder.block[i].load_state_dict(layer) # Load pretrained weights
# print number of decoder blocks
print(f'Number of decoder blocks: {len(model.decoder.block)}')
print(f'num_layers: {model.config.num_layers}')
print(f'num_decoder_layers: {model.config.num_decoder_layers}')
# change the token embedding size to match the shape
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
@ -199,10 +141,11 @@ def train(fold):
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
gen_config.max_length = 64
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
# model = torch.compile(model)
# Trainer
@ -210,10 +153,10 @@ def train(fold):
args = Seq2SeqTrainingArguments(
f"{save_path}",
# eval_strategy="epoch",
save_strategy="epoch",
eval_strategy="no",
logging_dir="tensorboard-log",
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=64,
@ -222,12 +165,13 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,
generation_config=gen_config,
remove_unused_columns=False,
warmup_steps=400
)

train/mapping_t5_1e4/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
checkpoint*
tensorboard-log/

View File

@ -76,7 +76,7 @@ class Inference():
text_target=target,
max_length=max_length,
return_tensors="pt",
padding="max_length",
padding='max_length',
truncation=True,
)
return model_inputs
@ -100,7 +100,7 @@ class Inference():
def generate(self):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
MAX_GENERATE_LENGTH = 128
pred_generations = []

View File

@ -0,0 +1,6 @@
Accuracy for fold 1: 0.934690014197823
Accuracy for fold 2: 0.9023364485981309
Accuracy for fold 3: 0.9643574297188755
Accuracy for fold 4: 0.9700285442435775
Accuracy for fold 5: 0.8941823179111315

View File

@ -26,7 +26,7 @@ def infer_and_select(fold):
# run inference
# checkpoint
# Use glob to find matching paths
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}b')
directory = os.path.join(checkpoint_directory, f'checkpoint_fold_{fold}')
# Use glob to find matching paths
# path is usually checkpoint_fold_1/checkpoint-<step number>
# we are guaranteed to save only 1 checkpoint from training

View File

@ -2,7 +2,6 @@
# from datasets import load_from_disk
import os
import glob
os.environ['NCCL_P2P_DISABLE'] = '1'
os.environ['NCCL_IB_DISABLE'] = '1'
@ -10,20 +9,13 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
import torch
from safetensors.torch import load_file
from transformers.models.t5.modeling_t5 import T5Block
from transformers import (
T5Config,
T5TokenizerFast,
AutoModelForSeq2SeqLM,
DataCollatorForSeq2Seq,
Seq2SeqTrainer,
EarlyStoppingCallback,
Seq2SeqTrainingArguments,
T5ForConditionalGeneration,
T5Model
Seq2SeqTrainingArguments
)
import evaluate
import numpy as np
@ -35,15 +27,6 @@ from datasets import Dataset, DatasetDict
torch.set_float32_matmul_precision('high')
# %%
# model_checkpoint = "t5-small"
# model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# model.config
# %%
# outputs a list of dictionaries
def process_df_to_dict(df):
output_list = []
@ -77,10 +60,11 @@ def create_split_dataset(fold):
# function to perform training for a given fold
def train(fold):
save_path = f'checkpoint_fold_{fold}b'
save_path = f'checkpoint_fold_{fold}'
split_datasets = create_split_dataset(fold)
# prepare tokenizer
model_checkpoint = "t5-small"
tokenizer = T5TokenizerFast.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
# Define additional special tokens
@ -101,7 +85,7 @@ def train(fold):
text_target=target,
max_length=max_length,
truncation=True,
padding="max_length"
padding=True
)
return model_inputs
@ -119,52 +103,10 @@ def train(fold):
# device_map set to auto to force it to load contiguous weights
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, device_map='auto')
# directory = os.path.join(".", f'checkpoint_fold_{fold}a')
# # Use glob to find matching paths
# # path is usually checkpoint_fold_1/checkpoint-<step number>
# # we are guaranteed to save only 1 checkpoint from training
# pattern = 'checkpoint-*'
# prev_checkpoint = glob.glob(os.path.join(directory, pattern))[0]
# # t5_classify = T5Model.from_pretrained(prev_checkpoint)
# # Load the checkpoint
# checkpoint_path = f"{prev_checkpoint}/model.safetensors"
# checkpoint = load_file(checkpoint_path)
# # Filter out weights related to the classification head
# # given name format: t5.encoder.embed_tokens.weight
# # we want: encoder.embed.tokens.weight
# t5_weights= {key.replace("t5.", "", 1): value for key, value in checkpoint.items() if "classifier" not in key}
pretrained_model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
# Access the decoder stack
# config = T5Config("t5-small")
config = pretrained_model.config
config.num_layers = 6
config.num_decoder_layers = 9 # set new decoder layer count
model = T5ForConditionalGeneration(config)
model.shared = pretrained_model.shared
model.encoder = pretrained_model.encoder
pretrained_decoder_weights = [layer.state_dict() for layer in pretrained_model.decoder.block]
for i, layer in enumerate(pretrained_decoder_weights):
model.decoder.block[i].load_state_dict(layer) # Load pretrained weights
# print number of decoder blocks
print(f'Number of decoder blocks: {len(model.decoder.block)}')
print(f'num_layers: {model.config.num_layers}')
print(f'num_decoder_layers: {model.config.num_decoder_layers}')
# change the token embedding size to match the shape
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# important! after extending tokens vocab
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
metric = evaluate.load("sacrebleu")
@ -199,7 +141,7 @@ def train(fold):
# Generation Config
# from transformers import GenerationConfig
gen_config = model.generation_config
gen_config.max_length = 128
gen_config.max_length = 64
# compile
# model = torch.compile(model, backend="inductor", dynamic=True)
@ -215,14 +157,14 @@ def train(fold):
logging_strategy="epoch",
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
learning_rate=1e-4,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,6 +1,6 @@
Accuracy for fold 1: 0.9455750118315192
Accuracy for fold 2: 0.8864485981308411
Accuracy for fold 3: 0.9558232931726908
Accuracy for fold 4: 0.9686013320647003
Accuracy for fold 5: 0.896930829134219
Accuracy for fold 1: 0.9427354472314246
Accuracy for fold 2: 0.8981308411214953
Accuracy for fold 3: 0.9588353413654619
Accuracy for fold 4: 0.9633682207421503
Accuracy for fold 5: 0.8928080622995878

View File

@ -157,13 +157,13 @@ def train(fold):
# save_strategy="epoch",
load_best_model_at_end=False,
learning_rate=1e-3,
per_device_train_batch_size=128,
per_device_eval_batch_size=128,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
auto_find_batch_size=False,
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,6 +1,6 @@
Accuracy for fold 1: 0.9522006625650734
Accuracy for fold 2: 0.9093457943925234
Accuracy for fold 3: 0.9678714859437751
Accuracy for fold 4: 0.9814462416745956
Accuracy for fold 5: 0.890975721484196
Accuracy for fold 1: 0.9578797917652626
Accuracy for fold 2: 0.9088785046728972
Accuracy for fold 3: 0.9673694779116466
Accuracy for fold 4: 0.9785918173168411
Accuracy for fold 5: 0.8918918918918919

View File

@ -13,7 +13,8 @@ def infer_and_select(fold):
# import test data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/test_all.csv"
df = pd.read_csv(data_path, skipinitialspace=True)
df = df[df['MDM']].reset_index(drop=True)
# note: kept commented out so that predictions cover all rows (overall evaluation)
# df = df[df['MDM']].reset_index(drop=True)
# get target data
data_path = f"../../../data_preprocess/exports/dataset/group_{fold}/train_all.csv"

View File

@ -164,7 +164,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,6 +0,0 @@
Accuracy for fold 1: 0.9403691433980123
Accuracy for fold 2: 0.9046728971962616
Accuracy for fold 3: 0.9678714859437751
Accuracy for fold 4: 0.9695528068506185
Accuracy for fold 5: 0.902427851580394

View File

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,2 +0,0 @@
checkpoint*
tensorboard-log

View File

@ -1,6 +0,0 @@
Accuracy for fold 1: 0.9427354472314246
Accuracy for fold 2: 0.9098130841121496
Accuracy for fold 3: 0.964859437751004
Accuracy for fold 4: 0.9719314938154139
Accuracy for fold 5: 0.9070087036188731

View File

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -222,7 +222,7 @@ def train(fold):
ddp_find_unused_parameters=False,
weight_decay=0.01,
save_total_limit=1,
num_train_epochs=40,
num_train_epochs=80,
predict_with_generate=True,
bf16=True,
push_to_hub=False,

View File

@ -1,2 +0,0 @@
checkpoint*
tensorboard-log

View File

@ -1,6 +0,0 @@
Accuracy for fold 1: 0.9441552295314718
Accuracy for fold 2: 0.9121495327102803
Accuracy for fold 3: 0.963855421686747
Accuracy for fold 4: 0.9752616555661275
Accuracy for fold 5: 0.907924874026569

View File

@ -1,28 +1,14 @@
#!/bin/bash
cd hybrid_t5_complete_desc_unit
micromamba run -n hug accelerate launch train_encoder.py
micromamba run -n hug accelerate launch train_decoder.py
cd mapping_t5-base_desc
micromamba run -n hug accelerate launch train.py
cd ..
cd hybrid_t5_pattern_desc_unit
micromamba run -n hug accelerate launch train_encoder.py
micromamba run -n hug accelerate launch train_decoder.py
cd mapping_t5-base_desc_unit
micromamba run -n hug accelerate launch train.py
cd ..
# cd classification_bert_complete_desc
# micromamba run -n hug accelerate launch train.py
# cd ..
# cd classification_bert_complete_desc_unit
# micromamba run -n hug accelerate launch train.py
# cd ..
# cd classification_bert_complete_desc_unit_name
# micromamba run -n hug accelerate launch train.py
# cd ..
# cd mapping_t5_complete_desc
# micromamba run -n hug accelerate launch train.py
# cd ..
@ -31,6 +17,31 @@ cd ..
# micromamba run -n hug accelerate launch train.py
# cd ..
#
# cd mapping_t5_complete_name_desc_unit
# cd frozen_t5_encoder
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_1_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_2_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_4_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd modified_t5_decoder_8_layers
# micromamba run -n hug accelerate launch train_decoder.py
# cd ..
#
# cd classification_bert_complete_desc
# micromamba run -n hug accelerate launch train.py
# cd ..
#
# cd classification_bert_complete_desc_unit
# micromamba run -n hug accelerate launch train.py
# cd ..