parent
a1d000d9c8
commit
e90bc69ea9
|
@ -0,0 +1 @@
|
|||
*.zip
|
|
@ -1 +1,3 @@
|
|||
__pycache__
|
||||
__pycache__
|
||||
*.csv
|
||||
*.md
|
|
@ -1,6 +1,7 @@
|
|||
# %%
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
# %%
|
||||
# import training file
|
||||
|
@ -13,5 +14,45 @@ id_counts = train_df['entity_id'].value_counts()
|
|||
|
||||
# %%
|
||||
|
||||
plt.hist(id_counts, bins=50)
|
||||
# %%
|
||||
id_counts[:50]
|
||||
|
||||
# %%
|
||||
|
||||
plt.hist(id_counts, bins=50)
|
||||
|
||||
# %%
|
||||
def compute_normalized_class_weights(class_counts, max_resamples=10):
    """
    Compute per-class resample counts inversely proportional to class frequency.

    Each class receives a weight of ``total_samples / count``. The weights are
    divided by the largest weight (so the rarest class has weight 1), scaled by
    ``max_resamples`` and rounded to the nearest integer. The result is a set
    of integer counts, not a distribution: it does not sum to 1, and very
    frequent classes may round down to 0.

    Args:
        class_counts (array-like): Positive sample count for each class.
        max_resamples (int): Resample count assigned to the rarest class.

    Returns:
        numpy.ndarray: Integer resample count for each class, in the same order
        as ``class_counts``. Empty input yields an empty integer array.

    Raises:
        ValueError: If any count is zero or negative, because an
            inverse-frequency weight is undefined for such a class.
    """
    class_counts = np.asarray(class_counts)
    if class_counts.size == 0:
        # np.max would raise on an empty array; there is nothing to weight.
        return np.array([], dtype=int)
    if np.any(class_counts <= 0):
        # A zero count would divide to inf and turn into nan after normalizing.
        raise ValueError("class_counts must contain only positive counts")

    total_samples = np.sum(class_counts)
    class_weights = total_samples / class_counts
    # The rarest class has the largest weight; rescale so that weight becomes 1.
    normalized_weights = class_weights / np.max(class_weights)
    # Scale so the rarest class is resampled `max_resamples` times.
    resample_counts = normalized_weights * max_resamples
    # np.round rounds exact halves to the nearest even integer.
    return np.round(resample_counts).astype(int)
|
||||
|
||||
# %%
|
||||
id_weights = compute_normalized_class_weights(id_counts, max_resamples=10)
|
||||
|
||||
# %%
|
||||
id_weights
|
||||
# %%
|
||||
id_mask = train_df['entity_id'] == 536
|
||||
train_df[id_mask]
|
||||
|
||||
# %%
|
||||
id_counts.index.to_list()
|
||||
# %%
|
||||
|
|
|
@ -18,12 +18,11 @@ id2label = {}
|
|||
for _, row in entity_df.iterrows():
|
||||
id2label[row['id']] = row['name']
|
||||
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
train_df.sort_values(by=['entity_id']).to_markdown('out.md')
|
||||
|
||||
# %%
|
||||
data_path = '../train/class_bert_process/classification_prediction/exports/result.csv'
|
||||
data_path = '../train/class_bert_process/prediction/exports/result.csv'
|
||||
prediction_df = pd.read_csv(data_path)
|
||||
|
||||
# %%
|
||||
|
@ -39,26 +38,31 @@ new_df = pd.concat((test_df, prediction_df ), axis=1)
|
|||
mismatch_mask = new_df['entity_id'] != new_df['class_prediction']
|
||||
mismatch_df = new_df[mismatch_mask]
|
||||
|
||||
# %%
|
||||
len(mismatch_df)
|
||||
|
||||
# %%
|
||||
# print the top 10 offending classes
|
||||
print(mismatch_df['entity_id'].value_counts()[:10])
|
||||
|
||||
|
||||
# %%
|
||||
# Convert the whole dataframe as a string and display
|
||||
# print the mismatch_df
|
||||
print(mismatch_df.to_markdown())
|
||||
print(mismatch_df.sort_values(by=['entity_id']).to_markdown())
|
||||
|
||||
# %%
|
||||
mismatch_df.to_csv('error.csv')
|
||||
|
||||
# %%
|
||||
# let us see the test mentions
|
||||
select_value = 434
|
||||
select_value = 268
|
||||
select_mask = mismatch_df['entity_id'] == select_value
|
||||
mismatch_df[select_mask]
|
||||
|
||||
# %%
|
||||
# let us see the train mentions
|
||||
select_value = 434
|
||||
select_value = 452
|
||||
select_mask = train_df['entity_id'] == select_value
|
||||
train_df[select_mask]
|
||||
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
*
|
||||
!.gitignore
|
||||
!*.txt
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,3 @@
|
|||
*
|
||||
!.gitignore
|
||||
!*.txt
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,3 @@
|
|||
*
|
||||
!.gitignore
|
||||
!*.txt
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,3 @@
|
|||
*
|
||||
!.gitignore
|
||||
!*.txt
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1 +0,0 @@
|
|||
*.csv
|
|
@ -0,0 +1,699 @@
|
|||
id,name,type_id,type_name
|
||||
1,(E)JES,2,App
|
||||
2,A-Auto Job Scheduling Software,2,App
|
||||
3,Activiti,2,App
|
||||
4,Adobe Acrobat Reader,2,App
|
||||
5,Ansible,2,App
|
||||
6,Apache ActiveMQ,2,App
|
||||
7,Apache Hbase,2,App
|
||||
8,Apache Hive,2,App
|
||||
9,Apache Kafka,2,App
|
||||
10,Apache ServiceMix,2,App
|
||||
11,Apache Solr,2,App
|
||||
12,Apache Subversion,2,App
|
||||
13,Application Development Facility (ADF),2,App
|
||||
14,Asterisk,2,App
|
||||
15,Automic Job Scheduler,2,App
|
||||
16,Autosys,2,App
|
||||
17,Bluebeam|Bluebeam Q,2,App
|
||||
18,BMC Control-M,2,App
|
||||
19,BMC Identity Management,2,App
|
||||
20,Borland Database Engine (BDE),2,App
|
||||
21,Business Intelligence and Reporting Tools (BIRT),2,App
|
||||
22,CA Gen,2,App
|
||||
23,CA Introscope,2,App
|
||||
24,CA-Panvalet,2,App
|
||||
25,CA-TELON,2,App
|
||||
26,Casegen,2,App
|
||||
27,Chef Automate,2,App
|
||||
28,Cisco AMP for Endpoints,2,App
|
||||
29,CiscoWorks LAN Management Solution (LMS),2,App
|
||||
30,Citrix Virtual Apps and Desktops,2,App
|
||||
31,Citrix ADC CPX,2,App
|
||||
32,Citrix Provisioning,2,App
|
||||
33,Clarify,2,App
|
||||
34,Clarity LIMS,2,App
|
||||
35,LabWare LIMS,2,App
|
||||
36,Cognos,2,App
|
||||
37,Coldfusion,2,App
|
||||
38,ConceptWave,2,App
|
||||
39,CONNAPI,2,App
|
||||
40,Connect Direct,2,App
|
||||
41,Cornerstone software,2,App
|
||||
42,Crystal Reports,2,App
|
||||
43,DB2,2,App
|
||||
44,Documentum Content Server,2,App
|
||||
45,Drupal,2,App
|
||||
46,Eclipse,2,App
|
||||
47,Elastic (ELK) Stack,2,App
|
||||
48,ETAP License Manager (LM),2,App
|
||||
49,ExamDiff,2,App
|
||||
50,F5 Secure Web Gateway Services,2,App
|
||||
51,FileMaker Pro,2,App
|
||||
52,FlexNet Manager Suite,2,App
|
||||
53,FTP Voyager,2,App
|
||||
54,Genymotion,2,App
|
||||
55,Google Chrome,2,App
|
||||
56,Greenplum DB,2,App
|
||||
57,Hadoop,2,App
|
||||
58,HP aC++ compiler,2,App
|
||||
59,HP C/ANSI C compiler,2,App
|
||||
60,HP Operations Orchestration (HPOO),2,App
|
||||
61,HP Server Automation (HPSA),2,App
|
||||
62,IBM BigFix Platform,2,App
|
||||
63,IBM Business Monitor,2,App
|
||||
64,IBM Business Process Manager,2,App
|
||||
65,IBM Content Manager OnDemand (CMOD),2,App
|
||||
66,IBM FileNet P8 Platform,2,App
|
||||
67,IBM InfoSphere DataStage,2,App
|
||||
68,IBM Integration Bus,2,App
|
||||
69,IBM License Metric Tool,2,App
|
||||
70,IBM Maximo,2,App
|
||||
71,IBM Migration Utility,2,App
|
||||
72,IBM Mobile Foundation,2,App
|
||||
73,IBM Operational Decision Manager (ODM),2,App
|
||||
74,IBM Spectrum Scale,2,App
|
||||
75,IBM Tivoli Asset Management,2,App
|
||||
76,IBM Tivoli Composite Application Manager,2,App
|
||||
77,IBM Tivoli Monitoring,2,App
|
||||
78,IBM Tivoli Storage Manager,2,App
|
||||
79,IBM Tivoli Workload Scheduler (TWS),2,App
|
||||
80,IBM WebSphere Business Integration Adaptor,2,App
|
||||
81,IBM Websphere MQ,2,App
|
||||
82,IBM WebSphere MQ Telemetry,2,App
|
||||
83,IBM WebSphere Transformation Extender (WTX),2,App
|
||||
84,IMS DB,2,App
|
||||
85,Info-ZIP,2,App
|
||||
86,Infobright Community Edition (ICE),2,App
|
||||
87,Informatica PowerCenter,2,App
|
||||
88,Ingres,2,App
|
||||
89,JBoss|JBoss Enterprise Service Bus,2,App
|
||||
90,Jenkins,2,App
|
||||
91,joinIT,2,App
|
||||
92,LifeFlow,2,App
|
||||
93,Lotus Notes,2,App
|
||||
94,MaaS360,2,App
|
||||
95,Malwarebytes Anti-Malware,2,App
|
||||
96,ManageEngine ADSelfService Plus,2,App
|
||||
97,MarkLogic DB,2,App
|
||||
98,Memcached,2,App
|
||||
99,Microsoft Access,2,App
|
||||
100,Microsoft BizTalk Adapters for Host Systems,2,App
|
||||
101,Microsoft Dynamics AX,2,App
|
||||
102,Microsoft Endpoint Configuration Manager (SCCM),2,App
|
||||
103,Microsoft Excel,2,App
|
||||
104,Microsoft Exchange Server,2,App
|
||||
105,Microsoft Forefront Identity Manager (FIM),2,App
|
||||
106,Microsoft InfoPath,2,App
|
||||
107,Microsoft Internet Explorer,2,App
|
||||
108,Microsoft ISA Server,2,App
|
||||
109,Microsoft MQ,2,App
|
||||
110,Microsoft System Center Endpoint Protection,2,App
|
||||
111,Microsoft Visual Studio,2,App
|
||||
112,Microsoft Web Deploy,2,App
|
||||
113,Microsoft Web Farm Framework (WFF),2,App
|
||||
114,Microsoft Web Platform Installer,2,App
|
||||
115,Model Driven Workflow (MDW),2,App
|
||||
116,MongoDB,2,App
|
||||
117,Mozilla Firefox,2,App
|
||||
118,MQ Client,2,App
|
||||
119,MS Office 365,2,App
|
||||
120,MS SQL Server,2,App
|
||||
121,MS SQL Server Compact,2,App
|
||||
122,MySQL,2,App
|
||||
123,Neo4j,2,App
|
||||
124,Nexus Repository OSS,2,App
|
||||
125,Nix package manager,2,App
|
||||
126,OpenLDAP,2,App
|
||||
127,OpenText Exstream,2,App
|
||||
128,OpenVPN,2,App
|
||||
129,Oracle Access Management,2,App
|
||||
130,Oracle ADF,2,App
|
||||
131,Oracle APEX,2,App
|
||||
132,Oracle BI Publisher,2,App
|
||||
133,Oracle Business Intelligence,2,App
|
||||
134,Oracle Database,2,App
|
||||
135,Oracle Designer,2,App
|
||||
136,Oracle Enterprise Manager,2,App
|
||||
137,Oracle Forms,2,App
|
||||
138,Oracle Hyperion|Hyperion Interactive Reporting,2,App
|
||||
139,Oracle Hyperion|Hyperion Planning,2,App
|
||||
140,Oracle Net Services,2,App
|
||||
141,Oracle Real Application Clusters (RAC),2,App
|
||||
142,Oracle Retail Point-of-Service,2,App
|
||||
143,Oracle Service Bus,2,App
|
||||
144,Oracle Smart View,2,App
|
||||
145,Oracle SOA Suite,2,App
|
||||
146,Oracle SQL Developer,2,App
|
||||
147,Oracle TimesTen In-Memory Database,2,App
|
||||
148,Oracle Warehouse Builder (OWB),2,App
|
||||
149,Orbix,2,App
|
||||
150,Pentaho,2,App
|
||||
151,PeopleSoft,2,App
|
||||
152,Perkin Elmer Informatics (PKI),2,App
|
||||
153,Pervasive PSQL,2,App
|
||||
154,PIPE-FLO,2,App
|
||||
155,PKZIP,2,App
|
||||
156,Planview,2,App
|
||||
157,PostgreSQL,2,App
|
||||
158,Powerbuilder,2,App
|
||||
159,Primavera P6,2,App
|
||||
160,Pro*COBOL,2,App
|
||||
161,ProjectWise,2,App
|
||||
162,ProjectWise Web Server,2,App
|
||||
163,PVCS Version Manager,2,App
|
||||
164,QlikView,2,App
|
||||
165,RabbitMQ,2,App
|
||||
166,Rational ClearCase,2,App
|
||||
167,Rational ClearQuest,2,App
|
||||
168,Redis,2,App
|
||||
169,Remedy,2,App
|
||||
170,Riak,2,App
|
||||
171,RightFax,2,App
|
||||
172,Rumba,2,App
|
||||
173,SAP BusinessObjects BI server,2,App
|
||||
174,SAP ERP,2,App
|
||||
175,SAP HANA DB,2,App
|
||||
176,SAP MaxDB,2,App
|
||||
177,SAP NetWeaver Business Warehouse,2,App
|
||||
178,SAP SQL Anywhere,2,App
|
||||
179,SAP Web Dynpro,2,App
|
||||
180,Sentry,2,App
|
||||
181,SharePoint,2,App
|
||||
182,Siebel,2,App
|
||||
183,SNA Manager,2,App
|
||||
184,SnagIt,2,App
|
||||
185,solidDB,2,App
|
||||
186,SonarQube,2,App
|
||||
187,SpaceMonger,2,App
|
||||
188,Splunk,2,App
|
||||
189,SQLIO,2,App
|
||||
190,Sybase SQL Server,2,App
|
||||
191,Syncsort,2,App
|
||||
192,Sysinternal Tools,2,App
|
||||
193,Sysinternal Tools|*,2,App
|
||||
194,Sysinternal Tools|AccessEnum,2,App
|
||||
195,Sysinternal Tools|ClockRes,2,App
|
||||
196,Sysinternal Tools|Coreinfo,2,App
|
||||
197,Sysinternal Tools|DiskExt,2,App
|
||||
198,Sysinternal Tools|DiskMon,2,App
|
||||
199,Sysinternal Tools|Hex2dec,2,App
|
||||
200,Sysinternal Tools|Junction,2,App
|
||||
201,Sysinternal Tools|LDMDump,2,App
|
||||
202,Sysinternal Tools|LoadOrder,2,App
|
||||
203,Sysinternal Tools|PipeList,2,App
|
||||
204,Sysinternal Tools|Process Explorer,2,App
|
||||
205,Sysinternal Tools|PsKill,2,App
|
||||
206,Sysinternal Tools|PsPasswd,2,App
|
||||
207,Sysinternal Tools|SDelete,2,App
|
||||
208,Sysinternal Tools|ShareEnum,2,App
|
||||
209,Sysinternal Tools|Sync,2,App
|
||||
210,Sysinternal Tools|TCPView,2,App
|
||||
211,Sysinternal Tools|VMMap,2,App
|
||||
212,Sysinternal Tools|Whois,2,App
|
||||
213,Tableau,2,App
|
||||
214,TCPLink Enterprise Server,2,App
|
||||
215,Teradata,2,App
|
||||
216,Teradata QS Server,2,App
|
||||
217,TIBCO Business Works (BW),2,App
|
||||
218,TIBCO InConcert,2,App
|
||||
219,TIBCO Rendezvous,2,App
|
||||
220,Tivoli Access Manager (TAM),2,App
|
||||
221,TortoiseCVS,2,App
|
||||
222,TortoiseSVN,2,App
|
||||
223,TSO/ISPF,2,App
|
||||
224,TWS zCentric,2,App
|
||||
225,Uniface,2,App
|
||||
226,ViewNow X Server,2,App
|
||||
227,Virtual I/O Server,2,App
|
||||
228,Visibroker,2,App
|
||||
229,VMware Solution Exchange Marketplace (VSX),2,App
|
||||
230,VMware Tools,2,App
|
||||
231,VMware vCenter,2,App
|
||||
232,WebFOCUS,2,App
|
||||
233,WebLogic Integration,2,App
|
||||
234,WebSphere Commerce Suite (WCS),2,App
|
||||
235,WebSphere Message Broker,2,App
|
||||
236,Wherescape Red,2,App
|
||||
237,Windchill,2,App
|
||||
238,Windows Indexing Service,2,App
|
||||
239,Windows Terminal Server (WTS),2,App
|
||||
240,WingArc SVF,2,App
|
||||
241,WinMerge,2,App
|
||||
242,WinRAR,2,App
|
||||
243,WinSCP,2,App
|
||||
244,Wise Package Studio,2,App
|
||||
245,Wordpress,2,App
|
||||
246,XAMPP,2,App
|
||||
247,ZAP BI,2,App
|
||||
248,ZeroMQ,2,App
|
||||
249,Zerto Virtual Replication,2,App
|
||||
250,IBM PowerHA,2,App
|
||||
251,Tivoli Netcool/OMNIbus,2,App
|
||||
252,IBM ILOG Views,2,App
|
||||
253,IBM ILOG CPLEX,2,App
|
||||
254,IBM ILOG Jviews,2,App
|
||||
255,IBM ILOG Elixir,2,App
|
||||
256,IBM ILOG Supply Chain Apps,2,App
|
||||
257,ILOG Solver,2,App
|
||||
258,SQLite,2,App
|
||||
259,Apache HTTP Server,8,App Server
|
||||
260,Apache Tomcat,8,App Server
|
||||
261,ArcGIS Server,8,App Server
|
||||
262,Oracle WebLogic Server,8,App Server
|
||||
263,GlassFish,8,App Server
|
||||
264,HAProxy,8,App Server
|
||||
265,IBM HTTP Server,8,App Server
|
||||
266,IIS,8,App Server
|
||||
267,JBoss,8,App Server
|
||||
268,JBoss|*,8,App Server
|
||||
269,Kitura,8,App Server
|
||||
270,Lotus Domino,8,App Server
|
||||
271,Lucee,8,App Server
|
||||
272,Netscape Application Server (NAS),8,App Server
|
||||
273,Netscape Enterprise Server (NES),8,App Server
|
||||
274,Nginx,8,App Server
|
||||
275,Oracle Application Server,8,App Server
|
||||
276,Oracle WebCenter Content Server,8,App Server
|
||||
277,Pivotal tc Server,8,App Server
|
||||
278,Resin Web Server,8,App Server
|
||||
279,SAP NetWeaver App Server,8,App Server
|
||||
280,Spark,8,App Server
|
||||
281,Oracle iPlanet Web Server,8,App Server
|
||||
282,UltiDev Web Server Pro (UWS),8,App Server
|
||||
283,webMethods Integration Server,8,App Server
|
||||
284,Websphere Application Server (WAS),8,App Server
|
||||
285,WebSphere Liberty,8,App Server
|
||||
286,WebSphere Portal Server,8,App Server
|
||||
287,Websphere Process Server,8,App Server
|
||||
288,WebSphere Process Server,8,App Server
|
||||
289,Oracle Real-Time Decisions (RTD),8,App Server
|
||||
290,CA API Gateway,4,HW
|
||||
291,Citrix ADC SDX,4,HW
|
||||
292,Citrix ADC MPX,4,HW
|
||||
293,HP Nonstop,4,HW
|
||||
294,IBM DataPower Gateway,4,HW
|
||||
295,IBM Power Systems,4,HW
|
||||
296,Intel Xeon Processor,4,HW
|
||||
297,Net Optics Taps,4,HW
|
||||
298,Oracle Exadata,4,HW
|
||||
299,AutoIt,9,Lang
|
||||
300,AWK,9,Lang
|
||||
301,BASIC,9,Lang
|
||||
302,Brainscript,9,Lang
|
||||
303,C,9,Lang
|
||||
304,C#,9,Lang
|
||||
305,C++,9,Lang
|
||||
306,C++|Visual C++,9,Lang
|
||||
307,Cascading Style Sheets (CSS),9,Lang
|
||||
308,Clipper,9,Lang
|
||||
309,CLIST,9,Lang
|
||||
310,COBOL,9,Lang
|
||||
311,ColdFusion Markup Language (CFML),9,Lang
|
||||
312,Data Language Interface (DL/I),9,Lang
|
||||
313,Delphi,9,Lang
|
||||
314,Easytrieve,9,Lang
|
||||
315,Expect,9,Lang
|
||||
316,eXtensible HyperText Markup Language (XHTML),9,Lang
|
||||
317,Extensible Markup Language (XML),9,Lang
|
||||
318,Extensible Markup Language (XML)|MSXML,9,Lang
|
||||
319,Extensible Stylesheet Language (XSL),9,Lang
|
||||
320,Extensible Stylesheet Language Transformations (XLST),9,Lang
|
||||
321,FOCUS,9,Lang
|
||||
322,Fortran,9,Lang
|
||||
323,Go,9,Lang
|
||||
324,GraphQL,9,Lang
|
||||
325,Groovy,9,Lang
|
||||
326,HiveQL,9,Lang
|
||||
327,Hypertext Markup Language (HTML),9,Lang
|
||||
328,IBM High Level Assembler (HLASM),9,Lang
|
||||
329,IBM i Control Language (CL),9,Lang
|
||||
330,IBM Informix-4GL,9,Lang
|
||||
331,Java,9,Lang
|
||||
332,Java|Extensible Stylesheet Language (XSL),9,Lang
|
||||
333,Java|Java Enterprise Edition (Java EE),9,Lang
|
||||
334,Java|Java Standard Edition (Java SE),9,Lang
|
||||
335,Java|JavaServer Pages (JSP),9,Lang
|
||||
336,Java|JavaServer Pages (JSP)|Scriptlets,9,Lang
|
||||
337,JavaScript,9,Lang
|
||||
338,JCL,9,Lang
|
||||
339,Job Information Language (JIL),9,Lang
|
||||
340,JScript,9,Lang
|
||||
341,Lisp,9,Lang
|
||||
342,Niakwa Programming Language (NPL),9,Lang
|
||||
343,Objective C,9,Lang
|
||||
344,OpenEdge ABL,9,Lang
|
||||
345,Pascal,9,Lang
|
||||
346,Pascal|Object Pascal,9,Lang
|
||||
347,Perl,9,Lang
|
||||
348,Perl|ActivePerl,9,Lang
|
||||
349,Perl|Rex,9,Lang
|
||||
350,PHP,9,Lang
|
||||
351,PL/I,9,Lang
|
||||
352,PL/SQL,9,Lang
|
||||
353,PRO*C,9,Lang
|
||||
354,Python,9,Lang
|
||||
355,R,9,Lang
|
||||
356,Rexx,9,Lang
|
||||
357,RPG,9,Lang
|
||||
358,Ruby,9,Lang
|
||||
359,Salesforce Object Query Language (SOQL),9,Lang
|
||||
360,SAS,9,Lang
|
||||
361,Sass,9,Lang
|
||||
362,Scala,9,Lang
|
||||
363,Smalltalk,9,Lang
|
||||
364,Swift,9,Lang
|
||||
365,TCL,9,Lang
|
||||
366,Transact-SQL,9,Lang
|
||||
367,TypeScript,9,Lang
|
||||
368,VB.NET,9,Lang
|
||||
369,VBScript,9,Lang
|
||||
370,Visual Basic,9,Lang
|
||||
371,Visual Basic for Applications (VBA),9,Lang
|
||||
372,Visual FoxPro,9,Lang
|
||||
373,VoiceXML,9,Lang
|
||||
374,Xbase++,9,Lang
|
||||
375,Apache Lucene,12,Lib
|
||||
376,Apache Xerces,12,Lib
|
||||
377,Cascading Style Sheets (CSS)|Bootstrap,12,Lib
|
||||
378,Java|Apache Camel,12,Lib
|
||||
379,Java|Apache Commons BeanUtils,12,Lib
|
||||
380,Java|Apache PDFBox,12,Lib
|
||||
381,Java|Apache Velocity,12,Lib
|
||||
382,Java|EclipseLink,12,Lib
|
||||
383,Java|Enterprise JavaBeans (EJB),12,Lib
|
||||
384,Java|EZMorph,12,Lib
|
||||
385,Java|Google Web Toolkit (GWT),12,Lib
|
||||
386,Java|Hibernate,12,Lib
|
||||
387,Java|IBM SDK,12,Lib
|
||||
388,Java|Java Development Kit (JDK),12,Lib
|
||||
389,Java|Java Message Service (JMS),12,Lib
|
||||
390,Java|Java Web Start,12,Lib
|
||||
391,Java|JavaServer Faces (JSF),12,Lib
|
||||
392,Java|JDBC,12,Lib
|
||||
393,Java|JRuby Core,12,Lib
|
||||
394,Java|Log4j,12,Lib
|
||||
395,Java|Quartz,12,Lib
|
||||
396,Java|Remote Method Invocation (RMI),12,Lib
|
||||
397,Java|Servlet,12,Lib
|
||||
398,Java|Spring,12,Lib
|
||||
399,Java|Spring|Spring Boot,12,Lib
|
||||
400,Java|Spring|Spring Cloud Data Flow,12,Lib
|
||||
401,Java|Spring|Spring MVC,12,Lib
|
||||
402,Java|Struts,12,Lib
|
||||
403,Java|Swing,12,Lib
|
||||
404,Java|Vaadin,12,Lib
|
||||
405,JavaScript|AJAX,12,Lib
|
||||
406,JavaScript|AngularJS,12,Lib
|
||||
407,JavaScript|Draw2D,12,Lib
|
||||
408,JavaScript|Express.js,12,Lib
|
||||
409,JavaScript|Ext JS,12,Lib
|
||||
410,JavaScript|jqGrid,12,Lib
|
||||
411,JavaScript|JQuery,12,Lib
|
||||
412,JavaScript|Jquery|jQuery UI,12,Lib
|
||||
413,JavaScript|React,12,Lib
|
||||
414,JavaScript|script.aculo.us,12,Lib
|
||||
415,JavaScript|Valums AJAX File Uploader,12,Lib
|
||||
416,OWASP Enterprise Security API (ESAPI),12,Lib
|
||||
417,Perl|Oraperl,12,Lib
|
||||
418,Android,6,OS
|
||||
419,BeOS,6,OS
|
||||
420,Cisco IOS,6,OS
|
||||
421,DART,6,OS
|
||||
422,Fabric OS,6,OS
|
||||
423,GNU,6,OS
|
||||
424,IBM i,6,OS
|
||||
425,iOS,6,OS
|
||||
426,Linux,6,OS
|
||||
427,Linux|CentOS,6,OS
|
||||
428,Linux|Check Point,6,OS
|
||||
429,Linux|Debian,6,OS
|
||||
430,Linux|Junos OS,6,OS
|
||||
431,Linux|openSUSE,6,OS
|
||||
432,Linux|Oracle Linux,6,OS
|
||||
433,Linux|Photon OS,6,OS
|
||||
434,Linux|Red Hat Enterprise Linux,6,OS
|
||||
435,Linux|SUSE Linux Enterprise Server,6,OS
|
||||
436,Linux|Ubuntu,6,OS
|
||||
437,Linux|zLinux,6,OS
|
||||
438,macOS,6,OS
|
||||
439,MVS,6,OS
|
||||
440,MVS|OS/390,6,OS
|
||||
441,MVS|z/OS,6,OS
|
||||
442,OpenVMS,6,OS
|
||||
443,OS/2,6,OS
|
||||
444,Unix,6,OS
|
||||
445,Unix|AIX,6,OS
|
||||
446,Unix|BSD,6,OS
|
||||
447,Unix|BSD|FreeBSD,6,OS
|
||||
448,Unix|BSD|SunOS,6,OS
|
||||
449,Unix|HP-UX,6,OS
|
||||
450,Windows,6,OS
|
||||
451,Windows|Windows Desktop,6,OS
|
||||
452,Windows|Windows Server,6,OS
|
||||
453,Linux|Fedora,6,OS
|
||||
454,Linux|Amazon Linux,6,OS
|
||||
455,Clarify|Clear Basic,5,Plugin
|
||||
456,Eclipse|ATLAS Transformation Language (ATL),5,Plugin
|
||||
457,IBM BigFix Platform|Client Deploy Tool,5,Plugin
|
||||
458,IBM Integration Bus|Extended Structured Query Language (ESQL),5,Plugin
|
||||
459,IBM Tivoli Asset Management|Asset Discovery for Distributed,5,Plugin
|
||||
460,IBM Tivoli Storage Manager|TSM API,5,Plugin
|
||||
461,IBM Tivoli Storage Manager|TSM Client,5,Plugin
|
||||
462,IBM Tivoli Storage Manager|TSM Storage Agent,5,Plugin
|
||||
463,IBM Tivoli Storage Manager|VSS Requestor,5,Plugin
|
||||
464,Microsoft Exchange Server|Veeam Explorer,5,Plugin
|
||||
465,MS SQL Server|MS SQL Server Browser,5,Plugin
|
||||
466,MS SQL Server|Data Transformation Services,5,Plugin
|
||||
467,MS SQL Server|Log Reader Agent,5,Plugin
|
||||
468,MS SQL Server|SQL Server Analysis Services (SSAS),5,Plugin
|
||||
469,MS SQL Server|SQL Server Database Engine,5,Plugin
|
||||
470,MS SQL Server|SQL Server Integration Services (SSIS),5,Plugin
|
||||
471,MS SQL Server|SQL Server Management Studio,5,Plugin
|
||||
472,MS SQL Server|SQL Server Report Builder,5,Plugin
|
||||
473,MS SQL Server|SQL Server Reporting Services (SSRS),5,Plugin
|
||||
474,Oracle Database|Jserver,5,Plugin
|
||||
475,Oracle Database|Oracle Spatial and Graph,5,Plugin
|
||||
476,SAP ERP|SAP EHP,5,Plugin
|
||||
477,SAP ERP|SAP Kernel,5,Plugin
|
||||
478,Oracle Database|SQL*Plus,5,Plugin
|
||||
479,Sybase SQL Server|Sybase Central,5,Plugin
|
||||
480,Sybase SQL Server|Sybase Dsedit,5,Plugin
|
||||
481,TIBCO Business Works (BW)|Integration Manager,5,Plugin
|
||||
482,.NET Framework|Common Runtime Library,7,Runlib
|
||||
483,.NET Framework|log4net,7,Runlib
|
||||
484,.NET Framework|Magick.NET,7,Runlib
|
||||
485,.NET Framework|Windows Communication Foundation (WCF),7,Runlib
|
||||
486,.NET Framework|Windows Workflow Foundation (WF),7,Runlib
|
||||
487,.NET Framework|WinForms,7,Runlib
|
||||
488,ActiveX|ADO,7,Runlib
|
||||
489,IIS|Easy Migration Tool (IEMT),7,Runlib
|
||||
490,IIS|Application Request Routing (ARR),7,Runlib
|
||||
491,IIS|IIS Manager,7,Runlib
|
||||
492,JBoss|JBoss Seam,7,Runlib
|
||||
493,JBoss|Wildfly,7,Runlib
|
||||
494,Oracle Application Server|Oracle Transparent Gateway,7,Runlib
|
||||
495,Oracle WebCenter Content Server|Idoc Script,7,Runlib
|
||||
496,SAP NetWeaver App Server|ABAP,7,Runlib
|
||||
497,.NET Framework,10,Runtime
|
||||
498,Active Directory (AD),10,Runtime
|
||||
499,Active Server Pages (ASP),10,Runtime
|
||||
500,ActiveX,10,Runtime
|
||||
501,Apache Cordova,10,Runtime
|
||||
502,CICS,10,Runtime
|
||||
503,Docker,10,Runtime
|
||||
504,Flash,10,Runtime
|
||||
505,HTTP File Server,10,Runtime
|
||||
506,Java Runtime Environment (JRE),10,Runtime
|
||||
507,Node.js,10,Runtime
|
||||
508,Ruby on Rails,10,Runtime
|
||||
509,VisualForce,10,Runtime
|
||||
510,EMC Celerra,11,Storage
|
||||
511,Application Lifecycle Management (ALM),1,Technology
|
||||
512,Assembler Language,1,Technology
|
||||
513,Batch Management Software (BMS),1,Technology
|
||||
514,Business Object Reports,1,Technology
|
||||
515,Common Gateway Interface (CGI),1,Technology
|
||||
516,Compopent Object Model (COM),1,Technology
|
||||
517,Common Object Request Broker Architecture (CORBA),1,Technology
|
||||
518,CORBA Interface Definition Language (CORBA IDL),1,Technology
|
||||
519,Data Control Language (DCL),1,Technology
|
||||
520,Database (DB),1,Technology
|
||||
521,Electronic Data Interchange (EDI),1,Technology
|
||||
522,Application Web Server,1,Technology
|
||||
523,Java-based Document Object Model for XML (JDOM),1,Technology
|
||||
524,Lightweight Directory Access Protocol (LDAP),1,Technology
|
||||
525,Open Database Connectivity (ODBC),1,Technology
|
||||
526,Order Management System (OMS),1,Technology
|
||||
527,Oracle Web Services,1,Technology
|
||||
528,Reporting Services,1,Technology
|
||||
529,Representational State Transfer (REST),1,Technology
|
||||
530,Service-Oriented Architecture (SOA),1,Technology
|
||||
531,Simple Object Access Protocol (SOAP),1,Technology
|
||||
532,SQL,9,Lang
|
||||
533,YAML,1,Technology
|
||||
534,Model-view-controller (MVC),1,Technology
|
||||
535,Application Server,1,Technology
|
||||
536,Cloud,1,Technology
|
||||
537,Competency and Quality Assurance Server,1,Technology
|
||||
538,Device Provisioning Engines (DPE),1,Technology
|
||||
539,E-business solution,1,Technology
|
||||
540,Enterprise Service Bus(ESB),1,Technology
|
||||
541,File Server,1,Technology
|
||||
542,General Ledger,1,Technology
|
||||
543,HTTP client,1,Technology
|
||||
544,HTTP Server,1,Technology
|
||||
545,Integrated Safe System of Work (ISSOW),1,Technology
|
||||
546,Internet Exchange Point - Full Stack (ixp-ft),1,Technology
|
||||
547,Internet Message Access Protocol (IMAP),1,Technology
|
||||
548,JSON,1,Technology
|
||||
549,KVS Application Server,1,Technology
|
||||
550,KVS File Server,1,Technology
|
||||
551,KVS Proxy Server,1,Technology
|
||||
552,mainframe,1,Technology
|
||||
553,Manufacturing Execution System (MES),1,Technology
|
||||
554,Mobile,1,Technology
|
||||
555,NonSQL,1,Technology
|
||||
556,SaaS,1,Technology
|
||||
557,Storage Area Network (SAN),1,Technology
|
||||
558,Supplier Registration System Application Server,1,Technology
|
||||
559,Virtual Appliance,1,Technology
|
||||
560,Webtop,1,Technology
|
||||
561,Proxy Server,1,Technology
|
||||
562,Utility,1,Technology
|
||||
563,Citrix ADC,3,VM
|
||||
564,Citrix ADC VPX,3,VM
|
||||
565,Citrix ADC BLX,3,VM
|
||||
566,InterScan Messaging Security Virtual Appliance (IMSVA),3,VM
|
||||
567,Oracle VM,3,VM
|
||||
568,VMware ESXi,3,VM
|
||||
569,VMware Server,3,VM
|
||||
570,IBM WebSphere Transformation Extender (WTX),2,App
|
||||
571,Oracle Retail Point-of-Service,2,App
|
||||
572,Structured Query Language (SQL),1,Technology
|
||||
573,TSO/ISPF,2,App
|
||||
574,Model view controller (MVC),1,Technology
|
||||
575,|*,6,OS
|
||||
576,Linux|*,6,OS
|
||||
577,MVS|*,6,OS
|
||||
578,Unix|*,6,OS
|
||||
579,Unix|BSD|*,6,OS
|
||||
580,Windows|*,6,OS
|
||||
581,MS SQL Server|*,2,App
|
||||
582,C#|*,9,Lang
|
||||
583,C++|*,9,Lang
|
||||
584,Java|*,9,Lang
|
||||
585,Perl|*,9,Lang
|
||||
586,PHP|*,9,Lang
|
||||
587,Python|*,9,Lang
|
||||
588,Ruby|*,9,Lang
|
||||
589,JavaScript|*,9,Lang
|
||||
590,Unix|BSD|OpenBSD,6,OS
|
||||
591,z/VSE,6,OS
|
||||
592,Active Server Pages (ASP)|*,9,Lang
|
||||
593,MS-DOS,6,OS
|
||||
594,COBOL|*,9,Lang
|
||||
595,VME,6,OS
|
||||
596,Extensible Markup Language (XML)|*,9,Lang
|
||||
597,DOS/360,6,OS
|
||||
598,z/TPF,6,OS
|
||||
599,Pascal|*,9,Lang
|
||||
600,Oracle WebLogic Server|*,8,App Server
|
||||
601,Websphere ILOG JRules BRMS,2,App
|
||||
602,Unix|BSD|NetBSD,6,OS
|
||||
603,SharePoint|*,2,App
|
||||
604,IBM Tivoli Storage Manager|*,2,App
|
||||
605,IBM Spectrum Scale|*,2,App
|
||||
606,IBM Tivoli Asset Management|*,2,App
|
||||
607,Oracle Hyperion|*,2,App
|
||||
608,z/VM,6,OS
|
||||
609,IIS|*,8,App Server
|
||||
610,Oracle Application Server|*,8,App Server
|
||||
611,instana,10,Runtime
|
||||
612,credstash,2,App
|
||||
613,Snyk,2,App
|
||||
614,Akka,2,App
|
||||
615,Varnish,8,App Server
|
||||
616,Datadog,10,Runtime
|
||||
617,API,1,Technology
|
||||
618,Hazelcast,10,Runtime
|
||||
619,Infinispan,2,App
|
||||
620,Nuxeo,10,Runtime
|
||||
621,ArangoDB,2,App
|
||||
622,Eclipse Che,2,App
|
||||
623,Amazon S3,2,App
|
||||
624,ClickHouse,2,App
|
||||
625,MinIO,2,App
|
||||
626,Elasticsearch,2,App
|
||||
627,XtraDB,2,App
|
||||
628,Keycloak,2,App
|
||||
629,Grafana,2,App
|
||||
630,Mattermost,10,Runtime
|
||||
631,Synapse,2,App
|
||||
632,Cloud IAM,2,App
|
||||
633,Knative,10,Runtime
|
||||
634,Apache Cassandra,2,App
|
||||
635,Kubeflow,10,Runtime
|
||||
636,Qiskit,2,App
|
||||
637,Microsoft Azure,2,App
|
||||
638,Strimzi,10,Runtime
|
||||
639,Sematext,10,Runtime
|
||||
640,Eclipse hawkBit,2,App
|
||||
641,Eclipse Ditto,2,App
|
||||
642,MariaDB,2,App
|
||||
643,Zadara,2,App
|
||||
644,Istio,2,App
|
||||
645,Vault,2,App
|
||||
646,Apache Druid,2,App
|
||||
647,etcd,2,App
|
||||
648,Traefik,8,App Server
|
||||
649,IBM Cloud,2,App
|
||||
650,YugabyteDB,2,App
|
||||
651,CockroachDB,2,App
|
||||
652,Jaeger,10,Runtime
|
||||
653,Natural Programming Language,9,Lang
|
||||
654,AcuCOBOL,9,Lang
|
||||
655,Ada,9,Lang
|
||||
656,ADABAS,2,App
|
||||
657,ADSO,9,Lang
|
||||
658,Ansible,2,App
|
||||
659,Batch,9,Lang
|
||||
660,Powershell,9,Lang
|
||||
661,COM+,10,Runtime
|
||||
662,Dataflex,9,Lang
|
||||
663,DDS,9,Lang
|
||||
664,Forte,9,Lang
|
||||
665,Foxpro,9,Lang
|
||||
666,IBM DB2 Purescale,2,App
|
||||
667,IDMS DB,2,App
|
||||
668,IDMS DML,9,Lang
|
||||
669,Jaguar,8,App Server
|
||||
670,EAServer,8,App Server
|
||||
671,Apache Cassandra,2,App
|
||||
672,IBM Netezza,4,HW
|
||||
673,OpenEdge,9,Lang
|
||||
674,OpenROAD,9,Lang
|
||||
675,Oracle Reports,2,App
|
||||
676,SAP Replication Server,2,App
|
||||
677,Git,2,App
|
||||
678,GitLab,2,App
|
||||
679,VSAM,2,App
|
||||
680,Cloud<>Apache HTTP Server,2,App
|
||||
681,Cloud<>Windows|Windows Server,2,App
|
||||
682,Cloud<>MS SQL Server,2,App
|
||||
683,Cloud<>Azure SQL Server Database,2,App
|
||||
684,Cloud<>MySQL,2,App
|
||||
685,Cloud<>Oracle Database,2,App
|
||||
686,Cloud<>PostgreSQL,2,App
|
||||
687,Cloud<>AWS RDS,2,App
|
||||
688,Cloud<>SAP HANA DB,2,App
|
||||
689,BMS Map,9,Lang
|
||||
690,DB400,2,App
|
||||
691,ILE,9,Lang
|
||||
692,Integrated Data Store (IDS),2,App
|
||||
693,ISAM,2,App
|
||||
694,Oracle RDS,2,App
|
||||
695,SAP IQ,2,App
|
||||
696,Cloud<>Linux,2,App
|
||||
697,Apache Maven,2,App
|
||||
698,IBM Basic Assembly Language (BAL),9,Lang
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,6 +1,6 @@
|
|||
|
||||
*******************************************************************************
|
||||
Accuracy: 0.79090
|
||||
F1 Score: 0.80996
|
||||
Precision: 0.88827
|
||||
Recall: 0.79090
|
||||
Accuracy: 0.77655
|
||||
F1 Score: 0.79605
|
||||
Precision: 0.85637
|
||||
Recall: 0.77655
|
|
@ -32,7 +32,7 @@ torch.set_float32_matmul_precision('high')
|
|||
BATCH_SIZE = 256
|
||||
|
||||
# %%
|
||||
data_path = '../../../data_import/train.csv'
|
||||
data_path = '../../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
|
@ -49,20 +49,31 @@ for idx, val in enumerate(target_id_list):
|
|||
|
||||
# introduce pre-processing functions
|
||||
def preprocess_text(text):
|
||||
|
||||
# 1. Normalize letter case
|
||||
text = text.upper()
|
||||
text = text.lower()
|
||||
|
||||
# 2. Remove punctuations
|
||||
# text = re.sub(r'[^\w\s]', '', text) # Retains only alphanumeric and spaces
|
||||
# Remove any non alphanumeric character
|
||||
# text = re.sub(r'[^\w\s]', ' ', text) # Retains only alphanumeric and spaces
|
||||
text = re.sub(r"[-;:]", " ", text)
|
||||
|
||||
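# Add space between digit followed by a letter
# NOTE(review): the patterns below only match uppercase A-Z; confirm they still fire if the text is lowercased first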
|
||||
text = re.sub(r"(\d)([A-Z])", r"\1 \2", text)
|
||||
|
||||
# Add space between letter followed by a digit
|
||||
text = re.sub(r"([A-Z])(\d)", r"\1 \2", text)
|
||||
|
||||
|
||||
# Substitute each run of digits with 'x'
|
||||
text = re.sub(r'\d+', 'x', text)
|
||||
|
||||
# standardize spacing
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
# 3. Substitute digits with '#'
|
||||
text = re.sub(r'\d', '#', text)
|
||||
|
||||
return text
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
# each element maps input to output
|
||||
|
@ -85,7 +96,7 @@ def process_df_to_dict(df):
|
|||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../../data_import/test.csv'
|
||||
data_path = '../../../esAppMod_data_import/test.csv'
|
||||
test_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
|
@ -45,17 +45,47 @@ def set_seed(seed):
|
|||
|
||||
set_seed(42)
|
||||
|
||||
SHUFFLES=5
|
||||
SHUFFLES=2
|
||||
|
||||
# %%
|
||||
|
||||
# import training file
|
||||
data_path = '../../data_import/train.csv'
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
# rather than use pattern, we use the real thing and property
|
||||
entity_ids = train_df['entity_id'].to_list()
|
||||
target_id_list = sorted(list(set(entity_ids)))
|
||||
|
||||
def compute_normalized_class_weights(class_counts, max_resamples=SHUFFLES):
    """Convert per-class sample counts into integer resample counts.

    Each class gets a weight inversely proportional to its count. The weights
    are scaled so that the rarest class receives ``max_resamples`` and are
    then rounded to integers, so frequent classes may round down to 0.

    Args:
        class_counts (array-like): Number of samples for each class.
        max_resamples (int): Resample count given to the rarest class.

    Returns:
        numpy.ndarray: Integer resample count per class, in the same order
        as ``class_counts``.

    Raises:
        ValueError: If any count is zero or negative.
    """
    class_counts = np.array(class_counts)

    # Nothing to weight; avoid np.max failing on an empty array.
    if class_counts.size == 0:
        return np.zeros(class_counts.shape, dtype=int)

    # A zero count would divide by zero and poison every weight with inf/NaN.
    if np.any(class_counts <= 0):
        raise ValueError("class_counts must contain only positive values")

    total_samples = np.sum(class_counts)
    class_weights = total_samples / class_counts

    # Rescale so that the highest weight (rarest class) is exactly 1.
    normalized_weights = class_weights / np.max(class_weights)

    # Scale so that the highest weight corresponds to `max_resamples`.
    resample_counts = normalized_weights * max_resamples

    # Round to integers.
    return np.round(resample_counts).astype(int)
|
||||
|
||||
# %%
|
||||
# Number of training rows per entity id, most frequent first.
id_counts = train_df['entity_id'].value_counts()
id_index = id_counts.index

# Resample count for each entity id, aligned with id_index.
id_weights = compute_normalized_class_weights(id_counts, max_resamples=SHUFFLES)

# entity id -> resample count
label2weight = {
    label: weight for label, weight in zip(id_index, id_weights)
}
|
||||
|
||||
|
||||
# %%
|
||||
id2label = {}
|
||||
|
@ -69,14 +99,26 @@ for idx, val in enumerate(target_id_list):
|
|||
def preprocess_text(text):
    """Normalize a raw mention string.

    Steps, in order: lowercase the text; turn '-', ';' and ':' into spaces;
    insert a space at every digit/letter boundary; collapse each run of
    digits into a single 'x'; squeeze whitespace and strip both ends.

    Args:
        text (str): Raw mention text.

    Returns:
        str: The normalized text.
    """
    text = text.lower()

    # Treat dashes, semicolons and colons as word separators.
    text = re.sub(r"[-;:]", " ", text)

    # Split digit/letter boundaries. The text is lowercase at this point, so
    # the letter class has to be [a-z]; [A-Z] can never match here.
    text = re.sub(r"(\d)([a-z])", r"\1 \2", text)
    text = re.sub(r"([a-z])(\d)", r"\1 \2", text)

    # Collapse every run of digits into one placeholder character.
    text = re.sub(r"\d+", "x", text)

    # Standardize spacing.
    text = re.sub(r"\s+", " ", text).strip()

    return text
|
||||
|
||||
|
||||
|
@ -123,6 +165,42 @@ def shuffle_text(text, n_shuffles=SHUFFLES):
|
|||
|
||||
return all_processed
|
||||
|
||||
# Long-form terms mapped to their abbreviation. The keys are applied as regex
# patterns by replace_terms_with_abbreviations.
# NOTE(review): process_df_to_dict runs preprocess_text first, which replaces
# digit runs with 'x'; the digit-bearing entries ('ibm database 2',
# 'release 2', and the 'db/2' / 'r2' abbreviations) look unreachable on that
# path - confirm.
term_to_abbrev = {
    r'job entry system': 'jes',
    r'subversion': 'svn',
    r'borland database engine': 'bde',
    r'business intelligence and reporting tools': 'birt',
    r'lan management solution': 'lms',
    r'laboratory information management system': 'lims',
    r'ibm database 2': 'db/2',
    r'integrated development environment': 'ide',
    r'software development kit': 'sdk',
    r'hp operations orchestration': 'hpoo',
    r'hp server automation': 'hpsa',
    r'internet information server': 'iis',
    r'release 2': 'r2',
    r'red hat enterprise linux': 'rhel',
    r'oracle enterprise linux': 'oel',
    r'websphere application server': 'was',
    r'application development facility': 'adf',
    r'server analysis services': 'ssas',
}

# Reverse mapping: word-bounded regex for each abbreviation -> long-form term.
# re.escape keeps any regex metacharacter in an abbreviation literal.
abbrev_to_term = {
    rf'\b{re.escape(abbrev)}\b': term
    for term, abbrev in term_to_abbrev.items()
}
|
||||
|
||||
def replace_terms_with_abbreviations(text):
    """Return ``text`` with every long-form term swapped for its abbreviation.

    Each key of ``term_to_abbrev`` is applied as a regex pattern, one after
    another in dictionary order.
    """
    result = text
    for term_pattern in term_to_abbrev:
        result = re.sub(term_pattern, term_to_abbrev[term_pattern], result)
    return result
|
||||
|
||||
def replace_abbreivations_with_terms(text):
    """Return ``text`` with every abbreviation expanded to its long-form term.

    Each key of ``abbrev_to_term`` is applied as a regex pattern, one after
    another in dictionary order.
    """
    result = text
    for abbrev_pattern in abbrev_to_term:
        result = re.sub(abbrev_pattern, abbrev_to_term[abbrev_pattern], result)
    return result
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# outputs a list of dictionaries
|
||||
# processes dataframe into lists of dictionaries
|
||||
|
@ -134,9 +212,12 @@ def process_df_to_dict(df):
|
|||
for _, row in df.iterrows():
|
||||
# produce shuffling
|
||||
index = row['entity_id']
|
||||
desc = row['mention']
|
||||
desc = preprocess_text(desc)
|
||||
processed_descs = shuffle_text(desc, n_shuffles=SHUFFLES)
|
||||
parent_desc = row['mention']
|
||||
parent_desc = preprocess_text(parent_desc)
|
||||
# ensure at least 1 shuffle
|
||||
# no_of_shuffles = label2weight[index] + 1
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(parent_desc, n_shuffles=no_of_shuffles)
|
||||
|
||||
for desc in processed_descs:
|
||||
element = {
|
||||
|
@ -145,12 +226,38 @@ def process_df_to_dict(df):
|
|||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
# perform abbrev_to_term
|
||||
desc = replace_terms_with_abbreviations(parent_desc)
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
|
||||
|
||||
for desc in processed_descs:
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
# perform term to abbrev
|
||||
desc = replace_abbreivations_with_terms(parent_desc)
|
||||
no_of_shuffles = SHUFFLES
|
||||
processed_descs = shuffle_text(desc, n_shuffles=no_of_shuffles)
|
||||
|
||||
for desc in processed_descs:
|
||||
element = {
|
||||
'text' : desc,
|
||||
'label': label2id[index], # ensure labels starts from 0
|
||||
}
|
||||
output_list.append(element)
|
||||
|
||||
|
||||
return output_list
|
||||
|
||||
|
||||
def create_dataset():
|
||||
# train
|
||||
data_path = '../../data_import/train.csv'
|
||||
data_path = '../../esAppMod_data_import/train.csv'
|
||||
train_df = pd.read_csv(data_path, skipinitialspace=True)
|
||||
|
||||
|
||||
|
@ -169,8 +276,9 @@ def train():
|
|||
|
||||
# prepare tokenizer
|
||||
|
||||
# model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
model_checkpoint = 'google-bert/bert-base-cased'
|
||||
model_checkpoint = "distilbert/distilbert-base-uncased"
|
||||
# model_checkpoint = 'google-bert/bert-base-cased'
|
||||
# model_checkpoint = 'prajjwal1/bert-small'
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", clean_up_tokenization_spaces=True)
|
||||
# Define additional special tokens
|
||||
# additional_special_tokens = ["<DESC>"]
|
||||
|
@ -246,14 +354,15 @@ def train():
|
|||
logging_strategy="epoch",
|
||||
# save_strategy="epoch",
|
||||
load_best_model_at_end=False,
|
||||
learning_rate=1e-4,
|
||||
per_device_train_batch_size=128,
|
||||
per_device_eval_batch_size=128,
|
||||
learning_rate=5e-5,
|
||||
per_device_train_batch_size=64,
|
||||
per_device_eval_batch_size=64,
|
||||
auto_find_batch_size=False,
|
||||
ddp_find_unused_parameters=False,
|
||||
weight_decay=0.01,
|
||||
save_total_limit=1,
|
||||
num_train_epochs=120,
|
||||
num_train_epochs=80,
|
||||
warmup_steps=400,
|
||||
bf16=True,
|
||||
push_to_hub=False,
|
||||
remove_unused_columns=False,
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
# %%
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# %%
|
||||
# Load model and tokenizer
# model_name = "bigscience/bloom-7b1"  # Replace with your model
model_name = "bigscience/bloomz-1b1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",  # take the dtype from the checkpoint instead of forcing one
)

# %%
# Smoke test: run one short generation.
text = "The quick brown fox jumps over the lazy dog."
# Follow the model's device rather than assuming CUDA is present.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Pass the whole encoding so the attention mask goes along with the ids.
outputs = model.generate(**inputs, max_length=50)

# Decode and print result
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
# %%
|
||||
# %%
|
||||
# Prepare input
|
||||
|
||||
def generate(text):
    """Prompt the model for an acronym/description mapping of ``text``.

    Args:
        text (str): Phrase inserted into the prompt.

    Returns:
        str: The decoded first output sequence, special tokens removed.
    """
    # Define prompt
    prompt = f"Answer Concisely: Give me a mapping between the acronym and descriptor in the format '(acronym: description): '{text}'"

    # Follow the model's device rather than assuming CUDA is present.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the whole encoding so the attention mask goes along with the ids.
    outputs = model.generate(
        **inputs,
        max_length=100,
        no_repeat_ngram_size=3,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
|
||||
# Example usage: run the prompt on one sample phrase and show the raw output.
# text = "Advanced Data Analytics Platform"
text = 'ColdFusion Markup Language (CFML)'
acronym = generate(text)
print(f"Acronym: {acronym}")
|
||||
# %%
|
|
@ -0,0 +1,52 @@
|
|||
# %%
|
||||
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
||||
|
||||
# %%
|
||||
# Load model and tokenizer
# model_name = "bigscience/bloom-7b1"  # Replace with your model
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# device_map="auto" lets the library decide where the weights are placed.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",  # take the dtype from the checkpoint instead of forcing one
)

# %%
# Smoke test: run one short generation.
text = "The quick brown fox jumps over the lazy dog."
# Follow the model's device rather than assuming CUDA is present.
inputs = tokenizer(text, return_tensors="pt").to(model.device)

# Pass the whole encoding so the attention mask goes along with the ids.
outputs = model.generate(**inputs, max_length=50)

# Decode and print result
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
# %%
|
||||
# %%
|
||||
# Prepare input
|
||||
|
||||
def generate_acronym(text):
    """Prompt the model for a possible acronym of ``text``.

    Args:
        text (str): Phrase inserted into the prompt.

    Returns:
        str: The decoded first output sequence, special tokens removed.
    """
    # Define prompt
    prompt = f"Answer concisely: make a possible acronym from the following: '{text}'"

    # Follow the model's device rather than assuming CUDA is present.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Pass the whole encoding so the attention mask goes along with the ids.
    outputs = model.generate(
        **inputs,
        max_length=100,
        no_repeat_ngram_size=3,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
|
||||
# %%
|
||||
# Example usage: ask for an acronym of one sample phrase and show the output.
# text = "Advanced Data Analytics Platform"
text = "red hat enterprise linux"
acronym = generate_acronym(text)
print(f"Acronym: {acronym}")
|
||||
# %%
|
Loading…
Reference in New Issue