forked from azk0019/CourseProject
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtrain.py
More file actions
94 lines (73 loc) · 3.09 KB
/
train.py
File metadata and controls
94 lines (73 loc) · 3.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# importing necessary packages
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import json
from emot.emo_unicode import UNICODE_EMO
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
# Initialize variables
# --- Configuration for the sarcasm-detection training run ---
DATA_INPUT = './data/train.jsonl'   # JSONL file; one record per line with 'label' and 'response' keys
EARLY_STOP = True                   # enable simpletransformers early stopping
EARLY_STOP_DELTA = 0.01             # minimum metric improvement to reset early-stopping patience
OVERWRITE = True                    # overwrite an existing output directory
EPOCHS = 3                          # number of training epochs
BATCH_SIZE = 100                    # training batch size
LEARNING_RATE = 0.00004             # optimizer learning rate (4e-5, a common BERT fine-tuning value)
MODEL_TYPE = 'bert'                 # simpletransformers model family
MODEL_BASE = 'bert-base-cased'      # pretrained HuggingFace checkpoint
OUTPUT = 'outputs/'                 # directory where the trained model is written
word_dist = []  # NOTE(review): never used in this file — candidate for removal
train = []      # module-level accumulator filled by bert_training() with parsed JSONL records
test = []       # NOTE(review): never used in this file — candidate for removal
def convert_emojis(text):
    """Replace every emoji in *text* with an underscore-joined name.

    Each emoji character found in the ``emot`` package's UNICODE_EMO
    mapping is substituted with its textual description, with commas and
    colons stripped and internal whitespace collapsed to underscores.
    """
    for symbol, description in UNICODE_EMO.items():
        cleaned = description.replace(",", "").replace(":", "")
        text = text.replace(symbol, "_".join(cleaned.split()))
    return text
# Train BERT model
def bert_training(model_type, model_base, train_data, early_stop,
                  early_stop_delta, overwrite, epoch, batch_size,
                  learning_rate, output):
    """Train and evaluate a BERT sarcasm classifier from a JSONL dataset.

    Parameters:
        model_type (str): simpletransformers model family, e.g. 'bert'.
        model_base (str): pretrained checkpoint, e.g. 'bert-base-cased'.
        train_data (str): path to a JSONL file whose records contain
            'label' ('SARCASM' or not) and 'response' (text) keys.
        early_stop (bool): enable early stopping.
        early_stop_delta (float): minimum improvement for early stopping.
        overwrite (bool): overwrite an existing output directory.
        epoch (int): number of training epochs.
        batch_size (int): training batch size.
        learning_rate (float): optimizer learning rate.
        output (str): directory where model outputs are written.

    Returns:
        The result of ``model.eval_model`` on the held-out split
        (previously discarded; returning it is backward-compatible).
    """
    # Parse the JSONL file into a LOCAL list. The original code appended
    # into the module-level `train` list, so calling this function twice
    # silently duplicated every training record.
    records = []
    with open(train_data, 'r') as json_file:
        for line in json_file:
            if line.strip():  # tolerate blank lines
                records.append(json.loads(line))

    # Data cleaning: drop stopwords and turn emojis into text tokens.
    train_labels = [rec['label'] for rec in records]
    train_response = [remove_stopwords(convert_emojis(rec['response']))
                      for rec in records]

    # 80/20 train/eval split with a fixed seed for reproducibility.
    labels_train, labels_test, response_train, response_test = train_test_split(
        train_labels, train_response, test_size=0.2, random_state=42)

    # Convert SARCASM / non-SARCASM string labels into 1s and 0s.
    labels_train_pd = (pd.DataFrame(labels_train) == 'SARCASM').astype(int)
    labels_test_pd = (pd.DataFrame(labels_test) == 'SARCASM').astype(int)
    response_train_pd = pd.DataFrame(response_train)
    response_test_pd = pd.DataFrame(response_test)

    # simpletransformers expects 'text'/'label' columns; flatten any
    # literal '\n' sequences left in the responses into spaces.
    train_bert = pd.DataFrame({
        'text': response_train_pd[0].replace(r'\n', ' ', regex=True),
        'label': labels_train_pd[0]
    })
    eval_bert = pd.DataFrame({
        'text': response_test_pd[0].replace(r'\n', ' ', regex=True),
        'label': labels_test_pd[0]
    })

    # Map the function arguments onto simpletransformers' config object.
    model_args = ClassificationArgs()
    model_args.use_early_stopping = early_stop
    model_args.early_stopping_delta = early_stop_delta
    model_args.overwrite_output_dir = overwrite
    model_args.num_train_epochs = epoch
    model_args.train_batch_size = batch_size
    model_args.learning_rate = learning_rate
    model_args.output_dir = output

    # Create a TransformerModel (CPU-only; flip use_cuda for a GPU box).
    model = ClassificationModel(model_type, model_base, use_cuda=False,
                                args=model_args)
    # Train, then evaluate on the held-out split.
    model.train_model(train_bert)
    return model.eval_model(eval_bert)
# Run training only when executed as a script, so importing this module
# (e.g. to reuse convert_emojis) does not kick off a full training run.
if __name__ == '__main__':
    bert_training(MODEL_TYPE, MODEL_BASE, DATA_INPUT, EARLY_STOP,
                  EARLY_STOP_DELTA, OVERWRITE, EPOCHS, BATCH_SIZE,
                  LEARNING_RATE, OUTPUT)