# mhs_imports.py

import collections
import torch
import torch.nn as nn
import torch.optim as optim
from fastai import *
from fastai.text import *
from fastai.callbacks import *
import transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
import pandas as pd
import numpy as np
from unidecode import unidecode
import re
# Tokenizer for roberta-large; shared by the model wrapper and the fastai tokenizer below
transformer_tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

# Adapted from: https://www.kaggle.com/code/maroberti/fastai-with-transformers-bert-roberta/notebook
# Define model wrapper for FAI learner
class CustomTransformerModel(nn.Module):
    def __init__(self, transformer_model: PreTrainedModel):
        super(CustomTransformerModel, self).__init__()
        self.transformer = transformer_model

    def forward(self, input_ids, attention_mask=None):
        # attention_mask:
        # Mask to avoid performing attention on padding token indices.
        # Mask values selected in ``[0, 1]``:
        # ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
        attention_mask = (input_ids != transformer_tokenizer.pad_token_id).type(input_ids.type())
        logits = self.transformer(input_ids, attention_mask=attention_mask)[0]
        return logits
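
# A minimal construction sketch (not part of the original module): how the wrapper
# above would typically be built around a RobertaForSequenceClassification head
# before being handed to a fastai Learner. The function name and num_labels value
# are illustrative assumptions; the production model's label count may differ.
def _example_build_custom_model(num_labels: int = 2) -> CustomTransformerModel:
    config = RobertaConfig.from_pretrained("roberta-large")
    config.num_labels = num_labels
    roberta = RobertaForSequenceClassification.from_pretrained("roberta-large", config=config)
    return CustomTransformerModel(transformer_model=roberta)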

# Define adaptation of FAI Vocab class based on the transformers tokenizer
class TransformersVocab(Vocab):
    def __init__(self, tokenizer: PreTrainedTokenizer):
        super(TransformersVocab, self).__init__(itos=[])
        self.tokenizer = tokenizer

    def numericalize(self, t: Collection[str]) -> List[int]:
        return self.tokenizer.convert_tokens_to_ids(t)

    def textify(self, nums: Collection[int], sep=' ') -> List[str]:
        nums = np.array(nums).tolist()
        return sep.join(self.tokenizer.convert_ids_to_tokens(nums)) if sep is not None \
            else self.tokenizer.convert_ids_to_tokens(nums)

    def __getstate__(self):
        return {'itos': self.itos, 'tokenizer': self.tokenizer}

    def __setstate__(self, state: dict):
        self.itos = state['itos']
        self.tokenizer = state['tokenizer']
        self.stoi = collections.defaultdict(int, {v: k for k, v in enumerate(self.itos)})
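
# A small round-trip sketch (not part of the original module): exercises
# numericalize/textify together with the pickle hooks above, which rebuild stoi
# from itos on load. Assumes the underlying RobertaTokenizer pickles cleanly.
def _example_vocab_roundtrip(text: str = "hello world"):
    import pickle
    vocab = TransformersVocab(tokenizer=transformer_tokenizer)
    restored = pickle.loads(pickle.dumps(vocab))
    ids = restored.numericalize(transformer_tokenizer.tokenize(text))
    return restored.textify(ids)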

# Define Tokenizer based on FAI BaseTokenizer class for transformers
class TransformersBaseTokenizer(BaseTokenizer):
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type='roberta', **kwargs):
        self._pretrained_tokenizer = pretrained_tokenizer
        self.max_seq_len = 128  # stop sequence analysis at 128 tokens
        self.model_type = model_type

    def __call__(self, *args, **kwargs):
        return self

    def tokenizer(self, t: str) -> List[str]:
        # Truncate to max_seq_len - 2 to leave room for the CLS and SEP special tokens
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        tokens = self._pretrained_tokenizer.tokenize(t)[:self.max_seq_len - 2]
        tokens = [CLS] + tokens + [SEP]
        return tokens
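
# A wiring sketch (following the Kaggle notebook cited above, not code from this
# module): how TransformersVocab and TransformersBaseTokenizer are typically
# combined into fastai v1 processors when building the training DataBunch.
# The function and variable names are illustrative.
def _example_build_processors():
    transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer=transformer_tokenizer,
                                                           model_type='roberta')
    fastai_tokenizer = Tokenizer(tok_func=transformer_base_tokenizer, pre_rules=[], post_rules=[])
    transformer_vocab = TransformersVocab(tokenizer=transformer_tokenizer)
    numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)
    tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer,
                                           include_bos=False, include_eos=False)
    return [tokenize_processor, numericalize_processor]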

# Initialize predictor from loaded model
class MHSPredictor:
    def __init__(self, model_path: str = "production.pkl"):
        # Tokenizer pipeline matching the training setup (constructed here but not stored on self)
        transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer=transformer_tokenizer,
                                                               model_type='roberta')
        fastai_tokenizer = Tokenizer(tok_func=transformer_base_tokenizer, pre_rules=[], post_rules=[])
        # Load the exported fastai learner; with path "", model_path is resolved
        # relative to the working directory
        self.model = load_learner("", model_path)

    def predict(self, text: str):
        return self.model.predict(self.__preprocess(text))

    def __preprocess(self, text: str):
        # Preprocessing routines: normalize mentions, URLs, whitespace, and unicode
        text = re.sub(r'@[^ ]+', '@USER', text)             # replace @mentions with @USER
        text = re.sub(r'https?:\/\/[^ ]+', '<HTTP>', text)  # replace URLs with <HTTP>
        text = re.sub(r'(?:\n|\r)', ' ', text)              # flatten newlines
        text = re.sub(r" +", ' ', text)                     # collapse repeated spaces
        text = re.sub('(?: $|^ )', '', text)                # trim a leading/trailing space
        return unidecode(text)                              # transliterate to closest ASCII
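

# A minimal usage sketch (assumes "production.pkl" sits in the working directory;
# the example text is illustrative only). fastai's predict returns the predicted
# category, the class index tensor, and the probability tensor.
if __name__ == "__main__":
    predictor = MHSPredictor(model_path="production.pkl")
    category, _, probs = predictor.predict("Check this out @someone https://example.com")
    print(category, probs)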