from tokenizers import Tokenizer
from tokenizers.models import BPE
from transformers import PreTrainedTokenizerFast


class gLM2Tokenizer(PreTrainedTokenizerFast):
    # Character-level vocabulary: special tokens, amino acids (uppercase),
    # nucleotides (lowercase), and strand-direction tokens <+> / <->.
    VOCAB = [
        "<cls>", "<pad>", "<eos>", "<unk>",
        "L", "A", "G", "V", "S", "E", "R", "T", "I", "D", "P", "K",
        "Q", "N", "F", "Y", "M", "H", "W", "C", "X", "B", "U", "Z",
        "O", "a", "t", "c", "g", "<+>", "<->", "<mask>", "<sep>",
    ]

    def __init__(
        self,
        unk_token="<unk>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        eos_token="<eos>",
        sep_token="<sep>",
        pos_token="<+>",
        neg_token="<->",
        **kwargs,
    ):
        # Build a BPE model with an empty merge list, so every vocabulary
        # entry is emitted as a single token (character-level tokenization).
        all_tokens = self.VOCAB
        token_to_id = {tok: ind for ind, tok in enumerate(all_tokens)}
        bpe = BPE(token_to_id, merges=[], unk_token=str(unk_token))
        tokenizer = Tokenizer(bpe)
        # Register special tokens on the underlying tokenizer so they are
        # matched atomically in input text; the strand tokens <+> and <->
        # are included here even though they are not named special tokens
        # on the Hugging Face wrapper below.
        special_tokens = [cls_token, pad_token, mask_token,
                          eos_token, sep_token, pos_token, neg_token]
        tokenizer.add_special_tokens(special_tokens)
        super().__init__(
            tokenizer_object=tokenizer,
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            sep_token=sep_token,
            **kwargs,
        )
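

# Minimal usage sketch (assumption: the example input below is illustrative and
# not taken from the source). Special tokens such as <+> and <sep> are matched
# as whole units; amino acids and nucleotides tokenize to single characters.
if __name__ == "__main__":
    tokenizer = gLM2Tokenizer()
    encoding = tokenizer("<+>MKTAYIAKQR<sep>atgca")
    print(tokenizer.convert_ids_to_tokens(encoding["input_ids"]))
    # No post-processor is configured, so no <cls>/<eos> are added automatically:
    # ['<+>', 'M', 'K', 'T', ..., '<sep>', 'a', 't', 'g', 'c', 'a']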