This repository was archived by the owner on Jul 4, 2023. It is now read-only.
19 changes: 19 additions & 0 deletions bpe.vocab
@@ -0,0 +1,19 @@
#version: 0.2
d e
c o
e n
co de
b u
w i
w h
u b
r o
o k
ok en
o f</w>
l a
i s</w>
en code
e x
c h</w>
T h
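
For context, `bpe.vocab` (and `test_bpe.vocab` below) are subword-nmt codes files: a `#version: 0.2` header followed by one learned merge operation per line, applied in order at segmentation time. A minimal sketch of how such a file is consumed, mirroring what `BPETextTokenizer` does further down (the sample sentence is illustrative):

```python
import codecs
from subword_nmt import apply_bpe

# Load the learned merge operations and build a segmenter.
with codecs.open('bpe.vocab', encoding='UTF-8') as codes:
    bpe = apply_bpe.BPE(codes, separator='@@')

# Splits each word into the learned subword units, marking continuations with '@@'.
print(bpe.segment('encode token'))
```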
1 change: 1 addition & 0 deletions build_tools/travis/install.sh
@@ -27,6 +27,7 @@ pip install -U -r requirements.txt --progress-bar off
pip install spacy --progress-bar off
pip install nltk --progress-bar off
pip install sacremoses --progress-bar off
pip install subword_nmt --progress-bar off
pip install pandas --progress-bar off
pip install requests --progress-bar off

1 change: 1 addition & 0 deletions requirements.txt
@@ -23,6 +23,7 @@ mock
# nltk
# spacy
# sacremoses
# subword-nmt

# Optional CUDA Utilities
# pynvrtc
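
Both hunks add the same optional dependency; a quick illustrative check that it is importable under the module name the new tokenizer uses:

```python
# Illustrative sanity check: the PyPI project is named "subword-nmt" (as in
# requirements.txt) while install.sh pip-installs "subword_nmt"; pip treats the
# hyphen and underscore as equivalent, and the importable module is subword_nmt.
from subword_nmt import learn_bpe, apply_bpe  # noqa: F401
```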
23 changes: 23 additions & 0 deletions test_bpe.vocab
@@ -0,0 +1,23 @@
#version: 0.2
h a
s ;
s; t</w>
p o
po s;t</w>
p a
pa j
paj a
paja m
pajam a
pajama s</w>
o w</w>
o u
o t</w>
n g</w>
m y</w>
l e
le p
i n</w>
b e
a pos;t</w>
& apos;t</w>
63 changes: 63 additions & 0 deletions tests/encoders/text/test_bytepair_encoder.py
@@ -0,0 +1,63 @@
import unittest
import torch
import sys
from torchnlp.encoders.text import BPEEncoder


class TestBPEEncoder(unittest.TestCase):

def setUp(self):
self.corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
'to build a vocabulary. It will be used when strings are encoded ',
'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']

def test_vocab(self):
encoder = BPEEncoder(self.corpus, from_filenames=False)

        # Test that the reserved tokens were added to index_to_token.
self.assertEqual('<pad>', encoder.vocab[0])
self.assertEqual('<unk>', encoder.vocab[1])
self.assertEqual('</s>', encoder.vocab[2])
self.assertEqual('<s>', encoder.vocab[3])
self.assertEqual('<copy>', encoder.vocab[4])

        # Test that some high-frequency subwords are in the vocabulary.
self.assertIn('oken@@', encoder.index_to_token)
self.assertIn('encode@@', encoder.index_to_token)

expect_vocab_size = 57
self.assertEqual(expect_vocab_size, encoder.vocab_size)

def test_encode(self):
        if sys.version_info >= (3, 6):
original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'
encoder = BPEEncoder(self.corpus, from_filenames=False)

            # Expected encoding.
expect = [5, 6, 6, 7, 56, 32, 43, 1, 14, 1, 34, 42, 47, 32, 41, 36, 14, 17,
42, 49, 50, 51, 33, 9, 52, 53, 15, 14, 53, 26, 21, 54, 44, 55, 37]

encode_lst = encoder.encode(original).numpy().tolist()

self.assertListEqual(expect, encode_lst)

def test_decoder(self):
        if sys.version_info >= (3, 6):
encoded = torch.tensor([5, 6, 6, 7, 56, 32, 43, 1, 14, 1, 34, 42, 47, 32,
41, 36, 14, 17, 42, 49, 50, 51, 33, 9, 52, 53, 15,
14, 53, 26, 21, 54, 44, 55, 37])

encoder = BPEEncoder(self.corpus, from_filenames=False)

expect = "This is a coded s<unk> t<unk> ce encoded by the SubwordTextTokenizer."

self.assertEqual(expect, encoder.decode(encoded))

def test_encode_decode(self):
original = "This is a coded sentence encoded by the SubwordTextTokenizer."
expect = "This is a coded s<unk> t<unk> ce encoded by the SubwordTextTokenizer."

encoder = BPEEncoder(self.corpus, from_filenames=False)

decode_encode_str = encoder.decode(encoder.encode(original))
self.assertEqual(expect, decode_encode_str)
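
The tests above double as a usage reference for the new encoder; a condensed sketch of the round trip they exercise (the corpus and sentence here are illustrative):

```python
from torchnlp.encoders.text import BPEEncoder

corpus = ['a small corpus of text from which ',
          'subword merge operations and a vocabulary are learned ']
encoder = BPEEncoder(corpus, from_filenames=False)

tensor = encoder.encode('text to encode')   # 1-D tensor of subword indices
text = encoder.decode(tensor)               # subwords not learned from the corpus come back as <unk>
```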
101 changes: 101 additions & 0 deletions tests/encoders/text/test_bytepair_tokenizer.py
@@ -0,0 +1,101 @@
import unittest
import pickle

from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer


class TestBPETextTokenizer(unittest.TestCase):

def setUp(self):
self.corpus = [
"One morning I shot an elephant in my pajamas. How he got in my pajamas, I don't",
'know.', 'Groucho Marx',
"I haven't slept for 10 days... because that would be too long.", 'Mitch Hedberg'
]

def test_pre_tokenizer(self):
expected = ['One morning I shot an elephant in my pajamas . How he got in my pajamas ,'
' I don &apos;t',
'know .',
'Groucho Marx',
'I haven &apos;t slept for 10 days ... because that would be too long .',
'Mitch Hedberg']

self.assertListEqual(expected, [BPETextTokenizer.pre_tokenize(sen) for sen in self.corpus])

def test_get_vocabulary(self):
# tokenizer = BPETextTokenizer('test_bpe', use_moses=True)
def segment_words(line):
return BPETextTokenizer._segment_words(line, BPETextTokenizer.pre_tokenize)
token_counts = BPETextTokenizer.get_vocabulary(self.corpus,
segment_words, from_filenames=False)
expected = {
"&apos;t": 2,
".": 3,
"...": 1,
"Groucho": 1,
"Marx": 1,
"Mitch": 1,
"Hedberg": 1,
"I": 3,
"in": 2,
"my": 2,
"know": 1,
"because": 1,
"pajamas": 2,
}
self.assertDictContainsSubset(expected, token_counts)

def test_learn_bpe(self):
tokenizer = BPETextTokenizer('test_bpe')
tokenizer.build_from_corpus(self.corpus, from_filenames=False)
expected = {('&', 'apos;t</w>'): 21, ('a', 'pos;t</w>'): 20, ('b', 'e'): 19,
('i', 'n</w>'): 18, ('le', 'p'): 17, ('l', 'e'): 16, ('m', 'y</w>'): 15,
('n', 'g</w>'): 14, ('o', 't</w>'): 13, ('o', 'u'): 12, ('o', 'w</w>'): 11,
('pajama', 's</w>'): 10, ('pajam', 'a'): 9, ('paja', 'm'): 8, ('paj', 'a'): 7,
('pa', 'j'): 6, ('p', 'a'): 5, ('po', 's;t</w>'): 4, ('p', 'o'): 3,
('s;', 't</w>'): 2, ('s', ';'): 1, ('h', 'a'): 0}
self.assertDictEqual(expected, tokenizer.bpe.bpe_codes)

def test_encode_decode(self):
corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
'to build a vocabulary. It will be used when strings are encoded ',
'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']

original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'

tokenizer = BPETextTokenizer('test_bpe')
tokenizer.build_from_corpus(corpus, from_filenames=False)

# Encoding should be reversible.
encoded = tokenizer.encode(original)
decoded = tokenizer.decode(encoded)
self.assertEqual(original, decoded)

        # The substrings en@@ and code@@ are frequent enough in the corpus that
        # they should appear in the vocabulary even though they are substrings
        # of other included strings.
subtoken_strings = encoded
self.assertIn('en@@', subtoken_strings)
self.assertIn('code@@', subtoken_strings)

def test_build_vocab(self):
tokenizer = BPETextTokenizer('test_bpe')
tokenizer.build_from_corpus(self.corpus, from_filenames=False)

        # Test the full contents of the vocabulary.
expect = {'O@@': 1, 'n@@': 4, 'e': 4, 'm@@': 1, 'o@@': 5, 'r@@': 4, 'i@@': 2,
'ng': 2, 'I': 3, 's@@': 3, 'h@@': 3, 'ot': 2, 'a@@': 4, 'n': 3,
'e@@': 3, 'lep@@': 2, 'ha@@': 3, 't': 3, 'in': 2, 'my': 2,
'pajamas': 2, '.': 4, 'H@@': 2, 'ow': 2, 'g@@': 1, ',': 1, 'd@@': 3,
'&apos;t': 2, 'k@@': 1, 'G@@': 1, 'ou@@': 2, 'c@@': 3, 'o': 2,
'M@@': 2, 'x': 1, 'v@@': 1, 'f@@': 1, 'r': 1, '1@@': 1, '0': 1,
'y@@': 1, 's': 1, '.@@': 2, 'be@@': 2, 'u@@': 1, 't@@': 3,
'w@@': 1, 'l@@': 2, 'd': 1, 'b@@': 1, 'h': 1, 'g': 1}

self.assertDictEqual(expect, tokenizer.vocab)


def test_is_pickleable():
tokenizer = BPETextTokenizer('test_bpe')
pickle.dumps(tokenizer)
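
For readers new to subword-nmt, the `@@` marker that appears throughout these expected values is the continuation separator: every piece of a word except the last carries it, so decoding only needs to strip `'@@ '`. A tiny illustrative example (the tokens are made up):

```python
tokens = ['en@@', 'code@@', 'd']                 # a made-up segmentation of "encoded"
word = ' '.join(tokens).replace('@@ ', '')       # drop the continuation marker
assert word == 'encoded'
```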
4 changes: 3 additions & 1 deletion torchnlp/encoders/text/__init__.py
@@ -21,12 +21,14 @@
from torchnlp.encoders.text.text_encoder import TextEncoder
from torchnlp.encoders.text.treebank_encoder import TreebankEncoder
from torchnlp.encoders.text.whitespace_encoder import WhitespaceEncoder
from torchnlp.encoders.text.bytepair_encoder import BPEEncoder

__all__ = [
'CharacterEncoder', 'DEFAULT_COPY_INDEX', 'DEFAULT_COPY_TOKEN', 'DEFAULT_EOS_INDEX',
'DEFAULT_EOS_TOKEN', 'DEFAULT_PADDING_INDEX', 'DEFAULT_PADDING_TOKEN',
'DEFAULT_RESERVED_TOKENS', 'DEFAULT_SOS_INDEX', 'DEFAULT_SOS_TOKEN', 'DEFAULT_UNKNOWN_INDEX',
'DEFAULT_UNKNOWN_TOKEN', 'DelimiterEncoder', 'MosesEncoder', 'pad_tensor',
'stack_and_pad_tensors', 'TextEncoder', 'SpacyEncoder', 'StaticTokenizerEncoder',
'SubwordEncoder', 'TreebankEncoder', 'WhitespaceEncoder', 'BatchedSequences'
'SubwordEncoder', 'TreebankEncoder', 'WhitespaceEncoder', 'BatchedSequences',
'BPEEncoder'
]
86 changes: 86 additions & 0 deletions torchnlp/encoders/text/bpe_text_tokenizer.py
@@ -0,0 +1,86 @@
import codecs
from subword_nmt import learn_bpe, apply_bpe
from collections import Counter
from sacremoses import MosesTokenizer, MosesDetokenizer


class BPETextTokenizer(object):
_moses_tok = MosesTokenizer(lang='en')
_moses_detok = MosesDetokenizer(lang='en')

def __init__(self, file_prefix=None, separator='@@'):
if file_prefix is not None:
self.codes_file = '{}.vocab'.format(file_prefix)

self.separator = separator
self.bpe = None
self.vocab = None

@staticmethod
def pre_tokenize(line):
return BPETextTokenizer._moses_tok.tokenize(line, return_str=True)

@staticmethod
def _segment_words(line, pre_apply=None):
if pre_apply is not None:
line = pre_apply(line)
line = str(line)
return line.strip('\r\n ').split()

    @staticmethod
    def get_vocabulary(item_list, segment=None, from_filenames=True):
        # Count word frequencies, either from files on disk or from an
        # in-memory list of strings; ``segment`` defaults to whitespace splitting.
        if segment is None:
            segment = BPETextTokenizer._segment_words
        vocab = Counter()
if from_filenames:
for fname in item_list:
with codecs.open(fname, encoding='UTF-8') as f:
for line in f:
for word in segment(line):
vocab[word] += 1
else:
for line in item_list:
for word in segment(line):
vocab[word] += 1
return vocab

    def build_from_corpus(self, item_list, min_count=2, num_symbols=10000,
                          total_symbols=False, from_filenames=True):
        def segment_words(line):
            return self._segment_words(line, self.pre_tokenize)

        # Count word frequencies over the pre-tokenized corpus.
        vocab_words = self.get_vocabulary(item_list, segment_words, from_filenames=from_filenames)

        vocab_list = ['{0} {1}'.format(key, freq)
                      for (key, freq) in vocab_words.items()]

        # Learn the BPE merge operations and write them to the codes file.
        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
            learn_bpe.learn_bpe(vocab_list, output, num_symbols=num_symbols,
                                min_frequency=min_count, verbose=False,
                                is_dict=True, total_symbols=total_symbols)

        # Reload the codes file to build the segmenter.
        with codecs.open(self.codes_file, encoding='UTF-8') as codes:
            self.bpe = apply_bpe.BPE(codes, separator=self.separator)

        # The final vocabulary maps each BPE subword unit to its corpus frequency.
        self.vocab = dict(self.get_vocabulary(item_list=item_list, segment=self.segment,
                                              from_filenames=from_filenames))

    def segment(self, line):
        # ``self.bpe`` is only set by ``build_from_corpus``, so check for None
        # rather than for attribute existence.
        if self.bpe is None:
            raise NameError('Learn bpe first!')
        line = self.pre_tokenize(line)
        return self.bpe.segment(line.strip('\r\n ')).split(' ')

def encode(self, raw_text):
return self.segment(raw_text)

    def decode(self, bpe_text, delimiter=' '):
        decode_string = delimiter.join(bpe_text)
        try:
            # Python 2 compatibility: byte strings are decoded to unicode.
            decode_string = decode_string.decode('utf-8')
        except Exception:
            pass
        # Drop the continuation separator so subword pieces merge back into words,
        # then let Moses detokenization restore the original spacing.
        decode_string = decode_string \
            .replace(self.separator + ' ', '') \
            .replace(self.separator, '')
        decode_string = str(decode_string).strip('\r\n ').split()
        decode_string = self._moses_detok.tokenize(decode_string)
        return decode_string
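
A minimal end-to-end sketch of the tokenizer, assuming an illustrative file prefix (`demo`, so the learned codes are written to `demo.vocab` in the working directory) and a toy corpus:

```python
from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer

tokenizer = BPETextTokenizer('demo')              # merge operations are written to demo.vocab
tokenizer.build_from_corpus(['some text to learn merges from ',
                             'more text with some repeated words '],
                            from_filenames=False)

pieces = tokenizer.encode('some text to segment')  # list of subword strings, continuations marked with '@@'
print(tokenizer.decode(pieces))                     # merges the pieces back into the original string
```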