This repository was archived by the owner on Jul 4, 2023. It is now read-only.
19 changes: 19 additions & 0 deletions bpe.vocab
@@ -0,0 +1,19 @@
#version: 0.2
d e
c o
e n
co de
b u
w i
w h
u b
r o
o k
ok en
o f</w>
l a
i s</w>
en code
e x
c h</w>
T h
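
For context, `bpe.vocab` (and `test_bpe.vocab` below) are subword-nmt codes files: a `#version: 0.2` header followed by one learned merge operation per line, applied in order at segmentation time. A minimal sketch of how such a file is consumed, mirroring what `BPETextTokenizer` does further down (the sample sentence is illustrative):

```python
import codecs
from subword_nmt import apply_bpe

# Load the learned merge operations and build a segmenter.
with codecs.open('bpe.vocab', encoding='UTF-8') as codes:
    bpe = apply_bpe.BPE(codes, separator='@@')

# Splits each word into the learned subword units, marking continuations with '@@'.
print(bpe.segment('encode token'))
```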
1 change: 1 addition & 0 deletions build_tools/travis/install.sh
@@ -27,6 +27,7 @@ pip install -U -r requirements.txt --progress-bar off
pip install spacy --progress-bar off
pip install nltk --progress-bar off
pip install sacremoses --progress-bar off
pip install subword_nmt --progress-bar off
pip install pandas --progress-bar off
pip install requests --progress-bar off

1 change: 1 addition & 0 deletions requirements.txt
@@ -23,6 +23,7 @@ mock
# nltk
# spacy
# sacremoses
# subword-nmt

# Optional CUDA Utilities
# pynvrtc
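
Both hunks add the same optional dependency; a quick illustrative check that it is importable under the module name the new tokenizer uses:

```python
# Illustrative sanity check: the PyPI project is named "subword-nmt" (as in
# requirements.txt) while install.sh pip-installs "subword_nmt"; pip treats the
# hyphen and underscore as equivalent, and the importable module is subword_nmt.
from subword_nmt import learn_bpe, apply_bpe  # noqa: F401
```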
23 changes: 23 additions & 0 deletions test_bpe.vocab
@@ -0,0 +1,23 @@
#version: 0.2
h a
s ;
s; t</w>
p o
po s;t</w>
p a
pa j
paj a
paja m
pajam a
pajama s</w>
o w</w>
o u
o t</w>
n g</w>
m y</w>
l e
le p
i n</w>
b e
a pos;t</w>
& apos;t</w>
63 changes: 63 additions & 0 deletions tests/encoders/text/test_bytepair_encoder.py
@@ -0,0 +1,63 @@
import unittest
import torch
import sys
from torchnlp.encoders.text import BPEEncoder


class TestBPEEncoder(unittest.TestCase):

def setUp(self):
self.corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
'to build a vocabulary. It will be used when strings are encoded ',
'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']

def test_vocab(self):
encoder = BPEEncoder(self.corpus, from_filenames=False)

        # Test that the reserved tokens were added to index_to_token.
self.assertEqual('<pad>', encoder.vocab[0])
self.assertEqual('<unk>', encoder.vocab[1])
self.assertEqual('</s>', encoder.vocab[2])
self.assertEqual('<s>', encoder.vocab[3])
self.assertEqual('<copy>', encoder.vocab[4])

        # Test that some high-frequency subwords are in the vocabulary.
self.assertIn('oken@@', encoder.index_to_token)
self.assertIn('encode@@', encoder.index_to_token)

expect_vocab_size = 57
self.assertEqual(expect_vocab_size, encoder.vocab_size)

def test_encode(self):
        if sys.version_info >= (3, 6):
original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'
encoder = BPEEncoder(self.corpus, from_filenames=False)

            # Expected encoding.
expect = [5, 6, 6, 7, 56, 32, 43, 1, 14, 1, 34, 42, 47, 32, 41, 36, 14, 17,
42, 49, 50, 51, 33, 9, 52, 53, 15, 14, 53, 26, 21, 54, 44, 55, 37]

encode_lst = encoder.encode(original).numpy().tolist()

self.assertListEqual(expect, encode_lst)

def test_decoder(self):
        if sys.version_info >= (3, 6):
encoded = torch.tensor([5, 6, 6, 7, 56, 32, 43, 1, 14, 1, 34, 42, 47, 32,
41, 36, 14, 17, 42, 49, 50, 51, 33, 9, 52, 53, 15,
14, 53, 26, 21, 54, 44, 55, 37])

encoder = BPEEncoder(self.corpus, from_filenames=False)

expect = "This is a coded s<unk> t<unk> ce encoded by the SubwordTextTokenizer."

self.assertEqual(expect, encoder.decode(encoded))

def test_encode_decode(self):
original = "This is a coded sentence encoded by the SubwordTextTokenizer."
expect = "This is a coded s<unk> t<unk> ce encoded by the SubwordTextTokenizer."

encoder = BPEEncoder(self.corpus, from_filenames=False)

decode_encode_str = encoder.decode(encoder.encode(original))
self.assertEqual(expect, decode_encode_str)
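
The tests above double as a usage reference for the new encoder; a condensed sketch of the round trip they exercise (the corpus and sentence here are illustrative):

```python
from torchnlp.encoders.text import BPEEncoder

corpus = ['a small corpus of text from which ',
          'subword merge operations and a vocabulary are learned ']
encoder = BPEEncoder(corpus, from_filenames=False)

tensor = encoder.encode('text to encode')   # 1-D tensor of subword indices
text = encoder.decode(tensor)               # subwords not learned from the corpus come back as <unk>
```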
101 changes: 101 additions & 0 deletions tests/encoders/text/test_bytepair_tokenizer.py
@@ -0,0 +1,101 @@
import unittest
import pickle

from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer


class TestBPETextTokenizer(unittest.TestCase):

def setUp(self):
self.corpus = [
"One morning I shot an elephant in my pajamas. How he got in my pajamas, I don't",
'know.', 'Groucho Marx',
"I haven't slept for 10 days... because that would be too long.", 'Mitch Hedberg'
]

def test_pre_tokenizer(self):
expected = ['One morning I shot an elephant in my pajamas . How he got in my pajamas ,'
' I don &apos;t',
'know .',
'Groucho Marx',
'I haven &apos;t slept for 10 days ... because that would be too long .',
'Mitch Hedberg']

self.assertListEqual(expected, [BPETextTokenizer.pre_tokenize(sen) for sen in self.corpus])

def test_get_vocabulary(self):
# tokenizer = BPETextTokenizer('test_bpe', use_moses=True)
def segment_words(line):
return BPETextTokenizer._segment_words(line, BPETextTokenizer.pre_tokenize)
token_counts = BPETextTokenizer.get_vocabulary(self.corpus,
segment_words, from_filenames=False)
expected = {
"&apos;t": 2,
".": 3,
"...": 1,
"Groucho": 1,
"Marx": 1,
"Mitch": 1,
"Hedberg": 1,
"I": 3,
"in": 2,
"my": 2,
"know": 1,
"because": 1,
"pajamas": 2,
}
self.assertDictContainsSubset(expected, token_counts)

def test_learn_bpe(self):
tokenizer = BPETextTokenizer('test_bpe')
tokenizer.build_from_corpus(self.corpus, from_filenames=False)
expected = {('&', 'apos;t</w>'): 21, ('a', 'pos;t</w>'): 20, ('b', 'e'): 19,
('i', 'n</w>'): 18, ('le', 'p'): 17, ('l', 'e'): 16, ('m', 'y</w>'): 15,
('n', 'g</w>'): 14, ('o', 't</w>'): 13, ('o', 'u'): 12, ('o', 'w</w>'): 11,
('pajama', 's</w>'): 10, ('pajam', 'a'): 9, ('paja', 'm'): 8, ('paj', 'a'): 7,
('pa', 'j'): 6, ('p', 'a'): 5, ('po', 's;t</w>'): 4, ('p', 'o'): 3,
('s;', 't</w>'): 2, ('s', ';'): 1, ('h', 'a'): 0}
self.assertDictEqual(expected, tokenizer.bpe.bpe_codes)

def test_encode_decode(self):
corpus = ['This is a corpus of text that provides a bunch of tokens from which ',
'to build a vocabulary. It will be used when strings are encoded ',
'with a SubwordTextTokenizer subclass. The encoder was coded by a coder.']

original = 'This is a coded sentence encoded by the SubwordTextTokenizer.'

tokenizer = BPETextTokenizer('test_bpe')
tokenizer.build_from_corpus(corpus, from_filenames=False)

# Encoding should be reversible.
encoded = tokenizer.encode(original)
decoded = tokenizer.decode(encoded)
self.assertEqual(original, decoded)

        # The substrings en@@ and code@@ are frequent enough in the corpus that
        # they should appear in the vocabulary even though they are substrings
        # of other included strings.
subtoken_strings = encoded
self.assertIn('en@@', subtoken_strings)
self.assertIn('code@@', subtoken_strings)

def test_build_vocab(self):
tokenizer = BPETextTokenizer('test_bpe')
tokenizer.build_from_corpus(self.corpus, from_filenames=False)

        # Test the full contents of the vocabulary.
expect = {'O@@': 1, 'n@@': 4, 'e': 4, 'm@@': 1, 'o@@': 5, 'r@@': 4, 'i@@': 2,
'ng': 2, 'I': 3, 's@@': 3, 'h@@': 3, 'ot': 2, 'a@@': 4, 'n': 3,
'e@@': 3, 'lep@@': 2, 'ha@@': 3, 't': 3, 'in': 2, 'my': 2,
'pajamas': 2, '.': 4, 'H@@': 2, 'ow': 2, 'g@@': 1, ',': 1, 'd@@': 3,
'&apos;t': 2, 'k@@': 1, 'G@@': 1, 'ou@@': 2, 'c@@': 3, 'o': 2,
'M@@': 2, 'x': 1, 'v@@': 1, 'f@@': 1, 'r': 1, '1@@': 1, '0': 1,
'y@@': 1, 's': 1, '.@@': 2, 'be@@': 2, 'u@@': 1, 't@@': 3,
'w@@': 1, 'l@@': 2, 'd': 1, 'b@@': 1, 'h': 1, 'g': 1}

self.assertDictEqual(expect, tokenizer.vocab)


def test_is_pickleable():
tokenizer = BPETextTokenizer('test_bpe')
pickle.dumps(tokenizer)
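
For readers new to subword-nmt, the `@@` marker that appears throughout these expected values is the continuation separator: every piece of a word except the last carries it, so decoding only needs to strip `'@@ '`. A tiny illustrative example (the tokens are made up):

```python
tokens = ['en@@', 'code@@', 'd']                 # a made-up segmentation of "encoded"
word = ' '.join(tokens).replace('@@ ', '')       # drop the continuation marker
assert word == 'encoded'
```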
4 changes: 3 additions & 1 deletion torchnlp/encoders/text/__init__.py
@@ -21,12 +21,14 @@
from torchnlp.encoders.text.text_encoder import TextEncoder
from torchnlp.encoders.text.treebank_encoder import TreebankEncoder
from torchnlp.encoders.text.whitespace_encoder import WhitespaceEncoder
from torchnlp.encoders.text.bytepair_encoder import BPEEncoder

__all__ = [
'CharacterEncoder', 'DEFAULT_COPY_INDEX', 'DEFAULT_COPY_TOKEN', 'DEFAULT_EOS_INDEX',
'DEFAULT_EOS_TOKEN', 'DEFAULT_PADDING_INDEX', 'DEFAULT_PADDING_TOKEN',
'DEFAULT_RESERVED_TOKENS', 'DEFAULT_SOS_INDEX', 'DEFAULT_SOS_TOKEN', 'DEFAULT_UNKNOWN_INDEX',
'DEFAULT_UNKNOWN_TOKEN', 'DelimiterEncoder', 'MosesEncoder', 'pad_tensor',
'stack_and_pad_tensors', 'TextEncoder', 'SpacyEncoder', 'StaticTokenizerEncoder',
'SubwordEncoder', 'TreebankEncoder', 'WhitespaceEncoder', 'BatchedSequences'
'SubwordEncoder', 'TreebankEncoder', 'WhitespaceEncoder', 'BatchedSequences',
'BPEEncoder'
]
86 changes: 86 additions & 0 deletions torchnlp/encoders/text/bpe_text_tokenizer.py
@@ -0,0 +1,86 @@
import codecs
from subword_nmt import learn_bpe, apply_bpe
from collections import Counter
from sacremoses import MosesTokenizer, MosesDetokenizer


class BPETextTokenizer(object):
_moses_tok = MosesTokenizer(lang='en')
_moses_detok = MosesDetokenizer(lang='en')

def __init__(self, file_prefix=None, separator='@@'):
if file_prefix is not None:
self.codes_file = '{}.vocab'.format(file_prefix)

self.separator = separator
self.bpe = None
self.vocab = None

@staticmethod
def pre_tokenize(line):
return BPETextTokenizer._moses_tok.tokenize(line, return_str=True)

@staticmethod
def _segment_words(line, pre_apply=None):
if pre_apply is not None:
line = pre_apply(line)
line = str(line)
return line.strip('\r\n ').split()

    @staticmethod
    def get_vocabulary(item_list, segment=None, from_filenames=True):
        # Count word frequencies, either from files on disk or from an
        # in-memory list of strings; ``segment`` defaults to whitespace splitting.
        if segment is None:
            segment = BPETextTokenizer._segment_words
        vocab = Counter()
if from_filenames:
for fname in item_list:
with codecs.open(fname, encoding='UTF-8') as f:
for line in f:
for word in segment(line):
vocab[word] += 1
else:
for line in item_list:
for word in segment(line):
vocab[word] += 1
return vocab

    def build_from_corpus(self, item_list, min_count=2, num_symbols=10000,
                          total_symbols=False, from_filenames=True):
        def segment_words(line):
            return self._segment_words(line, self.pre_tokenize)

        # Count word frequencies over the pre-tokenized corpus.
        vocab_words = self.get_vocabulary(item_list, segment_words, from_filenames=from_filenames)

        vocab_list = ['{0} {1}'.format(key, freq)
                      for (key, freq) in vocab_words.items()]

        # Learn the BPE merge operations and write them to the codes file.
        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
            learn_bpe.learn_bpe(vocab_list, output, num_symbols=num_symbols,
                                min_frequency=min_count, verbose=False,
                                is_dict=True, total_symbols=total_symbols)

        # Reload the codes file to build the segmenter.
        with codecs.open(self.codes_file, encoding='UTF-8') as codes:
            self.bpe = apply_bpe.BPE(codes, separator=self.separator)

        # The final vocabulary maps each BPE subword unit to its corpus frequency.
        self.vocab = dict(self.get_vocabulary(item_list=item_list, segment=self.segment,
                                              from_filenames=from_filenames))

    def segment(self, line):
        # ``self.bpe`` is only set by ``build_from_corpus``, so check for None
        # rather than for attribute existence.
        if self.bpe is None:
            raise NameError('Learn bpe first!')
        line = self.pre_tokenize(line)
        return self.bpe.segment(line.strip('\r\n ')).split(' ')

def encode(self, raw_text):
return self.segment(raw_text)

    def decode(self, bpe_text, delimiter=' '):
        decode_string = delimiter.join(bpe_text)
        try:
            # Python 2 compatibility: byte strings are decoded to unicode.
            decode_string = decode_string.decode('utf-8')
        except Exception:
            pass
        # Drop the continuation separator so subword pieces merge back into words,
        # then let Moses detokenization restore the original spacing.
        decode_string = decode_string \
            .replace(self.separator + ' ', '') \
            .replace(self.separator, '')
        decode_string = str(decode_string).strip('\r\n ').split()
        decode_string = self._moses_detok.tokenize(decode_string)
        return decode_string
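
A minimal end-to-end sketch of the tokenizer, assuming an illustrative file prefix (`demo`, so the learned codes are written to `demo.vocab` in the working directory) and a toy corpus:

```python
from torchnlp.encoders.text.bpe_text_tokenizer import BPETextTokenizer

tokenizer = BPETextTokenizer('demo')              # merge operations are written to demo.vocab
tokenizer.build_from_corpus(['some text to learn merges from ',
                             'more text with some repeated words '],
                            from_filenames=False)

pieces = tokenizer.encode('some text to segment')  # list of subword strings, continuations marked with '@@'
print(tokenizer.decode(pieces))                     # merges the pieces back into the original string
```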