
Commit ab9683f

Add SpanBERT module (#300)
* Add SpanBERT module
* Clean the code
* Fix CI
* Fix CI
* Resolve comments
1 parent 931ead9 commit ab9683f

4 files changed: +199 -11 lines changed

texar/torch/data/tokenizers/bert_tokenizer.py

Lines changed: 22 additions & 0 deletions
@@ -26,6 +26,7 @@
 from texar.torch.data.tokenizers.tokenizer_base import TokenizerBase
 from texar.torch.data.tokenizers.bert_tokenizer_utils import \
     load_vocab, BasicTokenizer, WordpieceTokenizer
+from texar.torch.hyperparams import HParams
 from texar.torch.utils.utils import truncate_seq_pair
 
 __all__ = [
@@ -74,6 +75,10 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase):
         'scibert-scivocab-cased': 512,
         'scibert-basevocab-uncased': 512,
         'scibert-basevocab-cased': 512,
+
+        # SpanBERT
+        'spanbert-base-cased': 512,
+        'spanbert-large-cased': 512,
     }
     _VOCAB_FILE_NAMES = {'vocab_file': 'vocab.txt'}
     _VOCAB_FILE_MAP = {
@@ -98,13 +103,30 @@ class BERTTokenizer(PretrainedBERTMixin, TokenizerBase):
             'scibert-scivocab-cased': 'vocab.txt',
             'scibert-basevocab-uncased': 'vocab.txt',
             'scibert-basevocab-cased': 'vocab.txt',
+
+            # SpanBERT
+            'spanbert-base-cased': 'vocab.txt',
+            'spanbert-large-cased': 'vocab.txt',
         }
     }
 
     def __init__(self,
                  pretrained_model_name: Optional[str] = None,
                  cache_dir: Optional[str] = None,
                  hparams=None):
+
+        # SpanBERT checkpoints do not include a vocabulary file; fall back to
+        # the standard BERT vocabulary when a pre-trained SpanBERT is used.
+        if pretrained_model_name is not None:
+            if pretrained_model_name.startswith('spanbert'):
+                pretrained_model_name = pretrained_model_name.lstrip('span')
+        elif hparams is not None:
+            hparams = HParams(hparams, None)
+            if hparams.pretrained_model_name is not None and \
+                    hparams.pretrained_model_name.startswith('spanbert'):
+                pretrained_model_name = \
+                    hparams.pretrained_model_name.lstrip('span')
+
         self.load_pretrained_config(pretrained_model_name, cache_dir, hparams)
 
         super().__init__(hparams=None)
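
The tokenizer change above only redirects the vocabulary lookup; nothing else about tokenization differs for SpanBERT. A minimal usage sketch of the two construction paths touched by this hunk (assuming `BERTTokenizer` is re-exported from `texar.torch.data`; otherwise import it from `texar.torch.data.tokenizers.bert_tokenizer`):

from texar.torch.data import BERTTokenizer

# Both constructions fall back to the standard cased BERT vocabulary,
# because SpanBERT checkpoints ship without a vocab.txt of their own.
tok_base = BERTTokenizer(pretrained_model_name='spanbert-base-cased')
tok_large = BERTTokenizer(
    hparams={'pretrained_model_name': 'spanbert-large-cased'})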

texar/torch/modules/encoders/bert_encoder.py

Lines changed: 12 additions & 4 deletions
@@ -75,9 +75,14 @@ def __init__(self,
         # Segment embedding for each type of tokens
         self.segment_embedder = None
         if self._hparams.get('type_vocab_size', 0) > 0:
-            self.segment_embedder = WordEmbedder(
-                vocab_size=self._hparams.type_vocab_size,
-                hparams=self._hparams.segment_embed)
+            if self.pretrained_model_name is not None and \
+                    self.pretrained_model_name.startswith('spanbert'):
+                # Do not construct segment_embedder for SpanBERT
+                pass
+            else:
+                self.segment_embedder = WordEmbedder(
+                    vocab_size=self._hparams.type_vocab_size,
+                    hparams=self._hparams.segment_embed)
 
         # Position embedding
         self.position_embedder = PositionEmbedder(
@@ -289,7 +294,10 @@ def forward(self, # type: ignore
                 inputs: Union[torch.Tensor, torch.LongTensor],
                 sequence_length: Optional[torch.LongTensor] = None,
                 segment_ids: Optional[torch.LongTensor] = None):
-        r"""Encodes the inputs.
+        r"""Encodes the inputs. Note that the SpanBERT model does not use
+        segment embeddings. As a result, SpanBERT does not require
+        `segment_ids` as an input when using pre-trained SpanBERT checkpoint
+        files.
 
         Args:
             inputs: Either a **2D Tensor** of shape `[batch_size, max_time]`,
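
Because SpanBERT drops the segment (token-type) embedding, the encoder can be driven without `segment_ids`. A hedged sketch of the call pattern enabled by this hunk, assuming the standard Texar `BERTEncoder` return of `(outputs, pooled_output)` and the 28996-token cased vocabulary reported in the test below (the first run downloads the pre-trained checkpoint):

import torch
from texar.torch.modules import BERTEncoder

encoder = BERTEncoder(pretrained_model_name='spanbert-base-cased')
inputs = torch.randint(0, 28996, (2, 16))  # [batch_size, max_time] token ids
lengths = torch.tensor([16, 12])

# No segment_ids are passed: SpanBERT has no segment embedder.
outputs, pooled_output = encoder(inputs=inputs, sequence_length=lengths)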

texar/torch/modules/pretrained/bert.py

Lines changed: 96 additions & 7 deletions
@@ -34,6 +34,7 @@
 _BIOBERT_PATH = "https://github.com/naver/biobert-pretrained/releases/download/"
 _SCIBERT_PATH = "https://s3-us-west-2.amazonaws.com/ai2-s2-research/" \
                 "scibert/tensorflow_models/"
+_SPANBERT_PATH = "https://dl.fbaipublicfiles.com/fairseq/models/"
 
 
 class PretrainedBERTMixin(PretrainedMixin, ABC):
@@ -97,6 +98,21 @@ class PretrainedBERTMixin(PretrainedMixin, ABC):
       * ``scibert-basevocab-cased``: Cased version of the model trained on
         the original BERT vocabulary.
 
+    * **SpanBERT**: proposed in (`Joshi et al`. 2019)
+      `SpanBERT: Improving Pre-training by Representing and Predicting Spans`_.
+      A variant of the standard BERT model, SpanBERT extends BERT by
+      (1) masking contiguous random spans rather than random tokens, and
+      (2) training the span boundary representations to predict the entire
+      content of the masked span, without relying on the individual token
+      representations within it. Unlike standard BERT, the SpanBERT
+      model does not use segment embeddings. Available model names
+      include:
+
+      * ``spanbert-base-cased``: SpanBERT using the BERT-base architecture,
+        12-layer, 768-hidden, 12-heads, 110M parameters.
+      * ``spanbert-large-cased``: SpanBERT using the BERT-large architecture,
+        24-layer, 1024-hidden, 16-heads, 340M parameters.
+
     We provide the following BERT classes:
 
       * :class:`~texar.torch.modules.BERTEncoder` for text encoding.
111127
112128
.. _`SciBERT: A Pretrained Language Model for Scientific Text`:
113129
https://arxiv.org/abs/1903.10676
130+
131+
.. _`SpanBERT: Improving Pre-training by Representing and Predicting Spans`:
132+
https://arxiv.org/abs/1907.10529
114133
"""
115134

116135
_MODEL_NAME = "BERT"
@@ -150,6 +169,12 @@ class PretrainedBERTMixin(PretrainedMixin, ABC):
150169
_SCIBERT_PATH + 'scibert_basevocab_uncased.tar.gz',
151170
'scibert-basevocab-cased':
152171
_SCIBERT_PATH + 'scibert_basevocab_cased.tar.gz',
172+
173+
# SpanBERT
174+
'spanbert-base-cased':
175+
_SPANBERT_PATH + "spanbert_hf_base.tar.gz",
176+
'spanbert-large-cased':
177+
_SPANBERT_PATH + "spanbert_hf.tar.gz",
153178
}
154179
_MODEL2CKPT = {
155180
# Standard BERT
@@ -172,6 +197,10 @@ class PretrainedBERTMixin(PretrainedMixin, ABC):
172197
'scibert-scivocab-cased': 'bert_model.ckpt',
173198
'scibert-basevocab-uncased': 'bert_model.ckpt',
174199
'scibert-basevocab-cased': 'bert_model.ckpt',
200+
201+
# SpanBERT
202+
'spanbert-base-cased': 'pytorch_model.bin',
203+
'spanbert-large-cased': 'pytorch_model.bin',
175204
}
176205

177206
@classmethod
@@ -182,13 +211,14 @@ def _transform_config(cls, pretrained_model_name: str,
182211
config_path = None
183212

184213
for file in files:
185-
if file == 'bert_config.json':
214+
if file in ('bert_config.json', 'config.json'):
186215
config_path = os.path.join(root, file)
187216
with open(config_path) as f:
188217
config_ckpt = json.loads(f.read())
189218
hidden_dim = config_ckpt['hidden_size']
190219
vocab_size = config_ckpt['vocab_size']
191-
type_vocab_size = config_ckpt['type_vocab_size']
220+
if not pretrained_model_name.startswith('spanbert'):
221+
type_vocab_size = config_ckpt['type_vocab_size']
192222
position_size = config_ckpt['max_position_embeddings']
193223
embedding_dropout = config_ckpt['hidden_dropout_prob']
194224
num_blocks = config_ckpt['num_hidden_layers']
@@ -208,11 +238,6 @@ def _transform_config(cls, pretrained_model_name: str,
208238
'dim': hidden_dim
209239
},
210240
'vocab_size': vocab_size,
211-
'segment_embed': {
212-
'name': 'token_type_embeddings',
213-
'dim': hidden_dim
214-
},
215-
'type_vocab_size': type_vocab_size,
216241
'position_embed': {
217242
'name': 'position_embeddings',
218243
'dim': hidden_dim
@@ -256,10 +281,74 @@ def _transform_config(cls, pretrained_model_name: str,
256281
}
257282
}
258283

284+
if not pretrained_model_name.startswith('spanbert'):
285+
configs.update({
286+
'segment_embed': {
287+
'name': 'token_type_embeddings',
288+
'dim': hidden_dim},
289+
'type_vocab_size': type_vocab_size,
290+
})
291+
259292
return configs
260293

261294
def _init_from_checkpoint(self, pretrained_model_name: str,
262295
cache_dir: str, **kwargs):
296+
if pretrained_model_name.startswith('spanbert'):
297+
global_tensor_map = {
298+
'bert.embeddings.word_embeddings.weight':
299+
'word_embedder._embedding',
300+
'bert.embeddings.position_embeddings.weight':
301+
'position_embedder._embedding',
302+
'bert.embeddings.LayerNorm.weight':
303+
'encoder.input_normalizer.weight',
304+
'bert.embeddings.LayerNorm.bias':
305+
'encoder.input_normalizer.bias',
306+
}
307+
308+
attention_tensor_map = {
309+
"attention.self.key.bias": "self_attns.{}.K_dense.bias",
310+
"attention.self.query.bias": "self_attns.{}.Q_dense.bias",
311+
"attention.self.value.bias": "self_attns.{}.V_dense.bias",
312+
"attention.output.dense.bias": "self_attns.{}.O_dense.bias",
313+
"attention.output.LayerNorm.weight":
314+
"poswise_layer_norm.{}.weight",
315+
"attention.output.LayerNorm.bias": "poswise_layer_norm.{}.bias",
316+
"intermediate.dense.bias": "poswise_networks.{}._layers.0.bias",
317+
"output.dense.bias": "poswise_networks.{}._layers.2.bias",
318+
"output.LayerNorm.weight": "output_layer_norm.{}.weight",
319+
"output.LayerNorm.bias": "output_layer_norm.{}.bias",
320+
"attention.self.key.weight": "self_attns.{}.K_dense.weight",
321+
"attention.self.query.weight": "self_attns.{}.Q_dense.weight",
322+
"attention.self.value.weight": "self_attns.{}.V_dense.weight",
323+
"attention.output.dense.weight": "self_attns.{}.O_dense.weight",
324+
"intermediate.dense.weight":
325+
"poswise_networks.{}._layers.0.weight",
326+
"output.dense.weight": "poswise_networks.{}._layers.2.weight",
327+
}
328+
checkpoint_path = os.path.abspath(os.path.join(
329+
cache_dir, self._MODEL2CKPT[pretrained_model_name]))
330+
331+
device = next(self.parameters()).device
332+
params = torch.load(checkpoint_path, map_location=device)
333+
334+
for name, tensor in params.items():
335+
if name in global_tensor_map:
336+
v_name = global_tensor_map[name]
337+
pointer = self._name_to_variable(v_name)
338+
assert pointer.shape == tensor.shape
339+
pointer.data = tensor.data.type(pointer.dtype)
340+
elif name.startswith('bert.encoder.layer.'):
341+
name = name.lstrip('bert.encoder.layer.')
342+
layer_num, layer_name = name.split('.', 1)
343+
if layer_name in attention_tensor_map:
344+
v_name = attention_tensor_map[layer_name]
345+
pointer = self._name_to_variable(
346+
'encoder.' + v_name.format(layer_num))
347+
assert pointer.shape == tensor.shape
348+
pointer.data = tensor.data.type(pointer.dtype)
349+
350+
return
351+
263352
try:
264353
import numpy as np
265354
import tensorflow as tf
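
For readers following `_init_from_checkpoint`, the per-layer mapping works by splitting off the layer index and formatting it into the Texar variable name. A standalone sketch of that step, shown here with plain prefix slicing (the diff itself uses `str.lstrip`, which strips a character set rather than a literal prefix but is safe here because layer indices are digits):

# Map one SpanBERT checkpoint parameter name to its Texar variable name.
attention_tensor_map = {
    'attention.self.query.weight': 'self_attns.{}.Q_dense.weight',
}

ckpt_name = 'bert.encoder.layer.3.attention.self.query.weight'
suffix = ckpt_name[len('bert.encoder.layer.'):]  # '3.attention.self.query.weight'
layer_num, layer_name = suffix.split('.', 1)
texar_name = 'encoder.' + attention_tensor_map[layer_name].format(layer_num)
print(texar_name)  # encoder.self_attns.3.Q_dense.weight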

texar/torch/modules/pretrained/bert_test.py

Lines changed: 69 additions & 0 deletions
@@ -102,6 +102,75 @@ def test_load_pretrained_bert_AND_transform_bert_to_texar_config(self):
 
         self.assertDictEqual(model_config, exp_config)
 
+    @pretrained_test
+    def test_load_spanbert_AND_transform_spanbert_to_texar_config(
+            self):
+        pretrained_model_dir = PretrainedBERTMixin.download_checkpoint(
+            pretrained_model_name="spanbert-base-cased")
+
+        info = list(os.walk(pretrained_model_dir))
+        _, _, files = info[0]
+        self.assertIn('config.json', files)
+        self.assertIn('pytorch_model.bin', files)
+
+        model_config = PretrainedBERTMixin._transform_config(
+            pretrained_model_name="spanbert-base-cased",
+            cache_dir=pretrained_model_dir)
+
+        exp_config = {
+            'hidden_size': 768,
+            'embed': {
+                'name': 'word_embeddings',
+                'dim': 768
+            },
+            'vocab_size': 28996,
+            'position_embed': {
+                'name': 'position_embeddings',
+                'dim': 768
+            },
+            'position_size': 512,
+            'encoder': {
+                'name': 'encoder',
+                'embedding_dropout': 0.1,
+                'num_blocks': 12,
+                'multihead_attention': {
+                    'use_bias': True,
+                    'num_units': 768,
+                    'num_heads': 12,
+                    'output_dim': 768,
+                    'dropout_rate': 0.1,
+                    'name': 'self'
+                },
+                'residual_dropout': 0.1,
+                'dim': 768,
+                'use_bert_config': True,
+                'eps': 1e-12,
+                'poswise_feedforward': {
+                    'layers': [
+                        {
+                            'type': 'Linear',
+                            'kwargs': {
+                                'in_features': 768,
+                                'out_features': 3072,
+                                'bias': True
+                            }
+                        },
+                        {'type': 'BertGELU'},
+                        {
+                            'type': 'Linear',
+                            'kwargs': {
+                                'in_features': 3072,
+                                'out_features': 768,
+                                'bias': True
+                            }
+                        }
+                    ]
+                }
+            }
+        }
+
+        self.assertDictEqual(model_config, exp_config)
+
 
 if __name__ == "__main__":
     unittest.main()
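
Beyond the config check above, a quick hedged sketch to confirm the encoder-side behaviour introduced in this commit (it downloads the pre-trained checkpoint; `segment_embedder` is the attribute set in `BERTEncoder.__init__` in the diff above):

from texar.torch.modules import BERTEncoder

encoder = BERTEncoder(pretrained_model_name='spanbert-base-cased')
# SpanBERT checkpoints carry no token-type table, so the encoder skips it.
assert encoder.segment_embedder is None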
