1
- from collections import Counter
2
-
3
1
import numpy as np
4
2
import torch
5
3
@@ -17,6 +15,56 @@ def convert_to_torch_tensor(data_list, use_cuda):
17
15
return data_list
18
16
19
17
18
class BaseSampler(object):
    """Common interface shared by every sampler.

    Concrete subclasses override __call__, which receives a DataSet
    object and returns a list of int — the indices in sampling order.
    """

    def __call__(self, *args, **kwargs):
        # Abstract hook: each sampler decides how indices are produced.
        raise NotImplementedError
27
+
28
+
29
class SequentialSampler(BaseSampler):
    """Yield sample indices in their original, unshuffled order."""

    def __call__(self, data_set):
        # Identity ordering: index i is visited i-th.
        total = len(data_set)
        return [i for i in range(total)]
36
+
37
+
38
class RandomSampler(BaseSampler):
    """Yield sample indices in a uniformly random permutation."""

    def __call__(self, data_set):
        # np.random.permutation draws a fresh permutation of range(n).
        order = np.random.permutation(len(data_set))
        return list(order)
45
+
46
+
47
def simple_sort_bucketing(lengths):
    """Order example indices by ascending sequence length.

    :param lengths: list of int, the lengths of all examples.
    :return: list of int, example indices sorted from shortest to longest
        example. Ties keep their original relative order (sorted is stable).

    NOTE: despite the name, this currently returns a single flat index list,
    not a 2-level bucket list.
    """
    # TODO: need to return buckets
    # Pair each index with its length, then sort on the length component.
    sorted_pairs = sorted(enumerate(lengths), key=lambda pair: pair[1])
    return [idx for idx, _ in sorted_pairs]
67
+
20
68
def k_means_1d (x , k , max_iter = 100 ):
21
69
"""Perform k-means on 1-D data.
22
70
@@ -46,18 +94,10 @@ def k_means_1d(x, k, max_iter=100):
46
94
return np .array (centroids ), assign
47
95
48
96
49
- def k_means_bucketing (all_inst , buckets ):
97
+ def k_means_bucketing (lengths , buckets ):
50
98
"""Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths.
51
99
52
- :param all_inst: 3-level list
53
- E.g. ::
54
-
55
- [
56
- [[word_11, word_12, word_13], [label_11. label_12]], # sample 1
57
- [[word_21, word_22, word_23], [label_21. label_22]], # sample 2
58
- ...
59
- ]
60
-
100
+ :param lengths: list of int, the length of all samples.
61
101
:param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length
62
102
threshold for each bucket (This is usually None.).
63
103
:return data: 2-level list
@@ -72,7 +112,6 @@ def k_means_bucketing(all_inst, buckets):
72
112
"""
73
113
bucket_data = [[] for _ in buckets ]
74
114
num_buckets = len (buckets )
75
- lengths = np .array ([len (inst [0 ]) for inst in all_inst ])
76
115
_ , assignments = k_means_1d (lengths , num_buckets )
77
116
78
117
for idx , bucket_id in enumerate (assignments ):
@@ -81,102 +120,33 @@ def k_means_bucketing(all_inst, buckets):
81
120
return bucket_data
82
121
83
122
84
- class BaseSampler (object ):
85
- """The base class of all samplers.
86
-
87
- """
88
-
89
- def __call__ (self , * args , ** kwargs ):
90
- raise NotImplementedError
91
-
92
-
93
- class SequentialSampler (BaseSampler ):
94
- """Sample data in the original order.
95
-
96
- """
97
-
98
- def __call__ (self , data_set ):
99
- return list (range (len (data_set )))
100
-
101
-
102
- class RandomSampler (BaseSampler ):
103
- """Sample data in random permutation order.
104
-
105
- """
106
-
107
- def __call__ (self , data_set ):
108
- return list (np .random .permutation (len (data_set )))
109
-
110
-
111
-
112
- class Batchifier (object ):
113
- """Wrap random or sequential sampler to generate a mini-batch.
114
-
115
- """
116
-
117
- def __init__ (self , sampler , batch_size , drop_last = True ):
118
- """
119
-
120
- :param sampler: a Sampler object
121
- :param batch_size: int, the size of the mini-batch
122
- :param drop_last: bool, whether to drop the last examples that are not enough to make a mini-batch.
123
-
124
- """
125
- super (Batchifier , self ).__init__ ()
126
- self .sampler = sampler
127
- self .batch_size = batch_size
128
- self .drop_last = drop_last
129
-
130
- def __iter__ (self ):
131
- batch = []
132
- for example in self .sampler :
133
- batch .append (example )
134
- if len (batch ) == self .batch_size :
135
- yield batch
136
- batch = []
137
- if 0 < len (batch ) < self .batch_size and self .drop_last is False :
138
- yield batch
139
-
140
-
141
- class BucketBatchifier (Batchifier ):
123
class BucketSampler(BaseSampler):
    """Partition all samples into multiple buckets, each of which contains
    sentences of approximately the same length. In sampling, first randomly
    choose a bucket, then sample data from it.
    """

    def __call__(self, data_set, batch_size, num_buckets):
        return self._process(data_set, batch_size, num_buckets)

    def _process(self, data_set, batch_size, num_buckets, use_kmeans=False):
        """Produce a flat list of sampling indices, one batch at a time from
        a randomly chosen bucket.

        :param data_set: a DataSet object
        :param batch_size: int
        :param num_buckets: int, number of buckets for grouping these sequences.
        :param use_kmeans: bool, whether to use k-means to create buckets.
        :return: list of int, (len(data_set) // batch_size) * batch_size
            sampling indices.
        """
        if use_kmeans:
            # k_means_bucketing returns a 2-level list of index buckets.
            buckets = k_means_bucketing(data_set, [None] * num_buckets)
        else:
            # BUGFIX: simple_sort_bucketing returns a FLAT index list. Wrap it
            # as a single bucket so the selection loop below indexes a bucket,
            # not a lone int (the original crashed in np.random.shuffle).
            buckets = [simple_sort_bucketing(data_set)]
        index_list = []
        for _ in range(len(data_set) // batch_size):
            chosen_bucket = buckets[np.random.randint(0, len(buckets))]
            # In-place reshuffle so repeated picks of the same bucket vary.
            np.random.shuffle(chosen_bucket)
            index_list.extend(chosen_bucket[:batch_size])
        return index_list
0 commit comments