Skip to content

Commit 451c968

Browse files
committed
Merge branch 'dev'
2 parents 356e3e4 + 93c7804 commit 451c968

File tree

3 files changed

+32
-4
lines changed

3 files changed

+32
-4
lines changed

docs/source/content/examples/pool-based_sampling.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
"\n",
99
"## Overview\n",
1010
"\n",
11-
"In this example, the we apply an `ActiveLearner` onto the iris dataset using pool-based sampling. In this setting, we assume a small set of labeled data $\\mathcal{L}$ and a large set of unlabeled data $\\mathcal{U}$ such that $\\left| \\mathcal{L} \\right| \\ll \\left| \\mathcal{U} \\right|$. In his review of the active learning literature, Settles covers a high-level overview of the general pool-based sampling algorithm:\n",
11+
"In this example, we apply an `ActiveLearner` onto the iris dataset using pool-based sampling. In this setting, we assume a small set of labeled data $\\mathcal{L}$ and a large set of unlabeled data $\\mathcal{U}$ such that $\\left| \\mathcal{L} \\right| \\ll \\left| \\mathcal{U} \\right|$. In his review of the active learning literature, Settles covers a high-level overview of the general pool-based sampling algorithm:\n",
1212
"\n",
1313
"> Queries are selectively drawn from the pool, which is usually assumed to be closed (i.e., static or non-changing), although this is not strictly necessary. Typically, instances are queried in a greedy fashion, according to an informativeness measure used to evaluate all instances in the pool (or, perhaps if $\\mathcal{U}$ is very large, some subsample thereof).\n",
1414
"\n",

modAL/models/base.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class BaseLearner(ABC, BaseEstimator):
3030
for instance, modAL.uncertainty.uncertainty_sampling.
3131
X_training: Initial training samples, if available.
3232
y_training: Initial training labels corresponding to initial training samples.
33+
force_all_finite: When True, forces all values of the data to be finite.
34+
When False, accepts np.nan and np.inf values.
3335
bootstrap_init: If initial training data is available, bootstrapping can be done during the first training.
3436
Useful when building Committee models with bagging.
3537
**fit_kwargs: keyword arguments.
@@ -47,6 +49,7 @@ def __init__(self,
4749
X_training: Optional[modALinput] = None,
4850
y_training: Optional[modALinput] = None,
4951
bootstrap_init: bool = False,
52+
force_all_finite: bool = True,
5053
**fit_kwargs
5154
) -> None:
5255
assert callable(query_strategy), 'query_strategy must be callable'
@@ -59,6 +62,9 @@ def __init__(self,
5962
if X_training is not None:
6063
self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs)
6164

65+
assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool'
66+
self.force_all_finite = force_all_finite
67+
6268
def _add_training_data(self, X: modALinput, y: modALinput) -> None:
6369
"""
6470
Adds the new data and label to the known data, but does not retrain the model.
@@ -71,7 +77,8 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None:
7177
If the classifier has been fitted, the features in X have to agree with the training samples which the
7278
classifier has seen.
7379
"""
74-
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None)
80+
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
81+
force_all_finite=self.force_all_finite)
7582

7683
if self.X_training is None:
7784
self.X_training = X
@@ -117,7 +124,8 @@ def _fit_on_new(self, X: modALinput, y: modALinput, bootstrap: bool = False, **f
117124
Returns:
118125
self
119126
"""
120-
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None)
127+
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
128+
force_all_finite=self.force_all_finite)
121129

122130
if not bootstrap:
123131
self.estimator.fit(X, y, **fit_kwargs)
@@ -146,7 +154,8 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwarg
146154
Returns:
147155
self
148156
"""
149-
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None)
157+
check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
158+
force_all_finite=self.force_all_finite)
150159
self.X_training, self.y_training = X, y
151160
return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs)
152161

tests/core_tests.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -734,6 +734,25 @@ def test_teach(self):
734734

735735
learner.teach(X, y, bootstrap=bootstrap, only_new=only_new)
736736

737+
def test_nan(self):
738+
X_training_nan = np.ones(shape=(10, 2)) * np.nan
739+
X_training_inf = np.ones(shape=(10, 2)) * np.inf
740+
y_training = np.random.randint(0, 2, size=10)
741+
742+
learner = modAL.models.learners.ActiveLearner(
743+
X_training=X_training_nan, y_training=y_training,
744+
estimator=mock.MockEstimator(),
745+
force_all_finite=False
746+
)
747+
learner.teach(X_training_nan, y_training)
748+
749+
learner = modAL.models.learners.ActiveLearner(
750+
X_training=X_training_inf, y_training=y_training,
751+
estimator=mock.MockEstimator(),
752+
force_all_finite=False
753+
)
754+
learner.teach(X_training_inf, y_training)
755+
737756
def test_keras(self):
738757
pass
739758

0 commit comments

Comments (0)