Implemented test for DataSourceList.sample_spatial_and_temporal_positions_for_examples

JackKelly · JackKelly · commit e6ef5218db20 · 2021-10-25T17:53:11.000+01:00
diff --git a/nowcasting_dataset/data_sources/data_source_list.py b/nowcasting_dataset/data_sources/data_source_list.py
@@ -53,12 +53,9 @@ def get_t0_datetimes_across_all_data_sources(self, freq: str) -> pd.DatetimeInde
 
         return t0_datetimes
 
-    def sample_position_of_every_example_of_every_split(
-        self,
-        t0_datetimes: pd.DatetimeIndex,
-        split_method: SplitMethod,
-        n_examples_per_split: dict[SplitName, int],
-    ) -> dict[SplitName, pd.DataFrame]:
+    def sample_spatial_and_temporal_positions_for_examples(
+        self, t0_datetimes: pd.DatetimeIndex, n_examples: int
+    ) -> pd.DataFrame:
         """
         Computes the geospatial and temporal position of each training example.
 
@@ -68,33 +65,21 @@ def sample_position_of_every_example_of_every_split(
         Args:
             t0_datetimes: All available t0 datetimes.  Can be computed with
                 `DataSourceList.get_t0_datetimes_across_all_data_sources()`
-            split_method: The method used to split data into train, validation, and test.
-            n_examples_per_split: The number of examples requested for each split.
+            n_examples: The number of examples requested.
 
         Returns:
-            A dict where the keys are a SplitName, and the values are a pd.DataFrame.
-            Each row of each DataFrame specifies the position of each example, using
+            Each row of the DataFrame specifies the position of each example, using
             columns: 't0_datetime_UTC', 'x_center_OSGB', 'y_center_OSGB'.
         """
-        # Split t0_datetimes into train, test and validation sets.
-        t0_datetimes_per_split = split_data(datetimes=t0_datetimes, method=split_method)
-        t0_datetimes_per_split = t0_datetimes_per_split._asdict()
-
         data_source_which_defines_geo_position = self[0]
-
-        positions_per_split: dict[SplitName, pd.DataFrame] = {}
-        for split_name, t0_datetimes_for_split in t0_datetimes_per_split.items():
-            n_examples = n_examples_per_split[split_name]
-            shuffled_t0_datetimes = np.random.choice(t0_datetimes_for_split, shape=n_examples)
-            x_locations, y_locations = data_source_which_defines_geo_position.get_locations(
-                shuffled_t0_datetimes
-            )
-            positions_per_split[split_name] = pd.DataFrame(
-                {
-                    "t0_datetime_UTC": shuffled_t0_datetimes,
-                    "x_center_OSGB": x_locations,
-                    "y_center_OSGB": y_locations,
-                }
-            )
-
-        return positions_per_split
+        shuffled_t0_datetimes = np.random.choice(t0_datetimes, size=n_examples)
+        x_locations, y_locations = data_source_which_defines_geo_position.get_locations(
+            shuffled_t0_datetimes
+        )
+        return pd.DataFrame(
+            {
+                "t0_datetime_UTC": shuffled_t0_datetimes,
+                "x_center_OSGB": x_locations,
+                "y_center_OSGB": y_locations,
+            }
+        )
diff --git a/nowcasting_dataset/dataset/datamodule.py b/nowcasting_dataset/dataset/datamodule.py
@@ -14,8 +14,8 @@
 from nowcasting_dataset.data_sources.metadata.metadata_data_source import MetadataDataSource
 from nowcasting_dataset.data_sources.sun.sun_data_source import SunDataSource
 from nowcasting_dataset.dataset import datasets
-from nowcasting_dataset.dataset.split.split import split_data, SplitMethod, SplitName
-from nowcasting_dataset.data_source_list import DataSourceList
+from nowcasting_dataset.dataset.split.split import split_data, SplitMethod
+from nowcasting_dataset.data_sources.data_source_list import DataSourceList
 
 
 with warnings.catch_warnings():
diff --git a/nowcasting_dataset/dataset/split/split.py b/nowcasting_dataset/dataset/split/split.py
@@ -39,7 +39,10 @@ class SplitName(Enum):
     TEST = "test"
 
 
-SplitData = namedtuple(typename="SplitData", field_names=["train", "validation", "test"])
+SplitData = namedtuple(
+    typename="SplitData",
+    field_names=[SplitName.TRAIN.value, SplitName.VALIDATION.value, SplitName.TEST.value],
+)
 
 
 def split_data(