Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 2d7d977

Browse files
Merge pull request #216 from openclimatefix/issue/202-modality-files
Issue/202 modality files
2 parents e5b5d88 + 0046e21 commit 2d7d977

File tree

18 files changed

+153
-60
lines changed

18 files changed

+153
-60
lines changed

nowcasting_dataset/data_sources/data_source.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,9 @@ def get_batch(
121121
output.to_numpy()
122122
examples.append(output)
123123

124+
# could add option here, to save each data source using
125+
# 1. # DataSourceOutput.to_xr_dataset() to make it a dataset
126+
# 2. DataSourceOutput.save_netcdf(), save to netcdf
124127
return DataSourceOutput.create_batch_from_examples(examples)
125128

126129
def datetime_index(self) -> pd.DatetimeIndex:

nowcasting_dataset/data_sources/datasource_output.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
""" General Data Source output pydantic class. """
22
from __future__ import annotations
3+
import os
4+
from nowcasting_dataset.filesystem.utils import make_folder
5+
from nowcasting_dataset.utils import get_netcdf_filename
6+
7+
from pathlib import Path
38
from pydantic import BaseModel, Field
49
import pandas as pd
510
import xarray as xr
@@ -32,6 +37,10 @@ class Config:
3237
"then this item stores one data item i.e Example",
3338
)
3439

40+
def get_name(self) -> str:
41+
""" Get the name of the class """
42+
return self.__class__.__name__.lower()
43+
3544
def to_numpy(self):
3645
"""Change to numpy"""
3746
for k, v in self.dict().items():
@@ -93,6 +102,31 @@ def get_datetime_index(self):
93102
""" Datetime index for the data """
94103
pass
95104

105+
def save_netcdf(self, batch_i: int, path: Path, xr_dataset: xr.Dataset):
106+
"""
107+
Save batch to netcdf file
108+
109+
Args:
110+
batch_i: the batch id, used to make the filename
111+
path: the path where it will be saved. This can be local or in the cloud.
112+
xr_dataset: xr dataset that has batch information in it
113+
"""
114+
filename = get_netcdf_filename(batch_i)
115+
116+
name = self.get_name()
117+
118+
# make folder
119+
folder = os.path.join(path, name)
120+
if batch_i == 0:
121+
# only need to make the folder once, or check that the folder is there once
122+
make_folder(path=folder)
123+
124+
# make file
125+
local_filename = os.path.join(folder, filename)
126+
127+
encoding = {name: {"compression": "lzf"} for name in xr_dataset.data_vars}
128+
xr_dataset.to_netcdf(local_filename, engine="h5netcdf", mode="w", encoding=encoding)
129+
96130
def select_time_period(
97131
self,
98132
keys: List[str],

nowcasting_dataset/dataset/batch.py

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
""" batch functions """
22
import logging
3+
import os
34
from pathlib import Path
4-
from typing import List, Optional, Union
5+
from typing import List, Optional, Union, Dict
56

67
import xarray as xr
78
from pydantic import BaseModel, Field
89

10+
from nowcasting_dataset.filesystem.utils import make_folder
11+
912
from nowcasting_dataset.config.model import Configuration
1013

1114
from nowcasting_dataset.data_sources.datetime.datetime_model import Datetime
@@ -72,21 +75,23 @@ class Batch(Example):
7275
"then this item stores one data item",
7376
)
7477

75-
def batch_to_dataset(self) -> xr.Dataset:
78+
def batch_to_dict_dataset(self) -> Dict[str, xr.Dataset]:
7679
"""Change batch to xr.Dataset so it can be saved and compressed"""
77-
return batch_to_dataset(batch=self)
80+
return batch_to_dict_dataset(batch=self)
7881

7982
@staticmethod
80-
def load_batch_from_dataset(xr_dataset: xr.Dataset):
81-
"""Change xr.Datatset to Batch object"""
83+
def load_batch_from_dict_dataset(xr_dataset: Dict[str, xr.Dataset]):
84+
"""Change dictionary of xr.Dataset to Batch object"""
8285
# get a list of data sources
8386
data_sources_names = Example.__fields__.keys()
8487

8588
# collect data sources
8689
data_sources_dict = {}
8790
for data_source_name in data_sources_names:
8891
cls = Example.__fields__[data_source_name].type_
89-
data_sources_dict[data_source_name] = cls.from_xr_dataset(xr_dataset=xr_dataset)
92+
data_sources_dict[data_source_name] = cls.from_xr_dataset(
93+
xr_dataset=xr_dataset[data_source_name]
94+
)
9095

9196
data_sources_dict["batch_size"] = data_sources_dict["metadata"].batch_size
9297

@@ -168,43 +173,57 @@ def save_netcdf(self, batch_i: int, path: Path):
168173
path: the path where it will be saved. This can be local or in the cloud.
169174
170175
"""
171-
batch_xr = self.batch_to_dataset()
176+
batch_xr = self.batch_to_dict_dataset()
172177

173-
encoding = {name: {"compression": "lzf"} for name in batch_xr.data_vars}
174-
filename = get_netcdf_filename(batch_i)
175-
local_filename = path / filename
176-
batch_xr.to_netcdf(local_filename, engine="h5netcdf", mode="w", encoding=encoding)
178+
for data_source in self.data_sources:
179+
xr_dataset = batch_xr[data_source.get_name()]
180+
data_source.save_netcdf(batch_i=batch_i, path=path, xr_dataset=xr_dataset)
177181

178182
@staticmethod
179-
def load_netcdf(local_netcdf_filename: Path):
183+
def load_netcdf(local_netcdf_path: Union[Path, str], batch_idx: int):
180184
"""Load batch from netcdf file"""
181-
netcdf_batch = xr.load_dataset(local_netcdf_filename)
185+
data_sources_names = Example.__fields__.keys()
182186

183-
return Batch.load_batch_from_dataset(netcdf_batch)
187+
# collect data sources
188+
batch_dict = {}
189+
for data_source_name in data_sources_names:
184190

191+
local_netcdf_filename = os.path.join(
192+
local_netcdf_path, data_source_name, f"{batch_idx}.nc"
193+
)
194+
xr_dataset = xr.load_dataset(local_netcdf_filename)
185195

186-
def batch_to_dataset(batch: Batch) -> xr.Dataset:
187-
"""Concat all the individual fields in an Example into a single Dataset.
196+
batch_dict[data_source_name] = xr_dataset
197+
198+
return Batch.load_batch_from_dict_dataset(batch_dict)
199+
200+
201+
def batch_to_dict_dataset(batch: Batch) -> Dict[str, xr.Dataset]:
202+
"""Concat all the individual fields in an Example into a dictionary of Datasets.
188203
189204
Args:
190205
batch: List of Example objects, which together constitute a single batch.
191206
"""
192-
datasets = []
207+
individual_datasets = {}
208+
split_batch = batch.split()
209+
210+
# loop over each data source
211+
for data_source in split_batch[0].data_sources:
193212

194-
# loop over each item in the batch
195-
for i, example in enumerate(batch.split()):
213+
datasets = []
214+
name = data_source.get_name()
196215

197-
individual_datasets = []
216+
# loop over each item in the batch
217+
for i, example in enumerate(split_batch):
198218

199-
for data_source in example.data_sources:
200219
if data_source is not None:
201-
individual_datasets.append(data_source.to_xr_dataset(i))
220+
datasets.append(getattr(split_batch[i], name).to_xr_dataset(i))
202221

203222
# Merge
204-
merged_ds = xr.merge(individual_datasets)
205-
datasets.append(merged_ds)
223+
merged_ds = xr.concat(datasets, dim="example")
224+
individual_datasets[name] = merged_ds
206225

207-
return xr.concat(datasets, dim="example")
226+
return individual_datasets
208227

209228

210229
def write_batch_locally(batch: Union[Batch, dict], batch_i: int, path: Path):
@@ -219,8 +238,4 @@ def write_batch_locally(batch: Union[Batch, dict], batch_i: int, path: Path):
219238
if type(batch):
220239
batch = Batch(**batch)
221240

222-
dataset = batch.batch_to_dataset()
223-
encoding = {name: {"compression": "lzf"} for name in dataset.data_vars}
224-
filename = get_netcdf_filename(batch_i)
225-
local_filename = path / filename
226-
dataset.to_netcdf(local_filename, engine="h5netcdf", mode="w", encoding=encoding)
241+
batch.save_netcdf(batch_i=batch_i, path=path)

nowcasting_dataset/dataset/datasets.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from nowcasting_dataset import data_sources
1717
from nowcasting_dataset import utils as nd_utils
18-
from nowcasting_dataset.filesystem.utils import download_to_local
18+
from nowcasting_dataset.filesystem.utils import download_to_local, delete_all_files_in_temp_path
1919
from nowcasting_dataset.config.model import Configuration
2020
from nowcasting_dataset.consts import (
2121
GSP_YIELD,
@@ -185,21 +185,24 @@ def __getitem__(self, batch_idx: int) -> Batch:
185185
"batch_idx must be in the range" f" [0, {self.n_batches}), not {batch_idx}!"
186186
)
187187
netcdf_filename = nd_utils.get_netcdf_filename(batch_idx)
188-
remote_netcdf_filename = os.path.join(self.src_path, netcdf_filename)
189-
local_netcdf_filename = os.path.join(self.tmp_path, netcdf_filename)
188+
# remote_netcdf_folder = os.path.join(self.src_path, netcdf_filename)
189+
# local_netcdf_filename = os.path.join(self.tmp_path, netcdf_filename)
190190

191191
if self.cloud in ["gcp", "aws"]:
192+
# TODO check this works for multiple files
192193
download_to_local(
193-
remote_filename=remote_netcdf_filename,
194-
local_filename=local_netcdf_filename,
194+
remote_filename=self.src_path,
195+
local_filename=self.tmp_path,
195196
)
197+
local_netcdf_folder = self.tmp_path
196198
else:
197-
local_netcdf_filename = remote_netcdf_filename
199+
local_netcdf_folder = self.src_path
198200

199-
batch = Batch.load_netcdf(local_netcdf_filename)
201+
batch = Batch.load_netcdf(local_netcdf_folder, batch_idx=batch_idx)
200202
# netcdf_batch = xr.load_dataset(local_netcdf_filename)
201203
if self.cloud != "local":
202-
os.remove(local_netcdf_filename)
204+
# remove files in a folder, but not the folder itself
205+
delete_all_files_in_temp_path(self.src_path)
203206

204207
# batch = example.xr_to_example(batch_xr=netcdf_batch, required_keys=self.required_keys)
205208

nowcasting_dataset/filesystem/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,9 @@ def upload_one_file(
127127
"""
128128
filesystem = fsspec.open(remote_filename).fs
129129
filesystem.put(local_filename, remote_filename)
130+
131+
132+
def make_folder(path: Union[str, Path]):
133+
""" Make folder """
134+
filesystem = fsspec.open(path).fs
135+
filesystem.mkdir(path)

scripts/generate_data_for_tests/get_test_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,4 +148,4 @@
148148
c.process.sat_channels = c.process.sat_channels[0:1]
149149

150150
f = Batch.fake(configuration=c)
151-
f.save_netcdf(batch_i=0, path=Path(f"{local_path}/tests/data"))
151+
f.save_netcdf(batch_i=0, path=Path(f"{local_path}/tests/data/batch"))

tests/data/batch/datetime/0.nc

20.9 KB
Binary file not shown.

tests/data/batch/gsp/0.nc

27 KB
Binary file not shown.

tests/data/batch/metadata/0.nc

17.3 KB
Binary file not shown.

tests/data/batch/nwp/0.nc

2.3 MB
Binary file not shown.

0 commit comments

Comments
 (0)