Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 02cfe7b

Browse files
Merge pull request #599 from openclimatefix/issue/598-id-metadata
add id to metadata
2 parents b8e8947 + 5e985ef commit 02cfe7b

24 files changed

+431
-321
lines changed

nowcasting_dataset/consts.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,5 @@
133133
SPATIAL_AND_TEMPORAL_LOCATIONS_OF_EACH_EXAMPLE_FILENAME = (
134134
"spatial_and_temporal_locations_of_each_example.csv"
135135
)
136-
SPATIAL_AND_TEMPORAL_LOCATIONS_COLUMN_NAMES = ("t0_datetime_UTC", "x_center_OSGB", "y_center_OSGB")
137136

138137
LOG_LEVELS = ("DEBUG", "INFO", "WARNING", "ERROR")

nowcasting_dataset/data_sources/data_source.py

Lines changed: 24 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
import nowcasting_dataset.filesystem.utils as nd_fs_utils
1414
import nowcasting_dataset.time as nd_time
1515
from nowcasting_dataset import square, utils
16-
from nowcasting_dataset.consts import SPATIAL_AND_TEMPORAL_LOCATIONS_COLUMN_NAMES
1716
from nowcasting_dataset.data_sources.datasource_output import DataSourceOutput
17+
from nowcasting_dataset.data_sources.metadata.metadata_model import SpaceTimeLocation
1818
from nowcasting_dataset.dataset.xr_utils import (
1919
convert_coordinates_to_indexes_for_list_datasets,
2020
join_list_dataset_to_batch_dataset,
@@ -137,7 +137,7 @@ def open(self):
137137
@utils.exception_logger
138138
def create_batches(
139139
self,
140-
spatial_and_temporal_locations_of_each_example: pd.DataFrame,
140+
spatial_and_temporal_locations_of_each_example: List[SpaceTimeLocation],
141141
idx_of_first_batch: int,
142142
batch_size: int,
143143
dst_path: Path,
@@ -174,16 +174,6 @@ def create_batches(
174174
)
175175
assert upload_every_n_batches >= 0, "`upload_every_n_batches` must be >= 0"
176176

177-
spatial_and_temporal_locations_of_each_example_columns = (
178-
spatial_and_temporal_locations_of_each_example.columns.to_list()
179-
)
180-
assert spatial_and_temporal_locations_of_each_example_columns == list(
181-
SPATIAL_AND_TEMPORAL_LOCATIONS_COLUMN_NAMES
182-
), (
183-
f"The provided data columns {spatial_and_temporal_locations_of_each_example_columns}"
184-
f" do not match {SPATIAL_AND_TEMPORAL_LOCATIONS_COLUMN_NAMES=}"
185-
)
186-
187177
self.open()
188178

189179
# Figure out where to write batches to:
@@ -200,7 +190,7 @@ def create_batches(
200190
batch_idx=batch_idx, batch_size=batch_size
201191
)
202192

203-
locations_for_batch = spatial_and_temporal_locations_of_each_example.iloc[
193+
locations_for_batch = spatial_and_temporal_locations_of_each_example[
204194
start_example_idx:end_example_idx
205195
]
206196
locations_for_batches.append(locations_for_batch)
@@ -211,11 +201,7 @@ def create_batches(
211201
logger.debug(f"{self.__class__.__name__} creating batch {batch_idx}!")
212202

213203
# Generate batch.
214-
batch = self.get_batch(
215-
t0_datetimes_utc=locations_for_batch.t0_datetime_UTC,
216-
x_centers_osgb=locations_for_batch.x_center_OSGB,
217-
y_centers_osgb=locations_for_batch.y_center_OSGB,
218-
)
204+
batch = self.get_batch(locations=locations_for_batch)
219205

220206
# Save batch to disk.
221207
batch.save_netcdf(
@@ -239,43 +225,26 @@ def create_batches(
239225
dst_path=dst_path, local_path=path_to_write_to
240226
)
241227

242-
def get_batch(
243-
self,
244-
t0_datetimes_utc: pd.DatetimeIndex,
245-
x_centers_osgb: Iterable[Number],
246-
y_centers_osgb: Iterable[Number],
247-
) -> DataSourceOutput:
228+
def get_batch(self, locations: List[SpaceTimeLocation]) -> DataSourceOutput:
248229
"""
249230
Get Batch Data
250231
251232
Args:
252-
t0_datetimes_utc: list of timestamps for the datetime of the batches.
253-
The batch will also include data for historic and future depending
254-
on `history_minutes` and `future_minutes`.
255-
The batch size is given by the length of the t0_datetimes.
256-
x_centers_osgb: x center batch locations
257-
y_centers_osgb: y center batch locations
233+
locations: List of location objects.
234+
A location object contains
235+
- a timestamp of the example (t0_datetime_utc),
236+
- the x center location of the example (x_center_osgb)
237+
- the y center location of the example (y_center_osgb)
258238
259239
Returns: Batch data.
260240
"""
261-
assert len(t0_datetimes_utc) == len(x_centers_osgb), (
262-
f"len(t0_datetimes) != len(x_locations): "
263-
f"{len(t0_datetimes_utc)} != {len(x_centers_osgb)}"
264-
)
265-
assert len(t0_datetimes_utc) == len(y_centers_osgb), (
266-
f"len(t0_datetimes) != len(y_locations): "
267-
f"{len(t0_datetimes_utc)} != {len(y_centers_osgb)}"
268-
)
269-
zipped = list(zip(t0_datetimes_utc, x_centers_osgb, y_centers_osgb))
270-
batch_size = len(t0_datetimes_utc)
241+
242+
batch_size = len(locations)
271243

272244
with futures.ThreadPoolExecutor(max_workers=batch_size) as executor:
273245
future_examples = []
274-
for coords in zipped:
275-
t0_datetime, x_location, y_location = coords
276-
future_example = executor.submit(
277-
self.get_example, t0_datetime, x_location, y_location
278-
)
246+
for location in locations:
247+
future_example = executor.submit(self.get_example, location)
279248
future_examples.append(future_example)
280249

281250
# Get the examples back. Loop round each future so we can log a helpful error.
@@ -378,9 +347,7 @@ def _get_time_slice(self, t0_datetime_utc: pd.Timestamp):
378347

379348
def get_example(
380349
self,
381-
t0_datetime_utc: pd.Timestamp, #: Datetime of "now": The most recent obs.
382-
x_center_osgb: Number, #: Centre, in OSGB coordinates.
383-
y_center_osgb: Number, #: Centre, in OSGB coordinates.
350+
location: SpaceTimeLocation, #: Location object of the most recent observation
384351
) -> xr.Dataset:
385352
"""Must be overridden by child classes."""
386353
raise NotImplementedError()
@@ -452,23 +419,24 @@ def data(self):
452419
raise RuntimeError("Please run `open()` before accessing data!")
453420
return self._data
454421

455-
def get_example(
456-
self, t0_datetime_utc: pd.Timestamp, x_center_osgb: Number, y_center_osgb: Number
457-
) -> xr.Dataset:
422+
def get_example(self, location: SpaceTimeLocation) -> xr.Dataset:
458423
"""
459424
Get Example data
460425
461426
Args:
462-
t0_datetime_utc: list of timestamps for the datetime of the batches.
463-
The batch will also include data for historic and future depending
464-
on `history_minutes` and `future_minutes`.
465-
x_center_osgb: x center batch locations
466-
y_center_osgb: y center batch locations
427+
location: A location object of the example which contains
428+
- a timestamp of the example (t0_datetime_utc),
429+
- the x center location of the example (x_center_osgb)
430+
- the y center location of the example (y_center_osgb)
467431
468432
Returns: Example Data
469433
470434
"""
471435

436+
t0_datetime_utc = location.t0_datetime_utc
437+
x_center_osgb = location.x_center_osgb
438+
y_center_osgb = location.y_center_osgb
439+
472440
logger.debug(
473441
f"Getting example for {t0_datetime_utc=}, " f"{x_center_osgb=} and {y_center_osgb=}"
474442
)

nowcasting_dataset/data_sources/fake/batch.py

Lines changed: 40 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
)
2323
from nowcasting_dataset.data_sources.gsp.eso import get_gsp_metadata_from_eso
2424
from nowcasting_dataset.data_sources.gsp.gsp_model import GSP
25-
from nowcasting_dataset.data_sources.metadata.metadata_model import Metadata
25+
from nowcasting_dataset.data_sources.metadata.metadata_model import Metadata, SpaceTimeLocation
2626
from nowcasting_dataset.data_sources.nwp.nwp_model import NWP
2727
from nowcasting_dataset.data_sources.optical_flow.optical_flow_model import OpticalFlow
2828
from nowcasting_dataset.data_sources.pv.pv_model import PV
@@ -108,9 +108,9 @@ def gsp_fake(
108108
t0_datetimes_utc = make_t0_datetimes_utc(batch_size)
109109
x_centers_osgb, y_centers_osgb = make_random_x_and_y_osgb_centers(batch_size)
110110
else:
111-
t0_datetimes_utc = metadata.t0_datetime_utc
112-
x_centers_osgb = metadata.x_center_osgb
113-
y_centers_osgb = metadata.y_center_osgb
111+
t0_datetimes_utc = metadata.t0_datetimes_utc
112+
x_centers_osgb = metadata.x_centers_osgb
113+
y_centers_osgb = metadata.y_centers_osgb
114114

115115
# make batch of arrays
116116
xr_datasets = [
@@ -157,13 +157,17 @@ def metadata_fake(
157157
# choose random index
158158
index = np.random.choice(len(metadata), size=batch_size)
159159

160-
lat = metadata.iloc[index].centroid_lat
161-
lon = metadata.iloc[index].centroid_lon
160+
lat = list(metadata.iloc[index].centroid_lat)
161+
lon = list(metadata.iloc[index].centroid_lon)
162+
ids = list(metadata.iloc[index].index)
163+
id_types = ["gsp"] * batch_size
162164

163165
else:
164166
# get random OSGB center in the UK
165167
lat = np.random.uniform(51, 55, batch_size)
166168
lon = np.random.uniform(-2.5, 1, batch_size)
169+
ids = [None] * batch_size
170+
id_types = [None] * batch_size
167171

168172
x_centers_osgb, y_centers_osgb = lat_lon_to_osgb(lat=lat, lon=lon)
169173

@@ -172,13 +176,19 @@ def metadata_fake(
172176
batch_size=batch_size, temporally_align_examples=temporally_align_examples
173177
)
174178

175-
metadata_dict = {}
176-
metadata_dict["batch_size"] = batch_size
177-
metadata_dict["x_center_osgb"] = list(x_centers_osgb)
178-
metadata_dict["y_center_osgb"] = list(y_centers_osgb)
179-
metadata_dict["t0_datetime_utc"] = list(t0_datetimes_utc)
179+
# would be good to parallelize this
180+
locations = [
181+
SpaceTimeLocation(
182+
t0_datetime_utc=t0_datetimes_utc[i],
183+
x_center_osgb=x_centers_osgb[i],
184+
y_center_osgb=y_centers_osgb[i],
185+
id=ids[i],
186+
id_type=id_types[i],
187+
)
188+
for i in range(0, batch_size)
189+
]
180190

181-
return Metadata(**metadata_dict)
191+
return Metadata(batch_size=batch_size, space_time_locations=locations)
182192

183193

184194
def nwp_fake(
@@ -201,9 +211,9 @@ def nwp_fake(
201211
t0_datetimes_utc = make_t0_datetimes_utc(batch_size)
202212
x_centers_osgb, y_centers_osgb = make_random_x_and_y_osgb_centers(batch_size)
203213
else:
204-
t0_datetimes_utc = metadata.t0_datetime_utc
205-
x_centers_osgb = metadata.x_center_osgb
206-
y_centers_osgb = metadata.y_center_osgb
214+
t0_datetimes_utc = metadata.t0_datetimes_utc
215+
x_centers_osgb = metadata.x_centers_osgb
216+
y_centers_osgb = metadata.y_centers_osgb
207217

208218
# make batch of arrays
209219
xr_arrays = [
@@ -248,9 +258,9 @@ def pv_fake(
248258
t0_datetimes_utc = make_t0_datetimes_utc(batch_size)
249259
x_centers_osgb, y_centers_osgb = make_random_x_and_y_osgb_centers(batch_size)
250260
else:
251-
t0_datetimes_utc = metadata.t0_datetime_utc
252-
x_centers_osgb = metadata.x_center_osgb
253-
y_centers_osgb = metadata.y_center_osgb
261+
t0_datetimes_utc = metadata.t0_datetimes_utc
262+
x_centers_osgb = metadata.x_centers_osgb
263+
y_centers_osgb = metadata.y_centers_osgb
254264

255265
# make batch of arrays
256266
xr_datasets = [
@@ -296,9 +306,9 @@ def satellite_fake(
296306
t0_datetimes_utc = make_t0_datetimes_utc(batch_size)
297307
x_centers_osgb, y_centers_osgb = make_random_x_and_y_osgb_centers(batch_size)
298308
else:
299-
t0_datetimes_utc = metadata.t0_datetime_utc
300-
x_centers_osgb = metadata.x_center_osgb
301-
y_centers_osgb = metadata.y_center_osgb
309+
t0_datetimes_utc = metadata.t0_datetimes_utc
310+
x_centers_osgb = metadata.x_centers_osgb
311+
y_centers_osgb = metadata.y_centers_osgb
302312

303313
# make batch of arrays
304314
xr_arrays = [
@@ -340,9 +350,9 @@ def hrv_satellite_fake(
340350
t0_datetimes_utc = make_t0_datetimes_utc(batch_size)
341351
x_centers_osgb, y_centers_osgb = make_random_x_and_y_osgb_centers(batch_size)
342352
else:
343-
t0_datetimes_utc = metadata.t0_datetime_utc
344-
x_centers_osgb = metadata.x_center_osgb
345-
y_centers_osgb = metadata.y_center_osgb
353+
t0_datetimes_utc = metadata.t0_datetimes_utc
354+
x_centers_osgb = metadata.x_centers_osgb
355+
y_centers_osgb = metadata.y_centers_osgb
346356

347357
# make batch of arrays
348358
xr_arrays = [
@@ -385,9 +395,9 @@ def optical_flow_fake(
385395
t0_datetimes_utc = make_t0_datetimes_utc(batch_size)
386396
x_centers_osgb, y_centers_osgb = make_random_x_and_y_osgb_centers(batch_size)
387397
else:
388-
t0_datetimes_utc = metadata.t0_datetime_utc
389-
x_centers_osgb = metadata.x_center_osgb
390-
y_centers_osgb = metadata.y_center_osgb
398+
t0_datetimes_utc = metadata.t0_datetimes_utc
399+
x_centers_osgb = metadata.x_centers_osgb
400+
y_centers_osgb = metadata.y_centers_osgb
391401

392402
# make batch of arrays
393403
xr_arrays = [
@@ -421,7 +431,7 @@ def sun_fake(
421431
if metadata is None:
422432
t0_datetimes_utc = make_t0_datetimes_utc(batch_size)
423433
else:
424-
t0_datetimes_utc = metadata.t0_datetime_utc
434+
t0_datetimes_utc = metadata.t0_datetimes_utc
425435

426436
# create dataset with both azimuth and elevation, index with time
427437
# make batch of arrays
@@ -442,8 +452,8 @@ def topographic_fake(batch_size, image_size_pixels, metadata: Optional[Metadata]
442452
if metadata is None:
443453
x_centers_osgb, y_centers_osgb = make_random_x_and_y_osgb_centers(batch_size)
444454
else:
445-
x_centers_osgb = metadata.x_center_osgb
446-
y_centers_osgb = metadata.y_center_osgb
455+
x_centers_osgb = metadata.x_centers_osgb
456+
y_centers_osgb = metadata.y_centers_osgb
447457

448458
# make batch of arrays
449459
xr_arrays = []

0 commit comments

Comments
 (0)