Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 15c56c2

Browse files
Merge pull request #562 from openclimatefix/issue/554-both-pv
multiple input data files for pv
2 parents 2d09fbd + 49521d8 commit 15c56c2

File tree

25 files changed

+873
-187
lines changed

25 files changed

+873
-187
lines changed

notebooks/2021-09/2021-09-07/gsp_regions_20181031.geojson

Lines changed: 336 additions & 0 deletions
Large diffs are not rendered by default.

nowcasting_dataset/config/gcp.yaml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,13 @@ input_data:
3333
pv:
3434
forecast_minutes: 60
3535
history_minutes: 30
36-
pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
37-
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv
36+
pv_files_groups:
37+
- label: passiv
38+
pv_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/passiv.netcdf
39+
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/Passive/ocf_formatted/v0/system_metadata.csv
40+
- label: pvoutput
41+
pv_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc
42+
pv_metadata_filename: gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv
3843
get_center: false
3944

4045
#---------------------- Satellite -------------

nowcasting_dataset/config/model.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
DEFAULT_N_GSP_PER_EXAMPLE,
2828
DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
2929
NWP_VARIABLE_NAMES,
30+
PV_PROVIDERS,
3031
SAT_VARIABLE_NAMES,
3132
)
3233
from nowcasting_dataset.dataset.split import split
@@ -175,17 +176,32 @@ def check_start_and_end_datetime(cls, values):
175176
return values
176177

177178

178-
class PV(DataSourceMixin, StartEndDatetimeMixin):
179-
"""PV configuration model"""
179+
class PVFiles(BaseModel):
180+
"""Model to hold pv file and metadata file"""
180181

181182
pv_filename: str = Field(
182183
"gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_timeseries_batch.nc",
183-
description=("The NetCDF file holding the solar PV power timeseries."),
184+
description="The NetCDF files holding the solar PV power timeseries.",
184185
)
185186
pv_metadata_filename: str = Field(
186187
"gs://solar-pv-nowcasting-data/PV/PVOutput.org/UK_PV_metadata.csv",
187-
description="The CSV file describing each PV system.",
188+
description="The CSV files describing each PV system.",
188189
)
190+
191+
label: str = Field("pvoutput", description="Label of where the pv data came from")
192+
193+
@validator("label")
194+
def v_label0(cls, v):
195+
"""Validate 'label'"""
196+
assert v in PV_PROVIDERS
197+
return v
198+
199+
200+
class PV(DataSourceMixin, StartEndDatetimeMixin):
201+
"""PV configuration model"""
202+
203+
pv_files_groups: List[PVFiles] = [PVFiles()]
204+
189205
n_pv_systems_per_example: int = Field(
190206
DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE,
191207
description="The number of PV systems samples per example. "
@@ -201,6 +217,34 @@ class PV(DataSourceMixin, StartEndDatetimeMixin):
201217
"PVDataSource is used to define the geospatial positions of each example.",
202218
)
203219

220+
pv_filename: str = Field(
221+
None,
222+
description="The NetCDF files holding the solar PV power timeseries.",
223+
)
224+
pv_metadata_filename: str = Field(
225+
None,
226+
description="The CSV files describing each PV system.",
227+
)
228+
229+
@classmethod
230+
def model_validation(cls, v):
231+
"""Move old way of storing filenames to new way"""
232+
233+
if (v.pv_filename is not None) and (v.pv_metadata_filename is not None):
234+
logger.warning(
235+
"Loading pv files the old way, and moving them the new way. "
236+
"Please update configuration file"
237+
)
238+
label = "pvoutput" if "pvoutput" in v.pv_filename.lower() else "passiv"
239+
pv_file = PVFiles(
240+
pv_filename=v.pv_filename, pv_metadata_filename=v.pv_metadata_filename, label=label
241+
)
242+
v.pv_files_groups = [pv_file]
243+
v.pv_filename = None
244+
v.pv_metadata_filename = None
245+
246+
return v
247+
204248

205249
class Satellite(DataSourceMixin, TimeResolutionMixin):
206250
"""Satellite configuration model"""

nowcasting_dataset/config/on_premises.yaml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,13 @@ input_data:
3030

3131
#---------------------- PV -------------------
3232
pv:
33-
pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/passiv.netcdf
34-
pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/system_metadata_OCF_ONLY.csv
33+
pv_files_groups:
34+
- label: passiv
35+
pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/passiv.netcdf
36+
pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/Passiv/ocf_formatted/v0/system_metadata_OCF_ONLY.csv
37+
- label: pvoutput
38+
pv_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/PVOutput.org/UK_PV_timeseries_batch.nc
39+
pv_metadata_filename: /mnt/storage_b/data/ocf/solar_pv_nowcasting/nowcasting_dataset_pipeline/PV/PVOutput.org/UK_PV_metadata.csv
3540
get_center: false
3641
history_minutes: 90
3742
log_level: "INFO"

nowcasting_dataset/consts.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,3 +135,5 @@
135135
)
136136

137137
LOG_LEVELS = ("DEBUG", "INFO", "WARNING", "ERROR")
138+
139+
PV_PROVIDERS = ["passiv", "pvoutput"]

nowcasting_dataset/data_sources/datasource_output.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def check_dataset_less_than_or_equal_to(
9999
):
100100
"""Check data is less than a certain value"""
101101
if (data > max_value).any():
102-
message = f"Some {self.__class__.__name__} data values are less than {max_value}"
102+
message = f"Some {self.__class__.__name__} data values are more than {max_value}"
103103
if variable_name is not None:
104104
message += f" ({variable_name})"
105105
logger.error(message)

nowcasting_dataset/data_sources/pv/pv_data_source.py

Lines changed: 75 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import datetime
44
import functools
5-
import io
65
import logging
76
from dataclasses import dataclass
87
from numbers import Number
@@ -16,7 +15,8 @@
1615

1716
import nowcasting_dataset.filesystem.utils as nd_fs_utils
1817
from nowcasting_dataset import geospatial
19-
from nowcasting_dataset.consts import DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE
18+
from nowcasting_dataset.config.model import PVFiles
19+
from nowcasting_dataset.consts import DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE, PV_PROVIDERS
2020
from nowcasting_dataset.data_sources.data_source import ImageDataSource
2121
from nowcasting_dataset.data_sources.metadata.metadata_model import SpaceTimeLocation
2222
from nowcasting_dataset.data_sources.pv.pv_model import PV
@@ -33,8 +33,7 @@ class PVDataSource(ImageDataSource):
3333
defined by image_size_pixels and meters_per_pixel.
3434
"""
3535

36-
filename: Union[str, Path]
37-
metadata_filename: Union[str, Path]
36+
files_groups: List[Union[PVFiles, dict]]
3837
# TODO: Issue #425: Use config to set start_dt and end_dt.
3938
start_datetime: Optional[datetime.datetime] = None
4039
end_datetime: Optional[datetime.datetime] = None
@@ -48,15 +47,20 @@ class PVDataSource(ImageDataSource):
4847

4948
def __post_init__(self, image_size_pixels: int, meters_per_pixel: int):
5049
"""Post Init"""
50+
51+
if type(self.files_groups[0]) == dict:
52+
self.files_groups = [PVFiles(**files) for files in self.files_groups]
53+
5154
super().__post_init__(image_size_pixels, meters_per_pixel)
5255

5356
self.rng = np.random.default_rng()
5457
self.load()
5558

5659
def check_input_paths_exist(self) -> None:
5760
"""Check input paths exist. If not, raise a FileNotFoundError."""
58-
for filename in [self.filename, self.metadata_filename]:
59-
nd_fs_utils.check_path_exists(filename)
61+
for pv_files in self.files_groups:
62+
for filename in [pv_files.pv_filename, pv_files.pv_metadata_filename]:
63+
nd_fs_utils.check_path_exists(filename)
6064

6165
def load(self):
6266
"""
@@ -73,9 +77,23 @@ def get_data_model_for_batch():
7377

7478
def _load_metadata(self):
7579

76-
logger.debug(f"Loading PV metadata from {self.metadata_filename}")
80+
logger.debug(f"Loading PV metadata from {self.files_groups}")
81+
82+
# collect all metadata together
83+
pv_metadata = []
84+
for pv_files in self.files_groups:
85+
metadata_filename = pv_files.pv_metadata_filename
86+
87+
# read metadata file
88+
metadata = pd.read_csv(metadata_filename, index_col="system_id")
89+
90+
# encode index, to make sure the indexes are unique
91+
metadata.index = encode_label(indexes=metadata.index, label=pv_files.label)
92+
93+
pv_metadata.append(metadata)
94+
pv_metadata = pd.concat(pv_metadata)
7795

78-
pv_metadata = pd.read_csv(self.metadata_filename, index_col="system_id")
96+
# drop any systems with no lon or lat
7997
pv_metadata.dropna(subset=["longitude", "latitude"], how="any", inplace=True)
8098

8199
pv_metadata["location_x"], pv_metadata["location_y"] = geospatial.lat_lon_to_osgb(
@@ -99,15 +117,33 @@ def _load_metadata(self):
99117

100118
def _load_pv_power(self):
101119

102-
logger.debug(f"Loading PV Power data from {self.filename}")
120+
logger.debug(f"Loading PV Power data from {self.files_groups}")
103121

104-
pv_power = load_solar_pv_data(
105-
self.filename, start_dt=self.start_datetime, end_dt=self.end_datetime
106-
)
122+
# collect all PV power timeseries together
123+
pv_power_all = []
124+
for pv_files in self.files_groups:
125+
filename = pv_files.pv_filename
126+
127+
# get pv power data
128+
pv_power = load_solar_pv_data(
129+
filename, start_dt=self.start_datetime, end_dt=self.end_datetime
130+
)
131+
132+
# encode index, to make sure the columns are unique
133+
new_columns = encode_label(indexes=pv_power.columns, label=pv_files.label)
134+
pv_power.columns = new_columns
135+
136+
pv_power_all.append(pv_power)
137+
138+
pv_power = pd.concat(pv_power_all, axis="columns")
139+
assert not pv_power.columns.duplicated().any()
107140

108141
# A bit of hand-crafted cleaning
109-
if 30248 in pv_power.columns:
110-
pv_power[30248]["2018-10-29":"2019-01-03"] = np.NaN
142+
bad_pvputput_indexes = [30248]
143+
bad_pvputput_indexes = encode_label(bad_pvputput_indexes, label="pvoutput")
144+
for bad_index in bad_pvputput_indexes:
145+
if bad_index in pv_power.columns:
146+
pv_power[bad_index]["2018-10-29":"2019-01-03"] = np.NaN
111147

112148
# Drop columns and rows with all NaNs.
113149
pv_power.dropna(axis="columns", how="all", inplace=True)
@@ -418,3 +454,28 @@ def drop_pv_systems_which_produce_overnight(pv_power: pd.DataFrame) -> pd.DataFr
418454
bad_systems = pv_power.columns[pv_above_threshold_at_night]
419455
print(len(bad_systems), "bad PV systems found and removed!")
420456
return pv_power.drop(columns=bad_systems)
457+
458+
459+
def encode_label(indexes: List[str], label: str):
460+
"""
461+
Encode the label to a list of indexes.
462+
463+
The new encoding must be integers and unique.
464+
It would be useful if the indexes can be read and deciphered by humans.
465+
This is done by multiplying the original index by 10
466+
and adding the label's index in PV_PROVIDERS (0 for passiv, 1 for pvoutput).
467+
468+
Args:
469+
indexes: list of indexes
470+
label: either 'passiv' or 'pvoutput'
471+
472+
Returns: list of indexes encoded by label
473+
"""
474+
assert label in PV_PROVIDERS
475+
# this encoding does not work if the number of pv providers is 10 or more
476+
assert len(PV_PROVIDERS) < 10
477+
478+
label_index = PV_PROVIDERS.index(label)
479+
new_index = [str(int(col) * 10 + label_index) for col in indexes]
480+
481+
return new_index

nowcasting_dataset/manager/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,20 @@ def initialize_data_sources(
6969
config_for_data_source, pattern_to_remove=f"^{data_source_name}_"
7070
)
7171

72+
# TODO: #631 remove
73+
if data_source_name == "pv":
74+
config_for_data_source.pop("filename")
75+
config_for_data_source.pop("metadata_filename")
76+
7277
data_source_class = MAP_DATA_SOURCE_NAME_TO_CLASS[data_source_name]
7378
try:
7479
data_source = data_source_class(**config_for_data_source)
7580
except Exception:
76-
logger.exception(f"Exception whilst instantiating {data_source_name}!")
81+
logger.exception(
82+
f"Exception whilst instantiating {data_source_name}! "
83+
f"Tried with configuration {config_for_data_source} "
84+
f"in {data_source_class}"
85+
)
7786
raise
7887
self.data_sources[data_source_name] = data_source
7988

requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ gcsfs
99
dask
1010
pvlib
1111
pyproj
12+
pytest
13+
coverage<6.3
14+
pytest-cov
15+
jedi
1216
mypy
1317
pydantic
1418
tqdm

0 commit comments

Comments
 (0)