22
33import datetime
44import functools
5- import io
65import logging
76from dataclasses import dataclass
87from numbers import Number
1615
1716import nowcasting_dataset .filesystem .utils as nd_fs_utils
1817from nowcasting_dataset import geospatial
19- from nowcasting_dataset .consts import DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE
18+ from nowcasting_dataset .config .model import PVFiles
19+ from nowcasting_dataset .consts import DEFAULT_N_PV_SYSTEMS_PER_EXAMPLE , PV_PROVIDERS
2020from nowcasting_dataset .data_sources .data_source import ImageDataSource
2121from nowcasting_dataset .data_sources .metadata .metadata_model import SpaceTimeLocation
2222from nowcasting_dataset .data_sources .pv .pv_model import PV
@@ -33,8 +33,7 @@ class PVDataSource(ImageDataSource):
3333 defined by image_size_pixels and meters_per_pixel.
3434 """
3535
36- filename : Union [str , Path ]
37- metadata_filename : Union [str , Path ]
36+ files_groups : List [Union [PVFiles , dict ]]
3837 # TODO: Issue #425: Use config to set start_dt and end_dt.
3938 start_datetime : Optional [datetime .datetime ] = None
4039 end_datetime : Optional [datetime .datetime ] = None
@@ -48,15 +47,20 @@ class PVDataSource(ImageDataSource):
4847
4948 def __post_init__ (self , image_size_pixels : int , meters_per_pixel : int ):
5049 """Post Init"""
50+
51+ if type (self .files_groups [0 ]) == dict :
52+ self .files_groups = [PVFiles (** files ) for files in self .files_groups ]
53+
5154 super ().__post_init__ (image_size_pixels , meters_per_pixel )
5255
5356 self .rng = np .random .default_rng ()
5457 self .load ()
5558
5659 def check_input_paths_exist (self ) -> None :
5760 """Check input paths exist. If not, raise a FileNotFoundError."""
58- for filename in [self .filename , self .metadata_filename ]:
59- nd_fs_utils .check_path_exists (filename )
61+ for pv_files in self .files_groups :
62+ for filename in [pv_files .pv_filename , pv_files .pv_metadata_filename ]:
63+ nd_fs_utils .check_path_exists (filename )
6064
6165 def load (self ):
6266 """
@@ -73,9 +77,23 @@ def get_data_model_for_batch():
7377
7478 def _load_metadata (self ):
7579
76- logger .debug (f"Loading PV metadata from { self .metadata_filename } " )
80+ logger .debug (f"Loading PV metadata from { self .files_groups } " )
81+
82+ # collect all metadata together
83+ pv_metadata = []
84+ for pv_files in self .files_groups :
85+ metadata_filename = pv_files .pv_metadata_filename
86+
87+ # read metadata file
88+ metadata = pd .read_csv (metadata_filename , index_col = "system_id" )
89+
90+ # encode index, to make sure the indexes are unique
91+ metadata .index = encode_label (indexes = metadata .index , label = pv_files .label )
92+
93+ pv_metadata .append (metadata )
94+ pv_metadata = pd .concat (pv_metadata )
7795
78- pv_metadata = pd . read_csv ( self . metadata_filename , index_col = "system_id" )
96+ # drop any systems with no lon or lat
7997 pv_metadata .dropna (subset = ["longitude" , "latitude" ], how = "any" , inplace = True )
8098
8199 pv_metadata ["location_x" ], pv_metadata ["location_y" ] = geospatial .lat_lon_to_osgb (
@@ -99,15 +117,33 @@ def _load_metadata(self):
99117
100118 def _load_pv_power (self ):
101119
102- logger .debug (f"Loading PV Power data from { self .filename } " )
120+ logger .debug (f"Loading PV Power data from { self .files_groups } " )
103121
104- pv_power = load_solar_pv_data (
105- self .filename , start_dt = self .start_datetime , end_dt = self .end_datetime
106- )
122+ # collect all PV power timeseries together
123+ pv_power_all = []
124+ for pv_files in self .files_groups :
125+ filename = pv_files .pv_filename
126+
127+ # get pv power data
128+ pv_power = load_solar_pv_data (
129+ filename , start_dt = self .start_datetime , end_dt = self .end_datetime
130+ )
131+
132+ # encode index, to make sure the columns are unique
133+ new_columns = encode_label (indexes = pv_power .columns , label = pv_files .label )
134+ pv_power .columns = new_columns
135+
136+ pv_power_all .append (pv_power )
137+
138+ pv_power = pd .concat (pv_power_all , axis = "columns" )
139+ assert not pv_power .columns .duplicated ().any ()
107140
108141 # A bit of hand-crafted cleaning
109- if 30248 in pv_power .columns :
110- pv_power [30248 ]["2018-10-29" :"2019-01-03" ] = np .NaN
142+ bad_pvputput_indexes = [30248 ]
143+ bad_pvputput_indexes = encode_label (bad_pvputput_indexes , label = "pvoutput" )
144+ for bad_index in bad_pvputput_indexes :
145+ if bad_index in pv_power .columns :
146+ pv_power [bad_index ]["2018-10-29" :"2019-01-03" ] = np .NaN
111147
112148 # Drop columns and rows with all NaNs.
113149 pv_power .dropna (axis = "columns" , how = "all" , inplace = True )
@@ -418,3 +454,28 @@ def drop_pv_systems_which_produce_overnight(pv_power: pd.DataFrame) -> pd.DataFr
418454 bad_systems = pv_power .columns [pv_above_threshold_at_night ]
419455 print (len (bad_systems ), "bad PV systems found and removed!" )
420456 return pv_power .drop (columns = bad_systems )
457+
458+
def encode_label(indexes: List[str], label: str) -> List[str]:
    """Encode PV-system indexes with their provider label.

    The encoded ids must be unique across providers, remain integer-like,
    and stay human-readable.  This is achieved by multiplying each original
    index by 10 and adding the provider's 0-based position in
    ``PV_PROVIDERS`` as the final digit.

    Args:
        indexes: list of system ids (anything ``int()`` accepts).
        label: provider name; must be a member of ``PV_PROVIDERS``.

    Returns:
        List of encoded ids, as strings.

    Raises:
        ValueError: if ``label`` is not a known provider, or if there are
            ten or more providers (a single decimal digit could no longer
            keep the encoding unique).
    """
    if label not in PV_PROVIDERS:
        raise ValueError(f"Unknown PV provider label: {label!r}")
    # Only one decimal digit is reserved for the provider, so this scheme
    # does NOT work once there are 10 or more providers.
    if len(PV_PROVIDERS) >= 10:
        raise ValueError("encode_label supports fewer than 10 PV providers")

    label_index = PV_PROVIDERS.index(label)
    return [str(int(col) * 10 + label_index) for col in indexes]
0 commit comments