Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 3947a24

Browse files
Merge pull request #667 from openclimatefix/issue/load-more-pv-data
pv live interpolation options
2 parents f3790e5 + 447558e commit 3947a24

File tree

5 files changed

+81
-18
lines changed

5 files changed

+81
-18
lines changed

nowcasting_dataset/config/model.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,16 @@ class PV(DataSourceMixin, StartEndDatetimeMixin):
231231
False, description="Option if to use live data from the nowcasting pv database"
232232
)
233233

234+
live_interpolate_minutes: int = Field(
235+
30, description="The number of minutes we allow PV data to interpolate"
236+
)
237+
live_load_extra_minutes: int = Field(
238+
0,
239+
description="The number of extra minutes in the past we should load. Then the recent "
240+
"values can be interpolated, and the extra minutes removed. This is "
241+
"because some live data takes ~1 hour to come in.",
242+
)
243+
234244
@classmethod
235245
def model_validation(cls, v):
236246
"""Move old way of storing filenames to new way"""

nowcasting_dataset/data_sources/pv/live.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,23 +45,42 @@ def get_metadata_from_database() -> pd.DataFrame:
4545
return pv_systems_df
4646

4747

48-
def get_pv_power_from_database(history_duration: timedelta) -> pd.DataFrame:
48+
def get_pv_power_from_database(
49+
history_duration: timedelta, interpolate_minutes: int, load_extra_minutes: int
50+
) -> pd.DataFrame:
4951
"""
5052
Get pv power from database
5153
52-
Returns: pandas data frame with the following columns
53-
- pv systems indexes
54+
Args:
55+
history_duration: a timedelta of how many minutes to load in the past
56+
interpolate_minutes: how many minutes we should interpolate the data forward for
57+
load_extra_minutes: the extra minutes we should load, in order to load more data.
58+
This is because some data from a site lags significantly behind 'now'
59+
60+
Returns: pandas data frame whose columns are the pv system indexes.
5461
The index is the datetime
5562
5663
"""
5764

65+
logger.info("Loading PV data from database")
66+
logger.debug(f"{history_duration=} {interpolate_minutes=} {load_extra_minutes=}")
67+
68+
extra_duration = timedelta(minutes=load_extra_minutes)
69+
now = datetime.now(tz=timezone.utc)
70+
start_utc = now - history_duration
71+
start_utc_extra = start_utc - extra_duration
72+
73+
# create empty dataframe with 5 mins periods
74+
empty_df = pd.DataFrame(index=pd.date_range(start=start_utc_extra, end=now, freq="5T"))
75+
5876
# make database connection
5977
url = os.getenv("DB_URL_PV")
6078
db_connection = DatabaseConnection(url=url, base=Base_PV)
6179

6280
with db_connection.get_session() as session:
63-
start_utc = datetime.now(tz=timezone.utc) - history_duration
64-
pv_yields: List[PVYieldSQL] = get_pv_yield(session=session, start_utc=start_utc)
81+
pv_yields: List[PVYieldSQL] = get_pv_yield(session=session, start_utc=start_utc_extra)
82+
83+
logger.debug(f"Found {len(pv_yields)} PV yields from the database")
6584

6685
pv_yields_df = pd.DataFrame(
6786
[(PVYield.from_orm(pv_yield)).__dict__ for pv_yield in pv_yields]
@@ -72,9 +91,10 @@ def get_pv_power_from_database(history_duration: timedelta) -> pd.DataFrame:
7291
else:
7392
logger.debug(f"Found {len(pv_yields_df)} pv yields")
7493

75-
# get the system id from 'pv_system_id=xxxx provider=.....'
76-
print(pv_yields_df.columns)
77-
print(pv_yields_df["pv_system"])
94+
if len(pv_yields_df) == 0:
95+
return pv_yields_df
96+
97+
# get the system id from 'pv_system_id=xxxx provider=.....'
7898
pv_yields_df["pv_system_id"] = (
7999
pv_yields_df["pv_system"].astype(str).str.split(" ").str[0].str.split("=").str[-1]
80100
)
@@ -90,7 +110,16 @@ def get_pv_power_from_database(history_duration: timedelta) -> pd.DataFrame:
90110

91111
pv_yields_df.columns = encode_label(pv_yields_df.columns, label="pvoutput")
92112

93-
# interpolate in between, maximum 30 mins
94-
pv_yields_df.interpolate(limit=3, limit_area="inside", inplace=True)
113+
# interpolate in between, maximum 'interpolate_minutes' mins
114+
# note data is in 5 minutes chunks
115+
pv_yields_df = empty_df.join(pv_yields_df)
116+
limit = int(interpolate_minutes / 5)
117+
if limit > 0:
118+
pv_yields_df.interpolate(limit=limit, inplace=True)
119+
120+
# filter out the extra minutes loaded
121+
logger.debug(f"{len(pv_yields_df)} of datetimes before filter on {start_utc}")
122+
pv_yields_df = pv_yields_df[pv_yields_df.index >= start_utc]
123+
logger.debug(f"{len(pv_yields_df)} of datetimes after filter on {start_utc}")
95124

96125
return pv_yields_df

nowcasting_dataset/data_sources/pv/pv_data_source.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ class PVDataSource(ImageDataSource):
5050
load_from_gcs: bool = True # option to load data from gcs, or local file
5151
get_center: bool = True
5252
is_live: bool = False
53+
live_interpolate_minutes: int = 30
54+
live_load_extra_minutes: int = 60
5355

5456
def __post_init__(
5557
self, image_size_pixels_height: int, image_size_pixels_width: int, meters_per_pixel: int
@@ -172,7 +174,11 @@ def _load_pv_power(self):
172174
pv_power = pd.concat(pv_power_all, axis="columns")
173175

174176
else:
175-
pv_power = get_pv_power_from_database(history_duration=self.history_duration)
177+
pv_power = get_pv_power_from_database(
178+
history_duration=self.history_duration,
179+
interpolate_minutes=self.live_interpolate_minutes,
180+
load_extra_minutes=self.live_load_extra_minutes,
181+
)
176182
logger.debug(f"Found {len(pv_power)} pv power datetimes from database ")
177183
logger.debug(f"Found {len(pv_power.columns)} pv power pv system ids from database")
178184

tests/data_sources/pv/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ def pv_yields_and_systems(db_session_pv):
5252
5353
Pv systems: Two systems
5454
PV yields:
55-
FOr system 1, pv yields from 4 to 10 at 5 minutes
56-
For system 2: 1 pv yield at 16.00
55+
For system 1, pv yields from 4 to 10 at 5 minutes. Last one at 09.55
56+
For system 2: 1 pv yield at 04.00
5757
"""
5858

5959
pv_system_sql_1: PVSystemSQL = PVSystem(

tests/data_sources/pv/test_pv_live.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@ def test_get_metadata_from_database(pv_yields_and_systems):
1919
assert len(meteadata) == 2
2020

2121

22-
@freeze_time("2022-01-01")
22+
@freeze_time("2022-01-01 05:00")
2323
def test_get_pv_power_from_database(pv_yields_and_systems):
2424
"""Get pv power from database"""
25-
pv_power = get_pv_power_from_database(history_duration=timedelta(hours=1))
25+
pv_power = get_pv_power_from_database(
26+
history_duration=timedelta(hours=1), load_extra_minutes=30, interpolate_minutes=30
27+
)
2628

27-
assert len(pv_power) == 72 # 6 hours at 5 mins = 6*12
29+
assert len(pv_power) == 13 # 1 hour at 5 mins, inclusive of both ends = 12 + 1
2830
assert len(pv_power.columns) == 2
2931
assert pv_power.columns[0] == "11"
3032
assert (
@@ -33,7 +35,23 @@ def test_get_pv_power_from_database(pv_yields_and_systems):
3335
)
3436

3537

36-
@freeze_time("2022-01-01")
38+
@freeze_time("2022-01-01 10:55:00")
39+
def test_get_pv_power_from_database_interpolate(pv_yields_and_systems):
40+
"""Get pv power from database, test out get extra minutes and interpolate"""
41+
42+
pv_power = get_pv_power_from_database(
43+
history_duration=timedelta(hours=0.5), load_extra_minutes=0, interpolate_minutes=0
44+
)
45+
assert len(pv_power) == 0 # last data point is at 09:55
46+
47+
pv_power = get_pv_power_from_database(
48+
history_duration=timedelta(hours=1), load_extra_minutes=60, interpolate_minutes=30
49+
)
50+
assert len(pv_power) == 13 # 1 hour at 5 mins, inclusive of both ends = 12 + 1
51+
assert pv_power.isna().sum().sum() == 6 # the last 30 mins is still nans
52+
53+
54+
@freeze_time("2022-01-01 05:00")
3755
def test_get_example_and_batch(pv_yields_and_systems):
3856
"""Test PVDataSource with data source from database"""
3957

@@ -61,6 +79,6 @@ def test_get_example_and_batch(pv_yields_and_systems):
6179
assert len(pv_data_source.pv_metadata) > 0
6280

6381
locations = pv_data_source.get_locations(pv_data_source.pv_power.index)
64-
assert len(locations) == 72 # 6 hours at 5 mins
82+
assert len(locations) == 7 # 30 minutes at 5 mins, inclusive
6583

6684
_ = pv_data_source.get_example(location=locations[0])

0 commit comments

Comments
 (0)