Skip to content
This repository was archived by the owner on Sep 11, 2023. It is now read-only.

Commit 84686a1

Browse files
committed
options to interpolate live pv data
1 parent 769980d commit 84686a1

File tree

5 files changed

+61
-16
lines changed

5 files changed

+61
-16
lines changed

nowcasting_dataset/config/model.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,16 @@ class PV(DataSourceMixin, StartEndDatetimeMixin):
231231
False, description="Option if to use live data from the nowcasting pv database"
232232
)
233233

234+
live_interpolate_minutes: int = Field(
235+
30, description="The number of minutes we allow PV data to interpolate"
236+
)
237+
live_load_extra_minutes: int = Field(
238+
0,
239+
description="The number of extra minutes in the past we should load. Then the recent "
240+
"values can be interpolated, and the extra minutes removed. This is "
241+
"because some live data takes ~1 hour to come in.",
242+
)
243+
234244
@classmethod
235245
def model_validation(cls, v):
236246
"""Move old way of storing filenames to new way"""

nowcasting_dataset/data_sources/pv/live.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,23 +45,40 @@ def get_metadata_from_database() -> pd.DataFrame:
4545
return pv_systems_df
4646

4747

48-
def get_pv_power_from_database(history_duration: timedelta) -> pd.DataFrame:
48+
def get_pv_power_from_database(
49+
history_duration: timedelta, interpolate_minutes: int = 30, load_extra_minutes: int = 60
50+
) -> pd.DataFrame:
4951
"""
5052
Get pv power from database
5153
54+
:param history_duration: a timedelta of how many minutes to load in the past
55+
:param interpolate_minutes: how many minutes we should interpolate the data forward for
56+
:param load_extra_minutes: the extra minutes we should load, in order to load more data.
57+
This is because some data from a site lags significantly behind 'now'
58+
5259
Returns: pandas data frame with the following columns
5360
- pv systems indexes
5461
The index is the datetime
5562
5663
"""
5764

65+
logger.info("Loading PV data from database")
66+
logger.debug(f"{history_duration=} {interpolate_minutes=} {load_extra_minutes=}")
67+
68+
extra_duration = timedelta(minutes=load_extra_minutes)
69+
now = datetime.now(tz=timezone.utc)
70+
start_utc = now - history_duration
71+
start_utc_extra = start_utc - extra_duration
72+
73+
# create empty dataframe with 5 mins periods
74+
empty_df = pd.DataFrame(index=pd.date_range(start=start_utc_extra, end=now, freq="5T"))
75+
5876
# make database connection
5977
url = os.getenv("DB_URL_PV")
6078
db_connection = DatabaseConnection(url=url, base=Base_PV)
6179

6280
with db_connection.get_session() as session:
63-
start_utc = datetime.now(tz=timezone.utc) - history_duration
64-
pv_yields: List[PVYieldSQL] = get_pv_yield(session=session, start_utc=start_utc)
81+
pv_yields: List[PVYieldSQL] = get_pv_yield(session=session, start_utc=start_utc_extra)
6582

6683
pv_yields_df = pd.DataFrame(
6784
[(PVYield.from_orm(pv_yield)).__dict__ for pv_yield in pv_yields]
@@ -72,9 +89,10 @@ def get_pv_power_from_database(history_duration: timedelta) -> pd.DataFrame:
7289
else:
7390
logger.debug(f"Found {len(pv_yields_df)} pv yields")
7491

75-
# get the system id from 'pv_system_id=xxxx provider=.....'
76-
print(pv_yields_df.columns)
77-
print(pv_yields_df["pv_system"])
92+
if len(pv_yields_df) == 0:
93+
return pv_yields_df
94+
95+
# get the system id from 'pv_system_id=xxxx provider=.....'
7896
pv_yields_df["pv_system_id"] = (
7997
pv_yields_df["pv_system"].astype(str).str.split(" ").str[0].str.split("=").str[-1]
8098
)
@@ -90,7 +108,14 @@ def get_pv_power_from_database(history_duration: timedelta) -> pd.DataFrame:
90108

91109
pv_yields_df.columns = encode_label(pv_yields_df.columns, label="pvoutput")
92110

93-
# interpolate in between, maximum 30 mins
94-
pv_yields_df.interpolate(limit=3, limit_area="inside", inplace=True)
111+
# interpolate in between, maximum 'live_interpolate_minutes' mins
112+
# note data is in 5 minutes chunks
113+
pv_yields_df = empty_df.join(pv_yields_df)
114+
limit = int(interpolate_minutes / 5)
115+
if limit > 0:
116+
pv_yields_df.interpolate(limit=limit, inplace=True)
117+
118+
# filter out the extra minutes loaded
119+
pv_yields_df = pv_yields_df[pv_yields_df.index >= start_utc]
95120

96121
return pv_yields_df

nowcasting_dataset/data_sources/pv/pv_data_source.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ class PVDataSource(ImageDataSource):
5050
load_from_gcs: bool = True # option to load data from gcs, or local file
5151
get_center: bool = True
5252
is_live: bool = False
53+
live_interpolate_minutes: int = 30
54+
live_load_extra_minutes: int = 60
5355

5456
def __post_init__(
5557
self, image_size_pixels_height: int, image_size_pixels_width: int, meters_per_pixel: int
@@ -172,7 +174,11 @@ def _load_pv_power(self):
172174
pv_power = pd.concat(pv_power_all, axis="columns")
173175

174176
else:
175-
pv_power = get_pv_power_from_database(history_duration=self.history_duration)
177+
pv_power = get_pv_power_from_database(
178+
history_duration=self.history_duration,
179+
interpolate_minutes=self.live_interpolate_minutes,
180+
load_extra_minutes=self.live_load_extra_minutes,
181+
)
176182
logger.debug(f"Found {len(pv_power)} pv power datetimes from database ")
177183
logger.debug(f"Found {len(pv_power.columns)} pv power pv system ids from database")
178184

tests/data_sources/pv/conftest.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@ def pv_yields_and_systems(db_session_pv):
5252
5353
Pv systems: Two systems
5454
PV yields:
55-
FOr system 1, pv yields from 4 to 10 at 5 minutes
56-
For system 2: 1 pv yield at 16.00
55+
For system 1, pv yields from 4 to 10 at 5 minutes. Last one at 09.55
56+
For system 2: 1 pv yield at 04.00
5757
"""
5858

5959
pv_system_sql_1: PVSystemSQL = PVSystem(

tests/data_sources/pv/test_pv_live.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,16 +33,20 @@ def test_get_pv_power_from_database(pv_yields_and_systems):
3333
)
3434

3535

36-
@freeze_time("2022-01-01 17:00")
36+
@freeze_time("2022-01-01 10:55:00")
3737
def test_get_pv_power_from_database_interpolate(pv_yields_and_systems):
3838
"""Get pv power from database, test out get extra minutes and interpolate"""
39-
pv_power = get_pv_power_from_database(history_duration=timedelta(hours=1))
40-
assert len(pv_power) == 0 # last data point is at 16:00
4139

4240
pv_power = get_pv_power_from_database(
43-
history_duration=timedelta(hours=1), load_extra_minutes=60, interpolate_minutes=60
41+
history_duration=timedelta(hours=0.5), load_extra_minutes=0, interpolate_minutes=0
42+
)
43+
assert len(pv_power) == 0 # last data point is at 09:55
44+
45+
pv_power = get_pv_power_from_database(
46+
history_duration=timedelta(hours=1), load_extra_minutes=60, interpolate_minutes=30
4447
)
45-
assert len(pv_power) == 12 # 1 hours at 5 mins = 12
48+
assert len(pv_power) == 13 # 1 hour at 5 min steps, both endpoints included = 13
49+
assert pv_power.isna().sum().sum() == 6 # the last 30 mins is still nans
4650

4751

4852
@freeze_time("2022-01-01")

0 commit comments

Comments
 (0)