Skip to content

Commit b4db824

Browse files
committed
Move extracting curve data from oedb turbine library to own function to allow testing independent from oedb data
1 parent d57801e commit b4db824

File tree

2 files changed

+154
-51
lines changed

2 files changed

+154
-51
lines changed

tests/test_data_handling.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
get_turbine_types,
1818
restore_default_turbine_data,
1919
store_turbine_data_from_oedb,
20+
_process_and_save_oedb_data,
2021
)
2122

2223

@@ -99,6 +100,51 @@ def test_store_turbine_data_from_oedb(self, caplog):
99100
assert "No cp-curve but has_cp_curve=True" not in caplog.text
100101
assert "No power curve but has_power_curve=True" not in caplog.text
101102

103+
def test__prepare_and_save_oedb_turbine_curve_data(self, caplog):
104+
"""Test `_process_and_save_oedb_data` function."""
105+
# prepare dummy turbine data
106+
# turbine 0 everything okay, turbine 1 duplicated wind speeds, turbine 2
107+
# power curve values broken
108+
turbine_data = pd.DataFrame(
109+
data={
110+
"id": [0, 1, 2],
111+
"turbine_type": ["turbine 0", "turbine 1", "turbine 2"],
112+
"has_power_curve": [True, True, True],
113+
"has_cp_curve": [True, True, True],
114+
"power_curve_wind_speeds": ["[15, 20, 25]", "[15, 15, 25]", "[15, 20, 25]"],
115+
"power_curve_values": ["[15, 20, 25]", "[15, 20, 25]", "[15, 20, [25]"],
116+
"power_coefficient_curve_wind_speeds": ["[15, 20, 25]", "[15, 20, 25]", "[15, 20, 25]"],
117+
"power_coefficient_curve_values": ["[15, 20, 25]", "[15, 20, 25]", "[15, 20, 25]"],
118+
"thrust_coefficient_curve_wind_speeds": [0, 1, 2],
119+
"thrust_coefficient_curve_values": [0, 1, 2],
120+
"nominal_power": [0, 1, 2],
121+
},
122+
index=[0, 1, 2]
123+
)
124+
125+
# run test with low / default threshold - data is not overwritten
126+
t = {}
127+
for fn in os.listdir(self.orig_path):
128+
t[fn] = os.path.getmtime(os.path.join(self.orig_path, fn))
129+
with caplog.at_level(logging.WARNING):
130+
_process_and_save_oedb_data(turbine_data)
131+
for fn in os.listdir(self.orig_path):
132+
assert t[fn] == os.path.getmtime(os.path.join(self.orig_path, fn))
133+
assert "The turbine library data contains too many faulty " in caplog.text
134+
135+
# run test with high threshold
136+
for fn in os.listdir(self.orig_path):
137+
t[fn] = os.path.getmtime(os.path.join(self.orig_path, fn))
138+
with caplog.at_level(logging.WARNING):
139+
_process_and_save_oedb_data(turbine_data, threshold=0.95)
140+
for fn in os.listdir(self.orig_path):
141+
assert t[fn] < os.path.getmtime(os.path.join(self.orig_path, fn))
142+
assert "The turbine library data contains faulty power_curves" in caplog.text
143+
assert not turbine_data.at[2, "has_power_curve"]
144+
assert not turbine_data.at[1, "has_power_curve"]
145+
assert turbine_data.at[1, "has_cp_curve"]
146+
assert turbine_data.at[0, "has_power_curve"]
147+
102148
def test_wrong_url_load_turbine_data(self):
103149
"""Load turbine data from oedb with a wrong schema."""
104150
with pytest.raises(

windpowerlib/data.py

Lines changed: 108 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -158,21 +158,34 @@ def load_turbine_data_from_oedb(schema="supply", table="wind_turbine_library"):
158158

159159

160160
def store_turbine_data_from_oedb(
161-
schema="supply", table="wind_turbine_library"
161+
schema="supply", table="wind_turbine_library", threshold=0.2
162162
):
163163
r"""
164164
Loads turbine library from the OpenEnergy database (oedb).
165165
166166
Turbine data is saved to csv files ('oedb_power_curves.csv',
167167
'oedb_power_coefficient_curves.csv' and 'oedb_nominal_power') for offline
168168
usage of the windpowerlib. If the files already exist they are overwritten.
169+
In case the turbine library on the oedb contains too many faulty turbines,
170+
the already existing files are not overwritten. The accepted percentage of faulty
171+
turbines can be set through the parameter `threshold`.
169172
170173
Parameters
171174
----------
172175
schema : str
173176
Database schema of the turbine library.
174177
table : str
175178
Table name of the turbine library.
179+
threshold : float
180+
In case there are turbines in the turbine library with faulty data (e.g.
181+
duplicate wind speed entries in the power (coefficient) curve data), the
182+
threshold defines the share of accepted faulty turbine data up to which the
183+
existing turbine data is overwritten by the newly downloaded data.
184+
For example, a threshold of 0.1 means that more than 10% of the
185+
turbines would need to have invalid data in order to discard the downloaded
186+
data. This is to make sure that in the rare case the oedb data is too buggy,
187+
the turbine data that is by default provided with the windpowerlib is not
188+
overwritten by poor data.
176189
177190
Returns
178191
-------
@@ -182,11 +195,40 @@ def store_turbine_data_from_oedb(
182195
183196
"""
184197
turbine_data = fetch_turbine_data_from_oedb(schema=schema, table=table)
185-
# standard file name for saving data
186-
filename = os.path.join(os.path.dirname(__file__), "oedb", "{0}.csv")
198+
turbine_data = _process_and_save_oedb_data(
199+
turbine_data, threshold=threshold
200+
)
201+
check_turbine_data(
202+
filename = os.path.join(os.path.dirname(__file__), "oedb", "{0}.csv")
203+
)
204+
return turbine_data
205+
206+
207+
def _process_and_save_oedb_data(turbine_data, threshold=0.2):
208+
"""
209+
Helper function to extract power (coefficient) curve data from the turbine library.
210+
211+
Parameters
212+
-----------
213+
turbine_data : :pandas:`pandas.DataFrame<frame>`
214+
Raw turbine data downloaded from the oedb with
215+
:func:`fetch_turbine_data_from_oedb`.
216+
threshold : float
217+
See parameter `threshold` in :func:`store_turbine_data_from_oedb`
218+
for more information.
219+
220+
Returns
221+
--------
222+
:pandas:`pandas.DataFrame<frame>`
223+
Turbine data of different turbines such as 'manufacturer',
224+
'turbine_type', 'nominal_power'.
187225
188-
# get all power (coefficient) curves and save them to file
189-
for curve_type in ["power_curve", "power_coefficient_curve"]:
226+
"""
227+
curve_types = ["power_curve", "power_coefficient_curve"]
228+
# get all power (coefficient) curves
229+
curve_dict = {}
230+
broken_turbines_dict = {}
231+
for curve_type in curve_types:
190232
broken_turbine_data = []
191233
curves_df = pd.DataFrame(columns=["wind_speed"])
192234
for index in turbine_data.index:
@@ -222,67 +264,82 @@ def store_turbine_data_from_oedb(
222264
curves_df = pd.merge(
223265
left=curves_df, right=df, how="outer", on="wind_speed"
224266
)
267+
else:
268+
broken_turbine_data.append(
269+
turbine_data.loc[index, "turbine_type"])
225270
except:
226271
broken_turbine_data.append(turbine_data.loc[index, "turbine_type"])
227-
228-
# warning in case of broken turbine data
229-
if len(broken_turbine_data) > 0:
230-
issue_link = ("https://github.com/OpenEnergyPlatform/data-preprocessing"
231-
"/issues/28")
232-
# in case only some data is faulty, only give out warning
233-
if len(broken_turbine_data) < 0.2 * len(turbine_data):
234-
logging.warning(
235-
f"The turbine library data contains faulty {curve_type}s. The "
236-
f"{curve_type} data can therefore not be loaded for the following "
237-
f"turbines: {broken_turbine_data}. "
238-
f"Please report this in the following issue, in case it hasn't "
239-
f"already been reported: {issue_link}"
240-
)
241-
save_turbine_data = True
272+
curve_dict[curve_type] = curves_df
273+
broken_turbines_dict[curve_type] = broken_turbine_data
274+
275+
# check if there are faulty turbines and if so, raise warning
276+
# if there are too many, don't save downloaded data to disk but keep existing data
277+
if any(len(_) > 0 for _ in broken_turbines_dict.values()):
278+
issue_link = ("https://github.com/OpenEnergyPlatform/data-preprocessing"
279+
"/issues/28")
280+
# in case only some data is faulty, only give out warning
281+
if all(len(_) < threshold * len(turbine_data)
282+
for _ in broken_turbines_dict.values()):
283+
save_turbine_data = True
284+
for curve_type in curve_types:
285+
if len(broken_turbines_dict[curve_type]) > 0:
286+
logging.warning(
287+
f"The turbine library data contains faulty {curve_type}s. The "
288+
f"{curve_type} data can therefore not be loaded for the "
289+
f"following turbines: {broken_turbine_data}. "
290+
f"Please report this in the following issue, in case it hasn't "
291+
f"already been reported: {issue_link}"
292+
)
242293
# set has_power_(coefficient)_curve to False for faulty turbines
243-
for turb in broken_turbine_data:
294+
for turb in broken_turbines_dict[curve_type]:
244295
ind = turbine_data[turbine_data.turbine_type == turb].index[0]
245296
col = ("has_power_curve" if curve_type == "power_curve"
246297
else "has_cp_curve")
247298
turbine_data.at[ind, col] = False
248-
# in case most data is faulty, do not store downloaded data
249-
else:
250-
logging.warning(
251-
f"The turbine library data contains too many faulty {curve_type}s,"
252-
f"wherefore {curve_type} data is not loaded from the oedb. "
253-
f"Please report this in the following issue, in case it hasn't "
254-
f"already been reported: {issue_link}"
255-
)
256-
save_turbine_data = False
299+
# in case most data is faulty, do not store downloaded data
257300
else:
258-
save_turbine_data = True
259-
260-
if save_turbine_data:
261-
curves_df = curves_df.set_index("wind_speed").sort_index().transpose()
301+
logging.warning(
302+
f"The turbine library data contains too many faulty turbine datasets "
303+
f"wherefore it is not loaded from the oedb. "
304+
f"In case you want to circumvent this behaviour, you can specify a "
305+
f"higher tolerance through the parameter 'threshold'."
306+
f"Please report this in the following issue, in case it hasn't "
307+
f"already been reported: {issue_link}"
308+
)
309+
save_turbine_data = False
310+
else:
311+
save_turbine_data = True
312+
313+
if save_turbine_data:
314+
# standard file name for saving data
315+
filename = os.path.join(os.path.dirname(__file__), "oedb", "{0}.csv")
316+
# save curve data to csv
317+
for curve_type in curve_types:
318+
curves_df = curve_dict[curve_type].set_index(
319+
"wind_speed").sort_index().transpose()
262320
# power curve values in W
263321
if curve_type == "power_curve":
264322
curves_df *= 1000
265323
curves_df.index.name = "turbine_type"
266324
curves_df.sort_index(inplace=True)
267325
curves_df.to_csv(filename.format("{}s".format(curve_type)))
268326

269-
# get turbine data and save to file (excl. curves)
270-
turbine_data_df = turbine_data.drop(
271-
[
272-
"power_curve_wind_speeds",
273-
"power_curve_values",
274-
"power_coefficient_curve_wind_speeds",
275-
"power_coefficient_curve_values",
276-
"thrust_coefficient_curve_wind_speeds",
277-
"thrust_coefficient_curve_values",
278-
],
279-
axis=1,
280-
).set_index("turbine_type")
281-
# nominal power in W
282-
turbine_data_df["nominal_power"] *= 1000
283-
turbine_data_df.sort_index(inplace=True)
284-
turbine_data_df.to_csv(filename.format("turbine_data"))
285-
check_turbine_data(filename)
327+
# save turbine data to file (excl. curves)
328+
turbine_data_df = turbine_data.drop(
329+
[
330+
"power_curve_wind_speeds",
331+
"power_curve_values",
332+
"power_coefficient_curve_wind_speeds",
333+
"power_coefficient_curve_values",
334+
"thrust_coefficient_curve_wind_speeds",
335+
"thrust_coefficient_curve_values",
336+
],
337+
axis=1,
338+
).set_index("turbine_type")
339+
# nominal power in W
340+
turbine_data_df["nominal_power"] *= 1000
341+
turbine_data_df.sort_index(inplace=True)
342+
turbine_data_df.to_csv(filename.format("turbine_data"))
286343
return turbine_data
287344

288345

0 commit comments

Comments
 (0)