Skip to content

Commit 29a7c7c

Browse files
committed
Fix data completeness logic
1 parent 5b50a85 commit 29a7c7c

File tree

7 files changed

+417
-57
lines changed

7 files changed

+417
-57
lines changed

investing_algorithm_framework/app/app.py

Lines changed: 67 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@
22
import logging
33
import os
44
import threading
5-
from datetime import datetime, timezone
5+
from datetime import datetime, timezone, timedelta
66
from typing import List, Optional, Any, Dict, Tuple
77

8-
import pandas as pd
98
from flask import Flask
109

1110
from investing_algorithm_framework.app.algorithm import Algorithm
@@ -19,7 +18,7 @@
1918
AppMode, BacktestDateRange, DATABASE_DIRECTORY_NAME, DataSource, \
2019
BACKTESTING_INITIAL_AMOUNT, SNAPSHOT_INTERVAL, Backtest, DataError, \
2120
PortfolioConfiguration, SnapshotInterval, DataType, combine_backtests, \
22-
PortfolioProvider, OrderExecutor, ImproperlyConfigured, \
21+
PortfolioProvider, OrderExecutor, ImproperlyConfigured, TimeFrame, \
2322
DataProvider, INDEX_DATETIME, tqdm, BacktestPermutationTest, \
2423
LAST_SNAPSHOT_DATETIME, BACKTESTING_FLAG, generate_backtest_summary_metrics
2524
from investing_algorithm_framework.infrastructure import setup_sqlalchemy, \
@@ -794,8 +793,9 @@ def get_market_credentials(self) -> List[MarketCredential]:
794793
def check_data_completeness(
795794
self,
796795
strategies: List[TradingStrategy],
797-
backtest_date_range: BacktestDateRange
798-
) -> None:
796+
backtest_date_range: BacktestDateRange,
797+
show_progress: bool = True
798+
) -> Tuple[bool, Dict[str, Any]]:
799799
"""
800800
Function to check the data completeness for a set of strategies
801801
over a given backtest date range. This method checks if all data
@@ -807,18 +807,23 @@ def check_data_completeness(
807807
to check data completeness for.
808808
backtest_date_range (BacktestDateRange): The date range to
809809
check data completeness for.
810+
show_progress (bool): Whether to show a progress bar when
811+
checking data completeness.
810812
Returns:
811-
None
813+
Tuple[bool, Dict[str, Any]]: A tuple containing a boolean
814+
indicating if the data is complete and a dictionary
815+
with information about missing data for each data source.
812816
"""
813817
data_sources = []
818+
missing_data_info = {}
814819

815820
for strategy in strategies:
816821
data_sources.extend(strategy.data_sources)
817822

818823
self.initialize_data_sources_backtest(
819824
data_sources,
820825
backtest_date_range,
821-
show_progress=True
826+
show_progress=show_progress
822827
)
823828
data_provider_service = self.container.data_provider_service()
824829

@@ -827,37 +832,60 @@ def check_data_completeness(
827832
for data_source in strategy.data_sources:
828833

829834
if DataType.OHLCV.equals(data_source.data_type):
830-
df = data_provider_service.get_ohlcv_data(
831-
symbol=data_source.symbol,
832-
start_date=backtest_date_range.start_date,
833-
end_date=backtest_date_range.end_date,
834-
pandas=True,
835-
add_pandas_index=False,
836-
add_datetime_column=True,
837-
time_frame=data_source.time_frame
838-
)
839-
df = df.copy()
840-
df['Datetime'] = pd.to_datetime(df['Datetime'])
841-
df = df.sort_values('Datetime')\
842-
.tail(data_source.window_size)
843-
start = df['Datetime'].iloc[0]
844-
end = df['Datetime'].iloc[-1]
845-
freq = pd.to_timedelta(data_source.time_frame.value)
846-
expected = pd.date_range(start, end, freq=freq)
847-
actual = df['Datetime']
848-
missing = expected.difference(actual)
849-
850-
# Calculate the percentage completeness
851-
completeness = len(actual) / len(expected) * 100
852-
853-
if completeness < 100:
835+
required_start_date = backtest_date_range.start_date - \
836+
timedelta(
837+
minutes=TimeFrame.from_value(
838+
data_source.time_frame
839+
).amount_of_minutes * data_source.window_size
840+
)
841+
number_of_required_data_points = \
842+
data_source.get_number_of_required_data_points(
843+
backtest_date_range.start_date,
844+
backtest_date_range.end_date
845+
)
846+
847+
try:
848+
data_provider = data_provider_service.get(data_source)
849+
number_of_available_data_points = \
850+
data_provider.get_number_of_data_points(
851+
backtest_date_range.start_date,
852+
backtest_date_range.end_date
853+
)
854+
855+
missing_dates = \
856+
data_provider.get_missing_data_dates(
857+
required_start_date,
858+
backtest_date_range.end_date
859+
)
860+
if number_of_available_data_points > 0:
861+
missing_data_info[data_source.identifier] = {
862+
"data_source_id": data_source.identifier,
863+
"completeness_percentage": (
864+
(
865+
number_of_available_data_points /
866+
number_of_required_data_points
867+
) * 100
868+
),
869+
"missing_data_points": len(
870+
missing_dates
871+
),
872+
"missing_dates": missing_dates,
873+
"data_source_file_path":
874+
data_provider.get_data_source_file_path()
875+
}
876+
877+
except Exception as e:
854878
raise DataError(
855-
f"Data completeness for data source "
879+
f"Error getting data provider for data source "
856880
f"{data_source.identifier} "
857-
f"({data_source.symbol}) is {completeness:.2f}% "
858-
f"complete. Missing data points: {len(missing)}"
881+
f"({data_source.symbol}): {str(e)}"
859882
)
860883

884+
if len(missing_data_info.keys()) > 0:
885+
return False, missing_data_info
886+
887+
return True, missing_data_info
888+
861889
def run_vector_backtests(
862890
self,
863891
initial_amount,
@@ -1071,13 +1099,10 @@ def run_vector_backtest(
10711099
progress bar when initializing data sources.
10721100
market (str): The market to use for the backtest. This is used
10731101
to create a portfolio configuration if no portfolio
1074-
configuration is found for the strategy. If not provided,
1075-
the first portfolio configuration found will be used.
1102+
configuration is provided in the strategy.
10761103
trading_symbol (str): The trading symbol to use for the backtest.
10771104
This is used to create a portfolio configuration if no
1078-
portfolio configuration is found for the strategy. If not
1079-
provided, the first trading symbol found in the portfolio
1080-
configuration will be used.
1105+
portfolio configuration is provided in the strategy.
10811106
initial_amount (float): The initial amount to start the
10821107
backtest with. This will be the amount of trading currency
10831108
that the portfolio will start with. If not provided,
@@ -1181,7 +1206,7 @@ def run_backtests(
11811206
backtest_date_ranges (List[BacktestDateRange]): List of date ranges
11821207
initial_amount (float): The initial amount to start the
11831208
backtest with. This will be the amount of trading currency
1184-
that the portfolio will start with.
1209+
that the backtest portfolio will start with.
11851210
snapshot_interval (SnapshotInterval): The snapshot interval to use
11861211
for the backtest. This is used to determine how often the
11871212
portfolio snapshot should be taken during the backtest.
@@ -1406,11 +1431,11 @@ def run_permutation_test(
14061431
the risk-free rate from the US Treasury website.
14071432
market (str): The market to use for the backtest. This is used
14081433
to create a portfolio configuration if no portfolio
1409-
configuration is found for the strategy. If not provided,
1434+
configuration is provided in the strategy. If omitted,
14101435
the first portfolio configuration found will be used.
14111436
trading_symbol (str): The trading symbol to use for the backtest.
14121437
This is used to create a portfolio configuration if no
1413-
portfolio configuration is found for the strategy. If not
1438+
portfolio configuration is provided in the strategy. If
14141439
omitted, the first trading symbol found in the portfolio
14151440
configuration will be used.
14161441

investing_algorithm_framework/domain/data_provider.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Any
1+
from typing import List, Any, Union
22
from abc import ABC, abstractmethod
33
from datetime import datetime
44
from investing_algorithm_framework.domain.exceptions import \
@@ -282,3 +282,53 @@ def copy(self, data_source: DataSource) -> "DataProvider":
282282
configuration.
283283
"""
284284
raise NotImplementedError("Subclasses should implement this method.")
285+
286+
@abstractmethod
def get_number_of_data_points(
    self, start_date: datetime, end_date: datetime
) -> int:
    """
    Count the data points this provider has available within a
    date range.

    Concrete providers must override this method; the base
    implementation always raises.

    Args:
        start_date (datetime): Start of the range to count within.
        end_date (datetime): End of the range to count within.

    Returns:
        int: How many data points are available between
            start_date and end_date.

    Raises:
        NotImplementedError: Always, when not overridden.
    """
    raise NotImplementedError("Subclasses should implement this method.")
304+
305+
@abstractmethod
def get_missing_data_dates(
    self, start_date: datetime, end_date: datetime
) -> List[datetime]:
    """
    List the dates within a date range for which this provider
    has no data.

    Concrete providers must override this method; the base
    implementation always raises.

    Args:
        start_date (datetime): Start of the range to inspect.
        end_date (datetime): End of the range to inspect.

    Returns:
        List[datetime]: The dates between start_date and end_date
            for which data is missing.

    Raises:
        NotImplementedError: Always, when not overridden.
    """
    raise NotImplementedError("Subclasses should implement this method.")
324+
325+
@abstractmethod
def get_data_source_file_path(self) -> Union[str, None]:
    """
    Report the file path of the data source, when there is one.

    Concrete providers must override this method; the base
    implementation always raises.

    Returns:
        Union[str, None]: The data source's file path, or None when
            a file path is not applicable.

    Raises:
        NotImplementedError: Always, when not overridden.
    """
    raise NotImplementedError("Subclasses should implement this method.")

investing_algorithm_framework/domain/exceptions.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,18 @@ class DataError(Exception):
9292
during data retrieval or processing
9393
"""
9494

95-
def __init__(self, message) -> None:
95+
def __init__(
    self,
    message,
    data_source_file_path: str = None,
    number_of_missing_data_points: int = None,
    total_number_of_data_points: int = None,
) -> None:
    """
    Build the error from a message plus optional details about
    the affected data.

    Args:
        message: Error description; handed to the base exception
            and kept on the instance as error_message.
        data_source_file_path (str): Stored unchanged on the
            instance; defaults to None.
        number_of_missing_data_points (int): Stored unchanged on
            the instance; defaults to None.
        total_number_of_data_points (int): Stored unchanged on the
            instance; defaults to None.
    """
    super().__init__(message)
    self.total_number_of_data_points = total_number_of_data_points
    self.number_of_missing_data_points = number_of_missing_data_points
    self.data_source_file_path = data_source_file_path
    self.error_message = message
98107

99108
def to_response(self):
100109
return {

investing_algorithm_framework/domain/models/data/data_source.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,3 +179,36 @@ def create_start_date_data(self, index_date: datetime) -> datetime:
179179
(self.window_size * timedelta(
180180
minutes=self.time_frame.amount_of_minutes
181181
))
182+
183+
# The return annotation is quoted so it is never evaluated at runtime
# and needs no extra typing import in this module.
def get_number_of_required_data_points(
    self, start_date: datetime, end_date: datetime
) -> "int | None":
    """
    Returns the number of data points required between two dates,
    based on the time frame and window size of the data source.

    The count is the number of time-frame intervals that fit between
    start_date and end_date, plus window_size extra points when a
    window size is set. The result is truncated to an integer.

    Args:
        start_date (datetime): The start date for the data points.
        end_date (datetime): The end date for the data points.

    Returns:
        int | None: The number of required data points, or None if
            it can't be determined (no time frame is set, or the
            time frame does not span a positive number of minutes).
    """
    if self.time_frame is None:
        return None

    minutes_per_point = self.time_frame.amount_of_minutes

    # A zero or negative interval has no meaningful point count and
    # would otherwise raise ZeroDivisionError or flip the sign of the
    # result, so report it as "cannot be determined".
    if minutes_per_point <= 0:
        return None

    total_minutes = (end_date - start_date).total_seconds() / 60
    data_points = total_minutes / minutes_per_point

    if self.window_size is not None:
        data_points += self.window_size

    return int(data_points)

0 commit comments

Comments
 (0)