coding-kitties
diff --git a/investing_algorithm_framework/app/app.py b/investing_algorithm_framework/app/app.py
Lines changed: 67 additions & 42 deletions
diff --git a/investing_algorithm_framework/domain/data_provider.py b/investing_algorithm_framework/domain/data_provider.py
Lines changed: 51 additions & 1 deletion
diff --git a/investing_algorithm_framework/domain/exceptions.py b/investing_algorithm_framework/domain/exceptions.py
Lines changed: 10 additions & 1 deletion
diff --git a/investing_algorithm_framework/domain/models/data/data_source.py b/investing_algorithm_framework/domain/models/data/data_source.py
Lines changed: 33 additions & 0 deletions
@@ -2,10 +2,9 @@
 import logging
 import os
 import threading
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 from typing import List, Optional, Any, Dict, Tuple
 
-import pandas as pd
 from flask import Flask
 
 from investing_algorithm_framework.app.algorithm import Algorithm
@@ -19,7 +18,7 @@
     AppMode, BacktestDateRange, DATABASE_DIRECTORY_NAME, DataSource, \
     BACKTESTING_INITIAL_AMOUNT, SNAPSHOT_INTERVAL, Backtest, DataError, \
     PortfolioConfiguration, SnapshotInterval, DataType, combine_backtests, \
-    PortfolioProvider, OrderExecutor, ImproperlyConfigured, \
+    PortfolioProvider, OrderExecutor, ImproperlyConfigured, TimeFrame, \
     DataProvider, INDEX_DATETIME, tqdm, BacktestPermutationTest, \
     LAST_SNAPSHOT_DATETIME, BACKTESTING_FLAG, generate_backtest_summary_metrics
 from investing_algorithm_framework.infrastructure import setup_sqlalchemy, \
@@ -794,8 +793,9 @@ def get_market_credentials(self) -> List[MarketCredential]:
     def check_data_completeness(
         self,
         strategies: List[TradingStrategy],
-        backtest_date_range: BacktestDateRange
-    ) -> None:
+        backtest_date_range: BacktestDateRange,
+        show_progress: bool = True
+    ) -> Tuple[bool, Dict[str, Any]]:
         """
         Function to check the data completeness for a set of strategies
         over a given backtest date range. This method checks if all data
@@ -807,18 +807,23 @@ def check_data_completeness(
                 to check data completeness for.
             backtest_date_range (BacktestDateRange): The date range to
                 check data completeness for.
+            show_progress (bool): Whether to show a progress bar when
+                checking data completeness.
         Returns:
-            None
+            Tuple[bool, Dict[str, Any]]: A boolean that is True when no
+                data is missing, and a dictionary keyed by data source
+                identifier describing each data source with missing data.
         """
         data_sources = []
+        missing_data_info = {}
 
         for strategy in strategies:
             data_sources.extend(strategy.data_sources)
 
         self.initialize_data_sources_backtest(
             data_sources,
             backtest_date_range,
-            show_progress=True
+            show_progress=show_progress
         )
         data_provider_service = self.container.data_provider_service()
 
@@ -827,37 +832,60 @@ def check_data_completeness(
             for data_source in strategy.data_sources:
 
                 if DataType.OHLCV.equals(data_source.data_type):
-                    df = data_provider_service.get_ohlcv_data(
-                        symbol=data_source.symbol,
-                        start_date=backtest_date_range.start_date,
-                        end_date=backtest_date_range.end_date,
-                        pandas=True,
-                        add_pandas_index=False,
-                        add_datetime_column=True,
-                        time_frame=data_source.time_frame
-                    )
-                    df = df.copy()
-                    df['Datetime'] = pd.to_datetime(df['Datetime'])
-                    df = df.sort_values('Datetime')\
-                        .tail(data_source.window_size)
-                    start = df['Datetime'].iloc[0]
-                    end = df['Datetime'].iloc[-1]
-                    freq = pd.to_timedelta(data_source.time_frame.value)
-                    expected = pd.date_range(start, end, freq=freq)
-                    actual = df['Datetime']
-                    missing = expected.difference(actual)
-
-                    # Calculate the percentage completeness
-                    completeness = len(actual) / len(expected) * 100
-
-                    if completeness < 100:
+                    window_size = data_source.window_size or 0
+                    required_start_date = backtest_date_range.start_date - \
+                        timedelta(
+                            minutes=TimeFrame.from_value(
+                                data_source.time_frame
+                            ).amount_of_minutes * window_size
+                        )
+                    number_of_required_data_points = \
+                        data_source.get_number_of_required_data_points(
+                            backtest_date_range.start_date,
+                            backtest_date_range.end_date
+                        )
+
+                    try:
+                        data_provider = data_provider_service.get(data_source)
+                        # Count over the span the required total covers.
+                        number_of_available_data_points = \
+                            data_provider.get_number_of_data_points(
+                                required_start_date,
+                                backtest_date_range.end_date
+                            )
+                        missing_dates = \
+                            data_provider.get_missing_data_dates(
+                                required_start_date,
+                                backtest_date_range.end_date
+                            )
+                        file_path = data_provider.get_data_source_file_path()
+                    except Exception as e:
                         raise DataError(
-                            f"Data completeness for data source "
+                            f"Error getting data provider for data source "
                             f"{data_source.identifier} "
-                            f"({data_source.symbol}) is {completeness:.2f}% "
-                            f"complete. Missing data points: {len(missing)}"
+                            f"({data_source.symbol}): {str(e)}"
-                        )
+                        ) from e
 
+                    # Only data sources with actual gaps are reported.
+                    if len(missing_dates) == 0:
+                        continue
+                    # Required count may be None or 0: guard the division.
+                    completeness_percentage = None
+                    if number_of_required_data_points:
+                        completeness_percentage = min(
+                            number_of_available_data_points /
+                            number_of_required_data_points * 100, 100.0
+                        )
+                    missing_data_info[data_source.identifier] = {
+                        "data_source_id": data_source.identifier,
+                        "completeness_percentage": completeness_percentage,
+                        "missing_data_points": len(missing_dates),
+                        "missing_dates": missing_dates,
+                        "data_source_file_path": file_path
+                    }
+
+        return not missing_data_info, missing_data_info
+
     def run_vector_backtests(
         self,
         initial_amount,
@@ -1071,13 +1099,10 @@ def run_vector_backtest(
                 progress bar when initializing data sources.
             market (str): The market to use for the backtest. This is used
                 to create a portfolio configuration if no portfolio
-                configuration is found for the strategy. If not provided,
-                the first portfolio configuration found will be used.
+                configuration is provided in the strategy.
             trading_symbol (str): The trading symbol to use for the backtest.
                 This is used to create a portfolio configuration if no
-                portfolio configuration is found for the strategy. If not
-                provided, the first trading symbol found in the portfolio
-                configuration will be used.
+                portfolio configuration is provided in the strategy.
             initial_amount (float): The initial amount to start the
                 backtest with. This will be the amount of trading currency
                 that the portfolio will start with. If not provided,
@@ -1181,7 +1206,7 @@ def run_backtests(
             backtest_date_ranges (List[BacktestDateRange]): List of date ranges
             initial_amount (float): The initial amount to start the
                 backtest with. This will be the amount of trading currency
-                that the portfolio will start with.
+                that the backtest portfolio will start with.
             snapshot_interval (SnapshotInterval): The snapshot interval to use
                 for the backtest. This is used to determine how often the
                 portfolio snapshot should be taken during the backtest.
@@ -1406,11 +1431,11 @@ def run_permutation_test(
                 the risk-free rate from the US Treasury website.
             market (str): The market to use for the backtest. This is used
                 to create a portfolio configuration if no portfolio
-                configuration is found for the strategy. If not provided,
+                configuration is provided in the strategy. If not provided,
                 the first portfolio configuration found will be used.
             trading_symbol (str): The trading symbol to use for the backtest.
                 This is used to create a portfolio configuration if no
-                portfolio configuration is found for the strategy. If not
+                portfolio configuration is provided in the strategy. If not
                 provided, the first trading symbol found in the portfolio
                 configuration will be used.
 
 
@@ -1,4 +1,4 @@
-from typing import List, Any
+from typing import List, Any, Union
 from abc import ABC, abstractmethod
 from datetime import datetime
 from investing_algorithm_framework.domain.exceptions import \
@@ -282,3 +282,53 @@ def copy(self, data_source: DataSource) -> "DataProvider":
                 configuration.
         """
         raise NotImplementedError("Subclasses should implement this method.")
+
+    @abstractmethod
+    def get_number_of_data_points(
+        self, start_date: datetime, end_date: datetime
+    ) -> int:
+        """
+        Count the data points that are available within the given
+        date range.
+
+        Args:
+            start_date (datetime): The start date for the data points.
+            end_date (datetime): The end date for the data points.
+
+        Returns:
+            int: How many data points are available between
+                start_date and end_date.
+        """
+        message = "Subclasses should implement this method."
+        raise NotImplementedError(message)
+
+    @abstractmethod
+    def get_missing_data_dates(
+        self, start_date: datetime, end_date: datetime
+    ) -> List[datetime]:
+        """
+        Collect the dates inside the given date range for which
+        data is missing.
+
+        Args:
+            start_date (datetime): The start date for checking
+                missing data.
+            end_date (datetime): The end date for checking missing data.
+
+        Returns:
+            List[datetime]: The dates between start_date and end_date
+                for which data is missing.
+        """
+        message = "Subclasses should implement this method."
+        raise NotImplementedError(message)
+
+    @abstractmethod
+    def get_data_source_file_path(self) -> Union[str, None]:
+        """
+        Give the file path of the data source, when there is one.
+
+        Returns:
+            Union[str, None]: The file path, or None if not applicable.
+        """
+        message = "Subclasses should implement this method."
+        raise NotImplementedError(message)
@@ -92,9 +92,18 @@ class DataError(Exception):
     during data retrieval or processing
     """
 
-    def __init__(self, message) -> None:
+    def __init__(
+        self, message, data_source_file_path: str = None,
+        number_of_missing_data_points: int = None,
+        total_number_of_data_points: int = None,
+    ) -> None:
+        """Create the error, optionally with data source details."""
         super(DataError, self).__init__(message)
         self.error_message = message
+        # Optional context; each value stays None when not supplied.
+        self.data_source_file_path = data_source_file_path
+        self.number_of_missing_data_points = number_of_missing_data_points
+        self.total_number_of_data_points = total_number_of_data_points
 
     def to_response(self):
         return {
 
@@ -179,3 +179,36 @@ def create_start_date_data(self, index_date: datetime) -> datetime:
             (self.window_size * timedelta(
                 minutes=self.time_frame.amount_of_minutes
             ))
+
+    def get_number_of_required_data_points(
+        self, start_date: datetime, end_date: datetime
+    ) -> int:
+        """
+        Returns the number of data points required to cover the period
+        between start_date and end_date for this data source.
+
+        The result is the number of whole time frame intervals that fit
+        in the period plus, when set, the window_size. A period where
+        end_date lies before start_date contributes 0 intervals.
+
+        Args:
+            start_date (datetime): The start date for the data points.
+            end_date (datetime): The end date for the data points.
+
+        Returns:
+            int: The number of required data points, or None if the
+            data source has no time frame.
+        """
+
+        if self.time_frame is None:
+            return None
+
+        interval = timedelta(minutes=self.time_frame.amount_of_minutes)
+        # Integer floor division avoids float rounding; the clamp keeps
+        # a reversed range from eating into the window requirement.
+        data_points = max((end_date - start_date) // interval, 0)
+
+        if self.window_size is not None:
+            data_points += self.window_size
+
+        return int(data_points)