Skip to content

Commit 91c9e6c

Browse files
authored
NOAA NDVI CDR: fetch more recent files from NCEI instead of S3 (#189)
* wip * Fixes * Comments and clean up * Clean up * Add missing init files * revert log to debug level
1 parent 2465ee9 commit 91c9e6c

File tree

5 files changed

+215
-10
lines changed

5 files changed

+215
-10
lines changed

src/reformatters/common/download.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def http_store(base_url: str) -> obstore.store.HTTPStore:
6767
base_url,
6868
client_options={
6969
"connect_timeout": "4 seconds",
70-
"timeout": "16 seconds",
70+
"timeout": "120 seconds",
7171
},
7272
retry_config={
7373
"max_retries": 16,

src/reformatters/contrib/noaa/__init__.py

Whitespace-only changes.

src/reformatters/contrib/noaa/ndvi_cdr/__init__.py

Whitespace-only changes.

src/reformatters/contrib/noaa/ndvi_cdr/analysis/region_job.py

Lines changed: 92 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import re
12
from collections.abc import Callable, Mapping, Sequence
23
from pathlib import Path
34
from typing import cast
@@ -7,10 +8,16 @@
78
import obstore
89
import pandas as pd
910
import rasterio # type: ignore[import-untyped]
11+
import requests
1012
import xarray as xr
1113
import zarr
1214

13-
from reformatters.common.download import download_to_disk, get_local_path, s3_store
15+
from reformatters.common.download import (
16+
download_to_disk,
17+
get_local_path,
18+
http_store,
19+
s3_store,
20+
)
1421
from reformatters.common.iterating import item
1522
from reformatters.common.logging import get_logger
1623
from reformatters.common.region_job import (
@@ -61,14 +68,20 @@ def out_loc(self) -> Mapping[Dim, CoordinateValueOrRange]:
6168
class NoaaNdviCdrAnalysisRegionJob(
6269
RegionJob[NoaaNdviCdrDataVar, NoaaNdviCdrAnalysisSourceFileCoord]
6370
):
64-
download_parallelism: int = 10
71+
# Set lower than would be needed for fetching exclusively from S3
72+
to accommodate the cases where we are downloading from NCEI.
73+
download_parallelism: int = 5
6574

6675
# We observed deadlocks when using more than 2 threads to read data into shared memory.
6776
read_parallelism: int = 1
6877

6978
s3_bucket_url: str = "s3://noaa-cdr-ndvi-pds"
7079
s3_region: str = "us-east-1"
7180

81+
root_nc_url: str = (
82+
"http://ncei.noaa.gov/data/land-normalized-difference-vegetation-index/access"
83+
)
84+
7285
def generate_source_file_coords(
7386
self,
7487
processing_region_ds: xr.Dataset,
@@ -96,10 +109,20 @@ def generate_source_file_coords(
96109
# We want to extract the date part (e.g., 19810728)
97110
try:
98111
_, date_str, _ = filepath.rsplit("_", 2)
112+
99113
# Parse date string to pd.Timestamp
100114
file_time = pd.Timestamp(date_str)
101115
filename = Path(filepath).name
102-
url = f"{self.s3_bucket_url}/data/{year}/{filename}"
116+
117+
# if file_time is within 2 weeks of today, fetch from ncei,
118+
# otherwise fetch from S3
119+
two_weeks_ago = pd.Timestamp.now() - pd.Timedelta(days=14)
120+
is_within_last_2_weeks = two_weeks_ago <= file_time
121+
if is_within_last_2_weeks:
122+
url = f"{self.root_nc_url}/{year}/{filename}"
123+
else:
124+
url = f"{self.s3_bucket_url}/data/{year}/{filename}"
125+
103126
urls_by_time[file_time] = url
104127
except Exception as e:
105128
log.warning(f"Skipping file {filepath} due to error: {e}")
@@ -117,14 +140,20 @@ def generate_source_file_coords(
117140

118141
def download_file(self, coord: NoaaNdviCdrAnalysisSourceFileCoord) -> Path:
119142
"""Download the file for the given coordinate and return the local path."""
120-
store = s3_store(self.s3_bucket_url, self.s3_region, skip_signature=True)
143+
url = coord.get_url()
144+
parsed_url = urlparse(url)
145+
146+
store: obstore.store.HTTPStore | obstore.store.S3Store
147+
if parsed_url.netloc == "ncei.noaa.gov":
148+
store = http_store(f"https://{parsed_url.netloc}")
149+
else:
150+
store = s3_store(self.s3_bucket_url, self.s3_region, skip_signature=True)
121151

122-
s3_url = coord.get_url()
123-
object_key = urlparse(s3_url).path.removeprefix("/")
124-
local_path = get_local_path(self.dataset_id, object_key)
152+
remote_path = urlparse(url).path.removeprefix("/")
153+
local_path = get_local_path(self.dataset_id, remote_path)
125154

126-
download_to_disk(store, object_key, local_path, overwrite_existing=True)
127-
log.debug(f"Downloaded {object_key} to {local_path}")
155+
download_to_disk(store, remote_path, local_path, overwrite_existing=True)
156+
log.debug(f"Downloaded {url} to {local_path}")
128157

129158
return local_path
130159

@@ -197,6 +226,22 @@ def _read_usable_ndvi(
197226
return cast(ArrayFloat32, ndvi_data)
198227

199228
def _list_source_files(self, year: int) -> list[str]:
229+
# We believe NCEI will have more recent files before S3 does.
230+
# While this gap may only be a couple of weeks at most, we cannot enumerate
231+
# files by a coarser granularity than a year. The reason we check if the requested
232+
# year is the current or previous year is to be sure that we continue to check
233+
# NCEI in early January of the current year. I.e., in Jan 2026, we should check
234+
# NCEI for the 2025 files.
235+
#
236+
# We hardcode 2025 as the earliest year to check NCEI, since as of this writing,
237+
# we know S3 is up to date through June 2025. Backfills should go through S3.
238+
current_year = pd.Timestamp.now().year
239+
if year >= 2025 and year in (current_year, current_year - 1):
240+
return self._list_ncei_source_files(year)
241+
else:
242+
return self._list_s3_source_files(year)
243+
244+
def _list_s3_source_files(self, year: int) -> list[str]:
200245
store = s3_store(self.s3_bucket_url, self.s3_region, skip_signature=True)
201246
results = list(obstore.list(store, f"data/{year}", chunk_size=366))
202247
if len(results) == 0:
@@ -208,6 +253,44 @@ def _list_source_files(self, year: int) -> list[str]:
208253

209254
return [result["path"] for result in results[0]]
210255

256+
def _list_ncei_source_files(self, year: int) -> list[str]:
257+
"""List source files from NCEI.
258+
259+
The response text from NCEI is HTML with a table enumerating available files. Example:
260+
261+
<td><a href="VIIRS-Land_v001_JP113C1_NOAA-20_20250101_c20250103153010.nc">VIIRS-Land_v001_JP113C1_NOAA-20_20250101_c20250103153010.nc</a></td>
262+
<td align="right">2025-01-05 15:40</td>
263+
<td align="right">63914048</td>
264+
<td></td>
265+
</tr>
266+
<tr>
267+
<td><a href="VIIRS-Land_v001_JP113C1_NOAA-20_20250102_c20250104153009.nc">VIIRS-Land_v001_JP113C1_NOAA-20_20250102_c20250104153009.nc</a></td>
268+
...
269+
"""
270+
ncei_url = f"{self.root_nc_url}/{year}/"
271+
272+
response = requests.get(ncei_url, timeout=15)
273+
response.raise_for_status()
274+
275+
content = response.text
276+
filenames = re.findall(r"href=\"(VIIRS-Land.+nc)\"", content)
277+
filenames = list(set(filenames))
278+
279+
# Simple check: startswith, endswith, and only one .nc present
280+
def is_valid_viirs_nc(fname: str) -> bool:
281+
return (
282+
fname.startswith("VIIRS-Land")
283+
and fname.endswith(".nc")
284+
and fname.count(".nc") == 1
285+
)
286+
287+
assert all(is_valid_viirs_nc(fname) for fname in filenames), (
288+
"Some filenames do not conform to expected structure: "
289+
+ str([fname for fname in filenames if not is_valid_viirs_nc(fname)])
290+
)
291+
292+
return filenames
293+
211294
@classmethod
212295
def operational_update_jobs(
213296
cls,

tests/contrib/noaa/ndvi_cdr/analysis/region_job_test.py

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from typing import Any
12
from unittest.mock import Mock
23

34
import numpy as np
@@ -315,3 +316,124 @@ def mock_read_netcdf_data(
315316
assert result[2, 0] == 0.2 # land_no_desert+aerosol preserved
316317
assert np.isnan(result[2, 1]) # no aerosol quality masked
317318
assert np.isnan(result[2, 2]) # no aerosol quality masked
319+
320+
321+
def test_generate_source_file_coords_uses_ncei_for_recent_year(
322+
monkeypatch: pytest.MonkeyPatch,
323+
) -> None:
324+
"""Test that NCEI is used for recent source files in generate_source_file_coords."""
325+
326+
# Mock pd.Timestamp.now to return a date within 2 weeks of the test files
327+
monkeypatch.setattr("pandas.Timestamp.now", lambda: pd.Timestamp("2026-01-15"))
328+
monkeypatch.setattr("obstore.list", Mock())
329+
330+
def mock_requests_get(url: str, **kwargs: Any) -> Mock:
331+
mock_response = Mock()
332+
mock_response.raise_for_status = Mock()
333+
if "2025" in url:
334+
mock_response.text = """
335+
<a href="VIIRS-Land_v001_JP113C1_NOAA-20_20251231_c20250102153009.nc">VIIRS-Land_v001_JP113C1_NOAA-20_20251231_c20250102153009.nc</a>
336+
"""
337+
elif "2026" in url:
338+
mock_response.text = """
339+
<a href="VIIRS-Land_v001_JP113C1_NOAA-20_20260101_c20260103153010.nc">VIIRS-Land_v001_JP113C1_NOAA-20_20260101_c20260103153010.nc</a>
340+
<a href="VIIRS-Land_v001_JP113C1_NOAA-20_20260102_c20260104153009.nc">VIIRS-Land_v001_JP113C1_NOAA-20_20260102_c20260104153009.nc</a>
341+
"""
342+
else:
343+
mock_response.text = ""
344+
return mock_response
345+
346+
monkeypatch.setattr("requests.get", mock_requests_get)
347+
348+
template_config = NoaaNdviCdrAnalysisTemplateConfig()
349+
350+
template_ds = xr.Dataset(
351+
coords={
352+
"time": pd.date_range("2025-12-31", "2026-01-02", freq="D"),
353+
"latitude": np.linspace(89.999998472637188, -89.999998472637188, 3600),
354+
"longitude": np.linspace(-180.000006104363450, 179.999993895636550, 7200),
355+
}
356+
)
357+
358+
region_job = NoaaNdviCdrAnalysisRegionJob.model_construct(
359+
final_store=get_zarr_store("prod-path", "test-dataset", "test-version"),
360+
tmp_store=Mock(),
361+
template_ds=template_ds,
362+
data_vars=template_config.data_vars,
363+
append_dim=template_config.append_dim,
364+
region=Mock(spec=slice),
365+
reformat_job_name="test",
366+
)
367+
368+
processing_region_ds = template_ds.isel(latitude=slice(0, 10))
369+
coords = region_job.generate_source_file_coords(
370+
processing_region_ds, template_config.data_vars
371+
)
372+
373+
assert len(coords) == 3
374+
assert (
375+
coords[0].get_url()
376+
== "s3://noaa-cdr-ndvi-pds/data/2025/VIIRS-Land_v001_JP113C1_NOAA-20_20251231_c20250102153009.nc"
377+
)
378+
assert (
379+
coords[1].get_url()
380+
== "http://ncei.noaa.gov/data/land-normalized-difference-vegetation-index/access/2026/VIIRS-Land_v001_JP113C1_NOAA-20_20260101_c20260103153010.nc"
381+
)
382+
assert (
383+
coords[2].get_url()
384+
== "http://ncei.noaa.gov/data/land-normalized-difference-vegetation-index/access/2026/VIIRS-Land_v001_JP113C1_NOAA-20_20260102_c20260104153009.nc"
385+
)
386+
387+
388+
@pytest.mark.parametrize(
389+
"test_year,expected_source,expected_result",
390+
[
391+
(
392+
2026,
393+
"ncei",
394+
["ncei_file.nc"],
395+
), # Current year -> NCEI (# current year mocked to 2026)
396+
(2025, "ncei", ["ncei_file.nc"]), # Previous year -> NCEI
397+
(2024, "s3", ["s3_file.nc"]), # 2+ years ago -> S3
398+
(2020, "s3", ["s3_file.nc"]), # Older year -> S3
399+
],
400+
)
401+
def test_list_source_files_routing_by_year(
402+
monkeypatch: pytest.MonkeyPatch,
403+
test_year: int,
404+
expected_source: str,
405+
expected_result: list[str],
406+
) -> None:
407+
"""Test that _list_source_files routes to NCEI for recent years and S3 for older years."""
408+
# Mock current date to 2026
409+
mock_now = Mock(return_value=pd.Timestamp("2026-06-15"))
410+
monkeypatch.setattr("pandas.Timestamp.now", mock_now)
411+
412+
template_config = NoaaNdviCdrAnalysisTemplateConfig()
413+
414+
region_job = NoaaNdviCdrAnalysisRegionJob.model_construct(
415+
final_store=get_zarr_store("prod-path", "test-dataset", "test-version"),
416+
tmp_store=Mock(),
417+
template_ds=Mock(),
418+
data_vars=template_config.data_vars,
419+
append_dim=template_config.append_dim,
420+
region=Mock(spec=slice),
421+
reformat_job_name="test",
422+
)
423+
424+
# Mock both methods
425+
mock_ncei = Mock(return_value=["ncei_file.nc"])
426+
mock_s3 = Mock(return_value=["s3_file.nc"])
427+
monkeypatch.setattr(region_job, "_list_ncei_source_files", mock_ncei)
428+
monkeypatch.setattr(region_job, "_list_s3_source_files", mock_s3)
429+
430+
result = region_job._list_source_files(test_year)
431+
432+
assert result == expected_result
433+
434+
if expected_source == "ncei":
435+
mock_ncei.assert_called_once_with(test_year)
436+
mock_s3.assert_not_called()
437+
else:
438+
mock_s3.assert_called_once_with(test_year)
439+
mock_ncei.assert_not_called()

0 commit comments

Comments
 (0)