Merged
Changes from all commits
54 commits
f9973bf
removing not necessary function
bruAristimunha Oct 22, 2025
f3dc559
updating
bruAristimunha Oct 24, 2025
58cf377
updating the fetch dataset
bruAristimunha Oct 24, 2025
5d757d4
cloning
bruAristimunha Oct 24, 2025
37ebae0
updating
bruAristimunha Oct 25, 2025
6a72510
iterating
bruAristimunha Oct 25, 2025
705df51
first step, fetching automatically
bruAristimunha Nov 7, 2025
feef992
fetch openneuro
bruAristimunha Nov 7, 2025
5728ac7
including the 1-fetch-openneuro
bruAristimunha Nov 7, 2025
53a85fe
updating the fetch
bruAristimunha Nov 7, 2025
e6b6a93
updating the fetch
bruAristimunha Nov 7, 2025
26b360c
saving the .json file and updating the fetch
bruAristimunha Nov 7, 2025
696a22b
updating the fetch to add
bruAristimunha Nov 7, 2025
f35773e
chore: update OpenNeuro & NEMAR dataset listings
github-actions[bot] Nov 7, 2025
69da4a1
updating the tests
bruAristimunha Nov 7, 2025
75f6b82
Merge branch 'diggestion-v2' of https://github.com/sccn/EEGDash into …
bruAristimunha Nov 7, 2025
7ae29d6
including and updating
bruAristimunha Nov 7, 2025
add6f9e
chore: update OpenNeuro & NEMAR dataset listings and filtered to_dige…
github-actions[bot] Nov 7, 2025
1a46825
updating the scripts
bruAristimunha Nov 11, 2025
a989519
chore: update OpenNeuro & NEMAR dataset listings and filtered to_dige…
github-actions[bot] Nov 11, 2025
a67911e
renaming for the correct entities
bruAristimunha Nov 11, 2025
e56da2c
done with openneuro
bruAristimunha Nov 11, 2025
81b92a8
scidb for later
bruAristimunha Nov 11, 2025
5e984d1
updating the fetch for zenodo
bruAristimunha Nov 11, 2025
2ab4963
figure share
bruAristimunha Nov 11, 2025
4f26ea2
updating the json
bruAristimunha Nov 11, 2025
fa2eb74
updating the clone please
bruAristimunha Nov 11, 2025
851096f
removing .json files
bruAristimunha Nov 11, 2025
44c7e3f
testing diggestion
bruAristimunha Nov 11, 2025
fa0a05b
1, to allow downloading
bruAristimunha Nov 11, 2025
4a40623
using more constant
bruAristimunha Nov 11, 2025
d292a7e
updating the documentation
bruAristimunha Nov 11, 2025
39e7f5c
updating
bruAristimunha Nov 11, 2025
f350a34
test correctness
bruAristimunha Nov 11, 2025
5b1e541
remove the json to move to another place
bruAristimunha Nov 11, 2025
feb4b66
removing the other scripts
bruAristimunha Nov 17, 2025
12d7995
Merge branch 'develop' into diggestion-v2
bruAristimunha Nov 17, 2025
8635ea8
reverting the .json
bruAristimunha Nov 17, 2025
d1f030f
Merge branch 'diggestion-v2' of https://github.com/sccn/EEGDash into …
bruAristimunha Nov 17, 2025
203577f
Update eegdash/dataset/bids_dataset.py
bruAristimunha Nov 17, 2025
60513b2
updating the gitignore
bruAristimunha Nov 17, 2025
733a730
pre-commit
bruAristimunha Nov 17, 2025
4071d02
running inside the CI with new configuration
bruAristimunha Nov 17, 2025
badf18f
chore: trigger CI to test dataset listings workflow
bruAristimunha Nov 17, 2025
3404182
refactor: remove filtering step, fetch only
bruAristimunha Nov 17, 2025
242cf79
fix: use DATASET_LISTINGS_TOKEN for pushing to dataset listings repo
bruAristimunha Nov 17, 2025
a55cbe5
fix(ci): persist credentials for dataset listings checkout to allow p…
bruAristimunha Nov 17, 2025
838998b
ci: trigger fetch run (empty commit)
bruAristimunha Nov 17, 2025
6035f96
updating diggestion
bruAristimunha Nov 17, 2025
405b54d
caching installation to make more fast
bruAristimunha Nov 17, 2025
648cf0d
pull now
bruAristimunha Nov 17, 2025
488993d
beautifulsoup4
bruAristimunha Nov 17, 2025
0ac327c
comment the osf because of bad filtering
bruAristimunha Nov 17, 2025
a95a19c
sort across the json
bruAristimunha Nov 17, 2025
451 changes: 451 additions & 0 deletions .github/workflows/1-fetch-openneuro-datasets-nemar.yml

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion .gitignore
@@ -17,7 +17,6 @@ tests/data/
.vscode/
build/


dist/
docs/build/
docs/source/gen_modules/
@@ -43,6 +42,13 @@ data/
*.isorted
*.py.isorted

# Exclude cloned dataset directories and JSON listings
test_diggestion/
eegdash-datasets/
eegdash-dataset-listings/
digestion_output/
consolidated/

examples/eeg2025/weights*
# Generated dataset visualizations
docs/source/_static/dataset_generated/
268 changes: 113 additions & 155 deletions eegdash/api.py
Expand Up @@ -10,13 +10,15 @@
EEG data from S3 for matched records.
"""

import json
import os
from pathlib import Path
from typing import Any, Mapping

import mne
import numpy as np
import pandas as pd
from mne.utils import _soft_import
from pymongo import InsertOne, UpdateOne

from .bids_eeg_metadata import (
build_query_from_kwargs,
@@ -353,9 +355,18 @@ def _raise_if_conflicting_constraints(
)

def add_bids_dataset(
self, dataset: str, data_dir: str, overwrite: bool = True
) -> None:
"""Scan a local BIDS dataset and upsert records into MongoDB.
self,
dataset: str,
data_dir: str,
overwrite: bool = True,
output_path: str | Path | None = None,
) -> dict[str, Any]:
"""Collect metadata for a local BIDS dataset as JSON-ready records.

Instead of inserting records directly into MongoDB, this method scans
``data_dir`` and returns a JSON-serializable manifest describing every
EEG recording that was discovered. The manifest can be written to disk
or forwarded to the EEGDash ingestion API for persistence.

Parameters
----------
@@ -364,127 +375,91 @@ def add_bids_dataset(
data_dir : str
Path to the local BIDS dataset directory.
overwrite : bool, default True
If ``True``, update existing records when encountered; otherwise,
skip records that already exist.
If ``False``, skip records that already exist in the database based
on ``data_name`` lookups.
output_path : str | Path | None, optional
If provided, the manifest is written to the given JSON file.

Raises
------
ValueError
If called on a public client ``(is_public=True)``.
Returns
-------
dict
A manifest with keys ``dataset``, ``source``, ``records`` and, when
applicable, ``skipped`` or ``errors``.

"""
if self.is_public:
raise ValueError("This operation is not allowed for public users")

if not overwrite and self.exist({"dataset": dataset}):
logger.info("Dataset %s already exists in the database", dataset)
return
source_dir = Path(data_dir).expanduser()
try:
bids_dataset = EEGBIDSDataset(
data_dir=data_dir,
data_dir=str(source_dir),
dataset=dataset,
)
except Exception as e:
logger.error("Error creating bids dataset %s: %s", dataset, str(e))
raise e
requests = []
for bids_file in bids_dataset.get_files():
try:
data_id = f"{dataset}_{Path(bids_file).name}"

if self.exist({"data_name": data_id}):
if overwrite:
eeg_attrs = load_eeg_attrs_from_bids_file(
bids_dataset, bids_file
)
requests.append(self._update_request(eeg_attrs))
else:
eeg_attrs = load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
requests.append(self._add_request(eeg_attrs))
except Exception as e:
logger.error("Error adding record %s", bids_file)
logger.error(str(e))

logger.info("Number of requests: %s", len(requests))

if requests:
result = self.__collection.bulk_write(requests, ordered=False)
logger.info("Inserted: %s ", result.inserted_count)
logger.info("Modified: %s ", result.modified_count)
logger.info("Deleted: %s", result.deleted_count)
logger.info("Upserted: %s", result.upserted_count)
logger.info("Errors: %s ", result.bulk_api_result.get("writeErrors", []))

def _add_request(self, record: dict) -> InsertOne:
"""Create a MongoDB insertion request for a record.

Parameters
----------
record : dict
The record to insert.

Returns
-------
InsertOne
A PyMongo ``InsertOne`` object.

"""
return InsertOne(record)

def add(self, record: dict) -> None:
"""Add a single record to the MongoDB collection.

Parameters
----------
record : dict
The record to add.

"""
try:
self.__collection.insert_one(record)
except ValueError as e:
logger.error("Validation error for record: %s ", record["data_name"])
logger.error(e)
except Exception as exc:
logger.error(
"Error adding record: %s ", record.get("data_name", "<unknown>")
)
logger.debug("Add operation failed", exc_info=exc)
logger.error("Error creating BIDS dataset %s: %s", dataset, exc)
raise exc

def _update_request(self, record: dict) -> UpdateOne:
"""Create a MongoDB update request for a record.
records: list[dict[str, Any]] = []
skipped: list[str] = []
errors: list[dict[str, str]] = []

Parameters
----------
record : dict
The record to update.

Returns
-------
UpdateOne
A PyMongo ``UpdateOne`` object.

"""
return UpdateOne({"data_name": record["data_name"]}, {"$set": record})
for bids_file in bids_dataset.get_files():
data_id = f"{dataset}_{Path(bids_file).name}"
if not overwrite:
try:
if self.exist({"data_name": data_id}):
skipped.append(data_id)
continue
except Exception as exc:
logger.warning(
"Could not verify existing record %s due to: %s",
data_id,
exc,
)

def update(self, record: dict) -> None:
"""Update a single record in the MongoDB collection.
try:
eeg_attrs = load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
records.append(eeg_attrs)
except Exception as exc: # log and continue collecting
logger.error("Error extracting metadata for %s", bids_file)
logger.error(str(exc))
errors.append({"file": str(bids_file), "error": str(exc)})

manifest: dict[str, Any] = {
"dataset": dataset,
"source": str(source_dir.resolve()),
"record_count": len(records),
"records": records,
}
if skipped:
manifest["skipped"] = skipped
if errors:
manifest["errors"] = errors

if output_path is not None:
output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as fh:
json.dump(
manifest,
fh,
indent=2,
sort_keys=True,
default=_json_default,
)
logger.info(
"Wrote EEGDash ingestion manifest for %s to %s",
dataset,
output_path,
)

Parameters
----------
record : dict
Record content to set at the matching ``data_name``.
logger.info(
"Prepared %s records for dataset %s (skipped=%s, errors=%s)",
len(records),
dataset,
len(skipped),
len(errors),
)

"""
try:
self.__collection.update_one(
{"data_name": record["data_name"]}, {"$set": record}
)
except Exception as exc: # log and continue
logger.error(
"Error updating record: %s", record.get("data_name", "<unknown>")
)
logger.debug("Update operation failed", exc_info=exc)
return manifest

def exists(self, query: dict[str, Any]) -> bool:
"""Check if at least one record matches the query.
@@ -504,35 +479,6 @@ def exists(self, query: dict[str, Any]) -> bool:
"""
return self.exist(query)

def remove_field(self, record: dict, field: str) -> None:
"""Remove a field from a specific record in the MongoDB collection.

Parameters
----------
record : dict
Record-identifying object with a ``data_name`` key.
field : str
The name of the field to remove.

"""
self.__collection.update_one(
{"data_name": record["data_name"]}, {"$unset": {field: 1}}
)

def remove_field_from_db(self, field: str) -> None:
"""Remove a field from all records in the database.

.. warning::
This is a destructive operation and cannot be undone.

Parameters
----------
field : str
The name of the field to remove from all documents.

"""
self.__collection.update_many({}, {"$unset": {field: 1}})

@property
def collection(self):
"""The underlying PyMongo ``Collection`` object.
@@ -545,26 +491,38 @@ def collection(self):
"""
return self.__collection

def close(self) -> None:
"""Close the MongoDB connection.

.. deprecated:: 0.1
Connections are now managed globally by :class:`MongoConnectionManager`.
This method is a no-op and will be removed in a future version.
Use :meth:`EEGDash.close_all_connections` to close all clients.
"""
# Individual instances no longer close the shared client
pass

@classmethod
def close_all_connections(cls) -> None:
"""Close all MongoDB client connections managed by the singleton manager."""
MongoConnectionManager.close_all()

def __del__(self) -> None:
"""Destructor; no explicit action needed due to global connection manager."""
# No longer needed since we're using singleton pattern

def _json_default(value: Any) -> Any:
"""Fallback serializer for complex objects when exporting ingestion JSON."""
try:
if isinstance(value, (np.generic,)):
return value.item()
if isinstance(value, np.ndarray):
return value.tolist()
except Exception:
pass

try:
if value is pd.NA:
return None
if isinstance(value, (pd.Timestamp, pd.Timedelta)):
return value.isoformat()
if isinstance(value, pd.Series):
return value.to_dict()
except Exception:
pass

if isinstance(value, Path):
return value.as_posix()
if isinstance(value, set):
return sorted(value)

raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


__all__ = ["EEGDash"]