Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,20 +53,35 @@ jobs:
pip install click
./run-tests.sh --data-json

data-licenses:
data-mapping:
runs-on: ubuntu-24.04
steps:
- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: 3.9

- name: Check data licenses
- name: Fetch base branch
run: |
git fetch origin ${{ github.event.pull_request.base.ref }}

- name: Check mapping of changed records
if: github.event_name == 'pull_request'
run: |
CHANGED_FILES=$(git diff --name-only origin/${{ github.event.pull_request.base.ref }}...HEAD data/records/)
echo "Changed files: $CHANGED_FILES"
pip install click pyyaml
./run-tests.sh --data-mapping ${CHANGED_FILES}

- name: Check mapping of all records
run: |
./run-tests.sh --data-licenses
pip install click pyyaml
./run-tests.sh --data-mapping

data-recids:
runs-on: ubuntu-24.04
Expand Down
11 changes: 6 additions & 5 deletions run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ data_json() {
find . -name "*.json" -exec ./scripts/clean_json_file.py --check {} \+
}

data_licenses() {
scripts/check_licenses.py
data_mapping() {
shift
scripts/check_mapping.py "$@"
}

data_recids() {
Expand Down Expand Up @@ -151,7 +152,7 @@ lint_yamllint() {
all() {
data_dois
data_json
data_licenses
data_mapping
data_recids
data_slugs
data_types
Expand All @@ -173,7 +174,7 @@ help() {
echo " --all Perform all checks [default]"
echo " --data-dois Check data DOIs"
echo " --data-json Check data JSON"
echo " --data-licenses Check data licenses"
echo " --data-mapping Check data mapping"
echo " --data-recids Check data record IDs "
echo " --data-slugs Check data slugs"
echo " --data-types Check data types"
Expand Down Expand Up @@ -201,7 +202,7 @@ case $arg in
--help) help ;;
--data-dois) data_dois ;;
--data-json) data_json ;;
--data-licenses) data_licenses ;;
--data-mapping) data_mapping "$@" ;;
--data-recids) data_recids ;;
--data-slugs) data_slugs ;;
--data-types) data_types ;;
Expand Down
104 changes: 0 additions & 104 deletions scripts/check_licenses.py

This file was deleted.

153 changes: 153 additions & 0 deletions scripts/check_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#!/usr/bin/env python

"""Check if license fields are valid in all records."""

import asyncio
import dataclasses
import itertools
import json
import logging
import os
import time
import typing
from glob import glob
from pathlib import Path

import click
import yaml

LOOP = asyncio.get_event_loop()
MAPPING = Path(os.getcwd()) / "scripts" / "record_mapping.yaml"
FILES = Path(os.getcwd()) / "data" / "records" / "*"


@dataclasses.dataclass
class InvalidRecord:
"""Dataclass to hold information about a validated file."""

recid: typing.Optional[str]
path: Path
msg: str


@click.command()
@click.option(
"-m",
"--mapping",
default=MAPPING,
type=click.Path(readable=True, path_type=Path, dir_okay=False),
help="Path to check records against.",
)
@click.option(
"-v", "--verbose", default=False, is_flag=True, help="Print verbose output."
)
@click.argument("files", type=click.Path(readable=True, path_type=Path), nargs=-1)
def command(**kwargs):
"""Validate a files of supplied paths. Arguments support unix-like patterns."""
try:
LOOP.run_until_complete(main(**kwargs))
finally:
LOOP.close()


async def main(mapping, verbose, files) -> None:
"""Validate record fields against a defined mapping."""
start_time = time.perf_counter()
files = files or (FILES,)

log_level = logging.DEBUG if verbose else logging.INFO
logging.basicConfig(level=log_level, format="[%(levelname)s] %(message)s")

logging.info("Loading mapping file...")
mapping: dict = await LOOP.run_in_executor(
None, lambda: yaml.safe_load(open(mapping, "r"))
)

globs = [glob(str(f)) for f in files]
paths = [Path(g) for g in itertools.chain(*globs)]
logging.info("Found %d files. Validating...", len(paths))

tasks = [LOOP.create_task(validate_single(path, mapping)) for path in paths]
results = await asyncio.gather(*tasks)

finish = f"within {time.perf_counter() - start_time:.2f} seconds"
logging.info(
"Validated %d files (%d records) %s.",
len(paths),
sum(r[0] for r in results),
finish,
)

errors = {p: e for _, e, p in results if len(e)}
if not errors:
logging.info("All files validated successfully. No errors found.")
exit(0)

logging.error(
"Found %d errors in %d files.",
sum(len(e) for e in errors.values()),
len(errors),
)

for p, err in errors.items():
logging.error("File %s has %d errors:", p.name, len(err))

for e in err:
logging.error(" - %s: %s", e.recid or "UNSET", e.msg)

exit(1)


async def validate_single(
path: Path, mapping: dict
) -> tuple[int, list[InvalidRecord], Path]:
"""Validate a single file against the mapping schema."""
errors = []
try:
records = await asyncio.get_event_loop().run_in_executor(
None, lambda p: json.loads(open(p, "rb").read()), path
)

except Exception as e:
logging.error("Failed to load json file %s: %s", path.name, e)
records = []

def rcheck(doc, validation, stack=None) -> typing.Generator[str, None, None]:
"""Recursively checks a record against the validation schema."""
stack = stack or []

if isinstance(validation, dict):
for v_key, v_value in validation.items():
is_optional = v_key.startswith("?")
v_key = v_key.removeprefix("?")

if v_key not in doc:
if not is_optional:
yield f"Missing required key [{']['.join(stack)}]->{v_key}"
continue

sub_docs = v if isinstance((v := doc[v_key]), list) else [v]
for sub_doc in sub_docs:
yield from rcheck(sub_doc, v_value, stack + [v_key])

else:
if validation is None:
return

allowed = validation if isinstance(validation, list) else [validation]

if not any(doc == pattern for pattern in allowed):
yield f"Value of [{']['.join(stack)}]: `{doc}` does not match any valid patterns: {allowed}"

for record in records:
if error_list := list(rcheck(record, mapping)):
for e in error_list:
rec = InvalidRecord(recid=record.get("recid"), path=path, msg=e)
errors.append(rec)

logging.debug("Validated file %s with %d records.", path.name, len(records))
return len(records), errors, path


if __name__ == "__main__":
command()
25 changes: 25 additions & 0 deletions scripts/record_mapping.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
?license:
attribution:
- Apache-2.0
- BSD-3-Clause
- CC0-1.0
- GPL-3.0-only
- MIT
?collision_information:
?type:
- e+e-
- pp
- pPb
- PbPb
- Interfill

experiment:
- ALICE
- ATLAS
- CMS
- DELPHI
- JADE
- LHCb
- OPERA
- PHENIX
- TOTEM