From d653588401f255c7af511e35be96091132fa47a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Horv=C3=A1t?= Date: Fri, 30 May 2025 17:46:34 +0200 Subject: [PATCH] ci(check-mapping): generalised license checker and added cli params --- .github/workflows/ci.yml | 21 ++++- run-tests.sh | 11 +-- scripts/check_licenses.py | 104 ------------------------ scripts/check_mapping.py | 153 ++++++++++++++++++++++++++++++++++++ scripts/record_mapping.yaml | 25 ++++++ 5 files changed, 202 insertions(+), 112 deletions(-) delete mode 100755 scripts/check_licenses.py create mode 100755 scripts/check_mapping.py create mode 100644 scripts/record_mapping.yaml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4fe370be5..726f1c1a8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,20 +53,35 @@ jobs: pip install click ./run-tests.sh --data-json - data-licenses: + data-mapping: runs-on: ubuntu-24.04 steps: - name: Checkout uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Setup Python uses: actions/setup-python@v5 with: python-version: 3.9 - - name: Check data licenses + - name: Fetch base branch + run: | + git fetch origin ${{ github.event.pull_request.base.ref }} + + - name: Check mapping of changed records + if: github.event_name == 'pull_request' + run: | + CHANGED_FILES=$(git diff --name-only origin/${{ github.event.pull_request.base.ref }}...HEAD data/records/) + echo "Changed files: $CHANGED_FILES" + pip install click pyyaml + ./run-tests.sh --data-mapping ${CHANGED_FILES} + + - name: Check mapping of all records run: | - ./run-tests.sh --data-licenses + pip install click pyyaml + ./run-tests.sh --data-mapping data-recids: runs-on: ubuntu-24.04 diff --git a/run-tests.sh b/run-tests.sh index cc959110f..57e8b6dca 100755 --- a/run-tests.sh +++ b/run-tests.sh @@ -39,8 +39,9 @@ data_json() { find . 
-name "*.json" -exec ./scripts/clean_json_file.py --check {} \+ } -data_licenses() { - scripts/check_licenses.py +data_mapping() { + shift + scripts/check_mapping.py "$@" } data_recids() { @@ -151,7 +152,7 @@ lint_yamllint() { all() { data_dois data_json - data_licenses + data_mapping data_recids data_slugs data_types @@ -173,7 +174,7 @@ help() { echo " --all Perform all checks [default]" echo " --data-dois Check data DOIs" echo " --data-json Check data JSON" - echo " --data-licenses Check data licenses" + echo " --data-mapping Check data mapping" echo " --data-recids Check data record IDs " echo " --data-slugs Check data slugs" echo " --data-types Check data types" @@ -201,7 +202,7 @@ case $arg in --help) help ;; --data-dois) data_dois ;; --data-json) data_json ;; ---data-licenses) data_licenses ;; +--data-mapping) data_mapping "$@" ;; --data-recids) data_recids ;; --data-slugs) data_slugs ;; --data-types) data_types ;; diff --git a/scripts/check_licenses.py b/scripts/check_licenses.py deleted file mode 100755 index 5322b4b47..000000000 --- a/scripts/check_licenses.py +++ /dev/null @@ -1,104 +0,0 @@ -#!/usr/bin/env python - -"""Check if license fields are valid in all records.""" - -import asyncio -import json -import logging -import os -import pathlib -import time - -VALID_LICENSE_IDENTIFIERS = [ - "CC0-1.0", - "GPL-3.0-only", - "MIT", - "Apache-2.0", - "BSD-3-Clause", -] - -logging.basicConfig(level=logging.INFO, format="[%(levelname)s] %(message)s") - - -async def validate_file(path: pathlib.Path) -> int: - """Validate a single file.""" - checks = 0 - errors = 0 - records = await asyncio.get_event_loop().run_in_executor( - None, lambda p: json.loads(open(p, "rb").read()), path - ) - - for record in records: - if rec_licenses := record.get("license"): - try: - attr = rec_licenses["attribution"] - except KeyError: - recid = record.get("recid", "UNSET") - message = f"License field set but without attribution in file {path.name} with recid {recid}!" 
- - logging.error(message) - errors += 1 - continue - - if attr not in VALID_LICENSE_IDENTIFIERS: - recid = record.get("recid", "UNSET") - message = f"Invalid license identifier `{attr}` in file {path.name} for recid {recid}! " - - logging.error(message) - errors += 1 - else: - checks += 1 - - if errors: - raise ValueError(errors) - - logging.info(f"Successfully validated file {path.name}") - return checks - - -async def check_all_paths(): - """Execute checks on all found files.""" - start_time = time.perf_counter() - - loop = asyncio.get_event_loop() - - root_path = pathlib.Path(os.getcwd()) / "data" / "records" - all_paths = list(root_path.glob("*.json")) - - tasks = [loop.create_task(validate_file(file_path)) for file_path in all_paths] - results = await asyncio.gather(*tasks, return_exceptions=True) - - finish_time = time.perf_counter() - start_time - logging.info(f"Processed {len(all_paths)} files within {finish_time:.2f} seconds.") - - if any(isinstance(result, Exception) for result in results): - errors = sum( - [ - int(str(result)) if str(result).isdigit() else 1 - for result in results - if isinstance(result, Exception) - ] - ) - logging.error( - f"Validation completed with {errors} errors!\n" - f"\tPlease ensure the licenses are one of the following: {VALID_LICENSE_IDENTIFIERS}.\n" - f"\tIf you are using a valid SPDX license string that is not in the above list, " - f"please contact `opendata-team@cern.ch`." - ) - exit(1) - - else: - logging.info(f"Successfully validated {sum(results)} records. 
No errors found.") - - -def main(): - """Test to validate all license fields.""" - loop = asyncio.new_event_loop() - try: - loop.run_until_complete(check_all_paths()) - finally: - loop.close() - - -if __name__ == "__main__": - main() diff --git a/scripts/check_mapping.py b/scripts/check_mapping.py new file mode 100755 index 000000000..a07ffbb26 --- /dev/null +++ b/scripts/check_mapping.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python + +"""Check if license fields are valid in all records.""" + +import asyncio +import dataclasses +import itertools +import json +import logging +import os +import time +import typing +from glob import glob +from pathlib import Path + +import click +import yaml + +LOOP = asyncio.get_event_loop() +MAPPING = Path(os.getcwd()) / "scripts" / "record_mapping.yaml" +FILES = Path(os.getcwd()) / "data" / "records" / "*" + + +@dataclasses.dataclass +class InvalidRecord: + """Dataclass to hold information about a validated file.""" + + recid: typing.Optional[str] + path: Path + msg: str + + +@click.command() +@click.option( + "-m", + "--mapping", + default=MAPPING, + type=click.Path(readable=True, path_type=Path, dir_okay=False), + help="Path to check records against.", +) +@click.option( + "-v", "--verbose", default=False, is_flag=True, help="Print verbose output." +) +@click.argument("files", type=click.Path(readable=True, path_type=Path), nargs=-1) +def command(**kwargs): + """Validate a files of supplied paths. 
Arguments support unix-like patterns.""" + try: + LOOP.run_until_complete(main(**kwargs)) + finally: + LOOP.close() + + +async def main(mapping, verbose, files) -> None: + """Validate record fields against a defined mapping.""" + start_time = time.perf_counter() + files = files or (FILES,) + + log_level = logging.DEBUG if verbose else logging.INFO + logging.basicConfig(level=log_level, format="[%(levelname)s] %(message)s") + + logging.info("Loading mapping file...") + mapping: dict = await LOOP.run_in_executor( + None, lambda: yaml.safe_load(open(mapping, "r")) + ) + + globs = [glob(str(f)) for f in files] + paths = [Path(g) for g in itertools.chain(*globs)] + logging.info("Found %d files. Validating...", len(paths)) + + tasks = [LOOP.create_task(validate_single(path, mapping)) for path in paths] + results = await asyncio.gather(*tasks) + + finish = f"within {time.perf_counter() - start_time:.2f} seconds" + logging.info( + "Validated %d files (%d records) %s.", + len(paths), + sum(r[0] for r in results), + finish, + ) + + errors = {p: e for _, e, p in results if len(e)} + if not errors: + logging.info("All files validated successfully. 
No errors found.") + exit(0) + + logging.error( + "Found %d errors in %d files.", + sum(len(e) for e in errors.values()), + len(errors), + ) + + for p, err in errors.items(): + logging.error("File %s has %d errors:", p.name, len(err)) + + for e in err: + logging.error(" - %s: %s", e.recid or "UNSET", e.msg) + + exit(1) + + +async def validate_single( + path: Path, mapping: dict +) -> tuple[int, list[InvalidRecord], Path]: + """Validate a single file against the mapping schema.""" + errors = [] + try: + records = await asyncio.get_event_loop().run_in_executor( + None, lambda p: json.loads(open(p, "rb").read()), path + ) + + except Exception as e: + logging.error("Failed to load json file %s: %s", path.name, e) + records = [] + + def rcheck(doc, validation, stack=None) -> typing.Generator[str, None, None]: + """Recursively checks a record against the validation schema.""" + stack = stack or [] + + if isinstance(validation, dict): + for v_key, v_value in validation.items(): + is_optional = v_key.startswith("?") + v_key = v_key.removeprefix("?") + + if v_key not in doc: + if not is_optional: + yield f"Missing required key [{']['.join(stack)}]->{v_key}" + continue + + sub_docs = v if isinstance((v := doc[v_key]), list) else [v] + for sub_doc in sub_docs: + yield from rcheck(sub_doc, v_value, stack + [v_key]) + + else: + if validation is None: + return + + allowed = validation if isinstance(validation, list) else [validation] + + if not any(doc == pattern for pattern in allowed): + yield f"Value of [{']['.join(stack)}]: `{doc}` does not match any valid patterns: {allowed}" + + for record in records: + if error_list := list(rcheck(record, mapping)): + for e in error_list: + rec = InvalidRecord(recid=record.get("recid"), path=path, msg=e) + errors.append(rec) + + logging.debug("Validated file %s with %d records.", path.name, len(records)) + return len(records), errors, path + + +if __name__ == "__main__": + command() diff --git a/scripts/record_mapping.yaml 
# Validation mapping for data/records/*.json, consumed by scripts/check_mapping.py.
#
# Semantics (as implemented by rcheck in check_mapping.py):
#   - a key prefixed with "?" is optional; a key without "?" is required;
#   - a list value enumerates the allowed values for that field;
#   - a nested mapping is applied recursively to the nested record object
#     (or to each element, when the record field is a list).
?license:
  attribution:
    - Apache-2.0
    - BSD-3-Clause
    - CC0-1.0
    - GPL-3.0-only
    - MIT
?collision_information:
  ?type:
    - e+e-
    - pp
    - pPb
    - PbPb
    - Interfill

experiment:
  - ALICE
  - ATLAS
  - CMS
  - DELPHI
  - JADE
  - LHCb
  - OPERA
  - PHENIX
  - TOTEM