@@ -134,8 +134,9 @@ def rdm_creators_contributors(person_list, peopleroles):
134
134
135
135
def customize_schema_rdm (json_record ):
136
136
# Get vocabularies used in InvenioRDM
137
- vocabularies = get_vocabularies ()
138
137
138
+ vocabularies = get_vocabularies ()
139
+ validate_metadata (json_record )
139
140
peopleroles = vocabularies ["crr" ]
140
141
resourcetypes = vocabularies ["rsrct" ]
141
142
descriptiontypes = vocabularies ["dty" ]
@@ -386,6 +387,169 @@ def customize_schema_rdm(json_record):
386
387
return final
387
388
388
389
390
def _validate_entry_list(
    json_record, field, required_keys, entry_error, errors, required=False
):
    """Check that ``json_record[field]`` is a list of dicts holding *required_keys*.

    Problems are appended to *errors*; nothing is raised or returned.

    Args:
        json_record: The metadata record being validated.
        field: Name of the top-level key to inspect.
        required_keys: Keys every entry of the list must contain.
        entry_error: Message appended once for each malformed entry.
        errors: List that collects the error messages (mutated in place).
        required: When true, the field must be present and the list must be
            non-empty; when false, an absent field is silently accepted.
    """
    if field not in json_record:
        if required:
            errors.append(f"'{field}' field is missing.")
        return

    entries = json_record[field]
    if required:
        if not isinstance(entries, list) or not entries:
            errors.append(f"'{field}' should be a non-empty list.")
            return
    elif not isinstance(entries, list):
        errors.append(f"'{field}' should be a list.")
        return

    for entry in entries:
        if not isinstance(entry, dict) or any(
            key not in entry for key in required_keys
        ):
            errors.append(entry_error)


def validate_metadata(json_record):
    """
    Validates the presence and structure of required fields in a CaltechDATA JSON record.

    Every problem found is collected, so a single exception reports all of
    them at once.

    Args:
        json_record: The metadata record to check, indexed by field name.

    Returns:
        None when the record passes every check.

    Raises:
        ValueError: If any required field is missing or any field is
            structured incorrectly. The message lists all problems,
            separated by ", ".
    """
    errors = []

    # Check for 'types' and 'resourceTypeGeneral'. `types` stays None unless
    # the field is a usable dictionary, so the later 'resourceType' check can
    # be skipped instead of crashing on a missing or malformed 'types'.
    types = None
    if "types" not in json_record:
        errors.append("'types' field is missing.")
    elif not isinstance(json_record["types"], dict):
        errors.append("'types' field should be a dictionary.")
    else:
        types = json_record["types"]
        if "resourceTypeGeneral" not in types:
            errors.append("'resourceTypeGeneral' field is missing in 'types'.")

    # Check for 'titles'
    _validate_entry_list(
        json_record,
        "titles",
        ("title",),
        "Each entry in 'titles' must be a dictionary with a 'title' key.",
        errors,
        required=True,
    )

    # Check for a publication date: either 'publicationYear' or 'dates'
    if "publicationYear" not in json_record and "dates" not in json_record:
        errors.append(
            "A publication date is required ('publicationYear' or 'dates' field is missing)."
        )
    _validate_entry_list(
        json_record,
        "dates",
        ("dateType", "date"),
        "Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys.",
        errors,
    )

    # Check for 'creators'
    _validate_entry_list(
        json_record,
        "creators",
        ("name",),
        "Each creator in 'creators' must be a dictionary with a 'name' key.",
        errors,
        required=True,
    )

    # Check for 'contributors'
    _validate_entry_list(
        json_record,
        "contributors",
        ("name",),
        "Each contributor must be a dictionary with a 'name' key.",
        errors,
    )

    # Check for 'resourceType'. Only meaningful when 'types' is a dictionary;
    # otherwise the problem with 'types' itself has already been recorded.
    if types is not None:
        if "resourceType" not in types:
            errors.append("'resourceType' field is missing in 'types'.")
        elif not isinstance(types["resourceType"], str):
            errors.append("'resourceType' should be a string.")

    # Check for 'identifiers'
    _validate_entry_list(
        json_record,
        "identifiers",
        ("identifier", "identifierType"),
        "Each identifier must be a dictionary with 'identifier' and 'identifierType' keys.",
        errors,
    )

    # Check for 'subjects'
    _validate_entry_list(
        json_record,
        "subjects",
        ("subject",),
        "Each subject must be a dictionary with a 'subject' key.",
        errors,
    )

    # Check for 'relatedIdentifiers'
    _validate_entry_list(
        json_record,
        "relatedIdentifiers",
        ("relatedIdentifier",),
        "Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key.",
        errors,
    )

    # Check for 'rightsList'
    _validate_entry_list(
        json_record,
        "rightsList",
        ("rights",),
        "Each entry in 'rightsList' must be a dictionary with a 'rights' key.",
        errors,
    )

    # Check for 'geoLocations': each entry needs at least one location key
    if "geoLocations" in json_record:
        locations = json_record["geoLocations"]
        if not isinstance(locations, list):
            errors.append("'geoLocations' should be a list.")
        else:
            geo_keys = ("geoLocationPoint", "geoLocationBox", "geoLocationPlace")
            for location in locations:
                if not isinstance(location, dict):
                    errors.append("Each entry in 'geoLocations' must be a dictionary.")
                elif not any(key in location for key in geo_keys):
                    errors.append(
                        "Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'."
                    )

    # Check for 'fundingReferences'
    if "fundingReferences" in json_record:
        funding_references = json_record["fundingReferences"]
        if not isinstance(funding_references, list):
            errors.append("'fundingReferences' should be a list.")
        else:
            for funding in funding_references:
                if not isinstance(funding, dict):
                    errors.append("Each funding reference must be a dictionary.")
                # elif: the key lookup is only valid on a dictionary
                elif "funderName" not in funding:
                    errors.append("Each funding reference must contain 'funderName'.")

    # Report every collected problem in one exception
    if errors:
        raise ValueError(f"Validation errors in metadata: {', '.join(errors)}")
551
+
552
+
389
553
if __name__ == "__main__" :
390
554
# Read in from file for demo purposes
391
555
0 commit comments