Skip to content

Commit 9159d93

Browse files
Add multiple file support and initial metadata validation
1 parent 0f5add5 commit 9159d93

File tree

3 files changed

+207
-22
lines changed

3 files changed

+207
-22
lines changed

caltechdata_api/caltechdata_write.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import copy
22
import json
3-
import os, requests
4-
3+
import os
4+
import requests
55
import s3fs
66
from requests import session
77
from json.decoder import JSONDecodeError
@@ -49,8 +49,6 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
4949
infile = open(name, "rb")
5050
else:
5151
infile = open(f_list[name], "rb")
52-
# size = infile.seek(0, 2)
53-
# infile.seek(0, 0) # reset at beginning
5452
result = requests.put(link, headers=f_headers, data=infile)
5553
if result.status_code != 200:
5654
raise Exception(result.text)
@@ -68,7 +66,7 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
6866
def add_file_links(
6967
metadata, file_links, file_descriptions=[], additional_descriptions="", s3_link=None
7068
):
71-
# Currently configured for S3 links, assuming all are at same endpoint
69+
# Currently configured for S3 links, assuming all are at the same endpoint
7270
link_string = ""
7371
endpoint = "https://" + file_links[0].split("/")[2]
7472
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint})

caltechdata_api/cli.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ def decrypt_token(encrypted_token, key):
5959
return f.decrypt(encrypted_token).decode()
6060

6161

62-
# Function to get or set token
62+
# Function to get or set token with support for test system
6363
def get_or_set_token(production=True):
6464
key = load_or_generate_key()
6565

@@ -411,6 +411,7 @@ def main():
411411

412412
def create_record(production):
413413
token = get_or_set_token(production)
414+
# keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes"
414415
print("Using CaltechDATA token:", token)
415416
while True:
416417
choice = get_user_input(
@@ -521,13 +522,10 @@ def print_upload_message(rec_id, production):
521522
else "https://data.caltechlibrary.dev/uploads/"
522523
)
523524
print(
524-
f"""
525-
You can view and publish this record at
526-
525+
f"""You can view and publish this record at
527526
{base_url}{rec_id}
528-
529-
If you need to upload large files to S3, you can type `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`
530-
"""
527+
If you need to upload large files to S3, you can type
528+
`s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`"""
531529
)
532530

533531

@@ -552,7 +550,6 @@ def edit_record(production):
552550
print(f"An error occurred during metadata editing: {e}")
553551
else:
554552
print("No metadata file found.")
555-
556553
choice = get_user_input("Do you want to add files? (y/n): ").lower()
557554
if choice == "y":
558555
if production:
@@ -571,19 +568,32 @@ def edit_record(production):
571568
url = API_URL_TEMPLATE.format(record_id=record_id)
572569
url_draft = API_URL_TEMPLATE_DRAFT.format(record_id=record_id)
573570

574-
response = requests.get(url)
575-
response_draft = requests.get(url_draft)
571+
headers = {
572+
"accept": "application/json",
573+
}
576574

577-
filepath, file_link = upload_supporting_file(record_id)
578-
print(file_link)
575+
if token:
576+
headers["Authorization"] = "Bearer %s" % token
579577

580-
if response.status_code == 404 and response_draft.status_code == 404:
578+
response = requests.get(url, headers=headers)
579+
response_draft = requests.get(url_draft, headers=headers)
580+
data = response.json()
581+
data_draft = response_draft.json()
582+
# Check if 'entries' exists and its length
583+
if (
584+
len(data.get("entries", [])) == 0
585+
and len(data_draft.get("entries", [])) == 0
586+
):
581587
keepfile = False
582588
else:
583589
keepfile = (
584590
input("Do you want to keep existing files? (y/n): ").lower() == "y"
585591
)
586592

593+
filepath, file_link = upload_supporting_file(record_id)
594+
if file_link:
595+
print(file_link)
596+
587597
if filepath != "":
588598
response = caltechdata_edit(
589599
record_id,
@@ -601,7 +611,7 @@ def edit_record(production):
601611
file_links=file_link,
602612
production=production,
603613
publish=False,
604-
keepfile=keepfile,
614+
keepfiles=keepfile,
605615
)
606616

607617
rec_id = response
@@ -620,15 +630,28 @@ def download_file_by_id(record_id, token=None):
620630

621631
try:
622632
response = requests.get(url, headers=headers)
623-
624633
if response.status_code != 200:
625634
# Might have a draft
626635
response = requests.get(
627636
url + "/draft",
628637
headers=headers,
629638
)
630639
if response.status_code != 200:
631-
raise Exception(f"Record {record_id} does not exist, cannot edit")
640+
url = f"https://data.caltechlibrary.dev/api/records/{record_id}"
641+
response = requests.get(
642+
url,
643+
headers=headers,
644+
)
645+
if response.status_code != 200:
646+
# Might have a draft
647+
response = requests.get(
648+
url + "/draft",
649+
headers=headers,
650+
)
651+
if response.status_code != 200:
652+
raise Exception(
653+
f"Record {record_id} does not exist, cannot edit"
654+
)
632655
file_content = response.content
633656
file_name = f"downloaded_data_{record_id}.json"
634657
with open(file_name, "wb") as file:

caltechdata_api/customize_schema.py

Lines changed: 165 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,9 @@ def rdm_creators_contributors(person_list, peopleroles):
134134

135135
def customize_schema_rdm(json_record):
136136
# Get vocabularies used in InvenioRDM
137-
vocabularies = get_vocabularies()
138137

138+
vocabularies = get_vocabularies()
139+
validate_metadata(json_record)
139140
peopleroles = vocabularies["crr"]
140141
resourcetypes = vocabularies["rsrct"]
141142
descriptiontypes = vocabularies["dty"]
@@ -386,6 +387,169 @@ def customize_schema_rdm(json_record):
386387
return final
387388

388389

390+
def validate_metadata(json_record):
    """
    Validate the required fields and structure of a CaltechDATA JSON record.

    All checks are run and every problem found is collected, so a single
    call reports the complete list of problems rather than only the first.

    Args:
        json_record: Dictionary holding the record metadata. The keys
            inspected are 'types', 'titles', 'publicationYear', 'dates',
            'creators', 'contributors', 'identifiers', 'subjects',
            'relatedIdentifiers', 'rightsList', 'geoLocations' and
            'fundingReferences'.

    Raises:
        ValueError: If any required field is missing or any inspected field
            is structured incorrectly. The message lists every problem,
            separated by ", ".
    """
    errors = []

    # Check for 'types' and 'resourceTypeGeneral'
    types_is_dict = False
    if "types" not in json_record:
        errors.append("'types' field is missing.")
    elif not isinstance(json_record["types"], dict):
        errors.append("'types' field should be a dictionary.")
    else:
        types_is_dict = True
        if "resourceTypeGeneral" not in json_record["types"]:
            errors.append("'resourceTypeGeneral' field is missing in 'types'.")

    # Check for 'titles'
    _validate_entry_list(
        json_record,
        "titles",
        ("title",),
        "Each entry in 'titles' must be a dictionary with a 'title' key.",
        errors,
        mandatory=True,
    )

    # Check for a publication date: either 'publicationYear' or 'dates'
    if "publicationYear" not in json_record and "dates" not in json_record:
        errors.append(
            "A publication date is required ('publicationYear' or 'dates' field is missing)."
        )
    _validate_entry_list(
        json_record,
        "dates",
        ("dateType", "date"),
        "Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys.",
        errors,
    )

    # Check for 'creators'
    _validate_entry_list(
        json_record,
        "creators",
        ("name",),
        "Each creator in 'creators' must be a dictionary with a 'name' key.",
        errors,
        mandatory=True,
    )

    # Check for 'contributors'
    _validate_entry_list(
        json_record,
        "contributors",
        ("name",),
        "Each contributor must be a dictionary with a 'name' key.",
        errors,
    )

    # Check for 'resourceType'. Only look inside 'types' when it is known to
    # be a dictionary; a missing or malformed 'types' has already been
    # reported above and indexing it here would raise KeyError/TypeError
    # instead of the ValueError callers expect.
    if types_is_dict:
        types = json_record["types"]
        if "resourceType" not in types:
            errors.append("'resourceType' field is missing in 'types'.")
        elif not isinstance(types["resourceType"], str):
            errors.append("'resourceType' should be a string.")

    # Check for 'identifiers'
    _validate_entry_list(
        json_record,
        "identifiers",
        ("identifier", "identifierType"),
        "Each identifier must be a dictionary with 'identifier' and 'identifierType' keys.",
        errors,
    )

    # Check for 'subjects'
    _validate_entry_list(
        json_record,
        "subjects",
        ("subject",),
        "Each subject must be a dictionary with a 'subject' key.",
        errors,
    )

    # Check for 'relatedIdentifiers'
    _validate_entry_list(
        json_record,
        "relatedIdentifiers",
        ("relatedIdentifier",),
        "Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key.",
        errors,
    )

    # Check for 'rightsList'
    _validate_entry_list(
        json_record,
        "rightsList",
        ("rights",),
        "Each entry in 'rightsList' must be a dictionary with a 'rights' key.",
        errors,
    )

    # Check for 'geoLocations': each entry needs at least one location kind
    if "geoLocations" in json_record:
        if not isinstance(json_record["geoLocations"], list):
            errors.append("'geoLocations' should be a list.")
        else:
            location_keys = ("geoLocationPoint", "geoLocationBox", "geoLocationPlace")
            for location in json_record["geoLocations"]:
                if not isinstance(location, dict):
                    errors.append("Each entry in 'geoLocations' must be a dictionary.")
                elif not any(key in location for key in location_keys):
                    errors.append(
                        "Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'."
                    )

    # Check for 'fundingReferences'
    if "fundingReferences" in json_record:
        if not isinstance(json_record["fundingReferences"], list):
            errors.append("'fundingReferences' should be a list.")
        else:
            for funding in json_record["fundingReferences"]:
                if not isinstance(funding, dict):
                    errors.append("Each funding reference must be a dictionary.")
                # elif: a non-dict entry must not be probed for keys, which
                # would raise TypeError or add a misleading second error.
                elif "funderName" not in funding:
                    errors.append("Each funding reference must contain 'funderName'.")

    # Report every collected problem in a single exception
    if errors:
        raise ValueError(f"Validation errors in metadata: {', '.join(errors)}")


def _validate_entry_list(
    json_record, field, required_keys, entry_error, errors, mandatory=False
):
    """
    Check that ``json_record[field]`` is a list of dictionaries with given keys.

    Problems are appended to ``errors``; nothing is raised or returned.

    Args:
        json_record: The record being validated.
        field: Name of the top-level key to inspect.
        required_keys: Keys every entry of the list must contain.
        entry_error: Message appended once per entry that is not a
            dictionary or lacks one of ``required_keys``.
        errors: List that error messages are appended to.
        mandatory: When True the field must be present and the list must be
            non-empty; when False an absent field is accepted silently.
    """
    if field not in json_record:
        if mandatory:
            errors.append(f"'{field}' field is missing.")
        return

    entries = json_record[field]
    if mandatory:
        if not isinstance(entries, list) or len(entries) == 0:
            errors.append(f"'{field}' should be a non-empty list.")
            return
    elif not isinstance(entries, list):
        errors.append(f"'{field}' should be a list.")
        return

    for entry in entries:
        if not isinstance(entry, dict) or any(
            key not in entry for key in required_keys
        ):
            errors.append(entry_error)
552+
389553
if __name__ == "__main__":
390554
# Read in from file for demo purposes
391555

0 commit comments

Comments
 (0)