Add multiple file support and initial metadata validation

RohanBhattaraiNP · web-flow · commit 9159d936c42e · 2024-09-26T11:07:56.000-07:00
diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py
@@ -1,7 +1,7 @@
 import copy
 import json
-import os, requests
-
+import os
+import requests
 import s3fs
 from requests import session
 from json.decoder import JSONDecodeError
@@ -49,8 +49,6 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
                 infile = open(name, "rb")
             else:
                 infile = open(f_list[name], "rb")
-            # size = infile.seek(0, 2)
-            # infile.seek(0, 0)  # reset at beginning
             result = requests.put(link, headers=f_headers, data=infile)
             if result.status_code != 200:
                 raise Exception(result.text)
@@ -68,7 +66,7 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal
 def add_file_links(
     metadata, file_links, file_descriptions=[], additional_descriptions="", s3_link=None
 ):
-    # Currently configured for S3 links, assuming all are at same endpoint
+    # Currently configured for S3 links, assuming all are at the same endpoint
     link_string = ""
     endpoint = "https://" + file_links[0].split("/")[2]
     s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint})
diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py
@@ -59,7 +59,7 @@ def decrypt_token(encrypted_token, key):
     return f.decrypt(encrypted_token).decode()
 
 
-# Function to get or set token
+# Function to get or set token with support for test system
 def get_or_set_token(production=True):
     key = load_or_generate_key()
 
@@ -411,6 +411,7 @@ def main():
 
 def create_record(production):
     token = get_or_set_token(production)
+    # keep_file = input("Do you want to keep your existing files? (yes/no): ").lower() == "yes"
     print("Using CaltechDATA token:", token)
     while True:
         choice = get_user_input(
@@ -521,13 +522,10 @@ def print_upload_message(rec_id, production):
         else "https://data.caltechlibrary.dev/uploads/"
     )
     print(
-        f"""
-        You can view and publish this record at
-        
+        f"""You can view and publish this record at
         {base_url}{rec_id}
-        
-        If you need to upload large files to S3, you can type `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`
-        """
+        If you need to upload large files to S3, you can type
+        `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`"""
     )
 
 
@@ -552,7 +550,6 @@ def edit_record(production):
             print(f"An error occurred during metadata editing: {e}")
     else:
         print("No metadata file found.")
-
     choice = get_user_input("Do you want to add files? (y/n): ").lower()
     if choice == "y":
         if production:
@@ -571,19 +568,32 @@ def edit_record(production):
         url = API_URL_TEMPLATE.format(record_id=record_id)
         url_draft = API_URL_TEMPLATE_DRAFT.format(record_id=record_id)
 
-        response = requests.get(url)
-        response_draft = requests.get(url_draft)
+        headers = {
+            "accept": "application/json",
+        }
 
-        filepath, file_link = upload_supporting_file(record_id)
-        print(file_link)
+        if token:
+            headers["Authorization"] = "Bearer %s" % token
 
-        if response.status_code == 404 and response_draft.status_code == 404:
+        response = requests.get(url, headers=headers)
+        response_draft = requests.get(url_draft, headers=headers)
+        data = response.json()
+        data_draft = response_draft.json()
+        # Check if 'entries' exists and its length
+        if (
+            len(data.get("entries", [])) == 0
+            and len(data_draft.get("entries", [])) == 0
+        ):
             keepfile = False
         else:
             keepfile = (
                 input("Do you want to keep existing files? (y/n): ").lower() == "y"
             )
 
+        filepath, file_link = upload_supporting_file(record_id)
+        if file_link:
+            print(file_link)
+
         if filepath != "":
             response = caltechdata_edit(
                 record_id,
@@ -601,7 +611,7 @@ def edit_record(production):
                 file_links=file_link,
                 production=production,
                 publish=False,
-                keepfile=keepfile,
+                keepfiles=keepfile,
             )
 
         rec_id = response
@@ -620,15 +630,28 @@ def download_file_by_id(record_id, token=None):
 
     try:
         response = requests.get(url, headers=headers)
-
         if response.status_code != 200:
             # Might have a draft
             response = requests.get(
                 url + "/draft",
                 headers=headers,
             )
             if response.status_code != 200:
-                raise Exception(f"Record {record_id} does not exist, cannot edit")
+                url = f"https://data.caltechlibrary.dev/api/records/{record_id}"
+                response = requests.get(
+                    url,
+                    headers=headers,
+                )
+                if response.status_code != 200:
+                    # Might have a draft
+                    response = requests.get(
+                        url + "/draft",
+                        headers=headers,
+                    )
+                    if response.status_code != 200:
+                        raise Exception(
+                            f"Record {record_id} does not exist, cannot edit"
+                        )
         file_content = response.content
         file_name = f"downloaded_data_{record_id}.json"
         with open(file_name, "wb") as file:
diff --git a/caltechdata_api/customize_schema.py b/caltechdata_api/customize_schema.py
@@ -134,8 +134,9 @@ def rdm_creators_contributors(person_list, peopleroles):
 
 def customize_schema_rdm(json_record):
     # Get vocabularies used in InvenioRDM
-    vocabularies = get_vocabularies()
 
+    vocabularies = get_vocabularies()
+    validate_metadata(json_record)
     peopleroles = vocabularies["crr"]
     resourcetypes = vocabularies["rsrct"]
     descriptiontypes = vocabularies["dty"]
@@ -386,6 +387,169 @@ def customize_schema_rdm(json_record):
     return final
 
 
+def validate_metadata(json_record):
+    """
+    Validate the presence and structure of fields in a CaltechDATA JSON record.
+
+    Args:
+        json_record: Metadata dictionary to check; it is not modified.
+
+    Raises:
+        ValueError: If any check fails; the message lists every problem found.
+    """
+    errors = []
+
+    # Check for 'types' and 'resourceTypeGeneral'
+    if "types" not in json_record:
+        errors.append("'types' field is missing.")
+    elif not isinstance(json_record["types"], dict):
+        errors.append("'types' field should be a dictionary.")
+    elif "resourceTypeGeneral" not in json_record["types"]:
+        errors.append("'resourceTypeGeneral' field is missing in 'types'.")
+
+    # Check for 'titles'
+    if "titles" not in json_record:
+        errors.append("'titles' field is missing.")
+    elif not isinstance(json_record["titles"], list) or not json_record["titles"]:
+        errors.append("'titles' should be a non-empty list.")
+    else:
+        # Ensure each title is a dictionary with 'title' field
+        for title in json_record["titles"]:
+            if not isinstance(title, dict) or "title" not in title:
+                errors.append(
+                    "Each entry in 'titles' must be a dictionary with a 'title' key."
+                )
+
+    # Check for a publication date ('publicationYear' or 'dates')
+    if "publicationYear" not in json_record and "dates" not in json_record:
+        errors.append(
+            "A publication date is required ('publicationYear' or 'dates' field is missing)."
+        )
+    if "dates" in json_record:
+        if not isinstance(json_record["dates"], list):
+            errors.append("'dates' should be a list.")
+        else:
+            for date_entry in json_record["dates"]:
+                if (
+                    not isinstance(date_entry, dict)
+                    or "dateType" not in date_entry
+                    or "date" not in date_entry
+                ):
+                    errors.append(
+                        "Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys."
+                    )
+
+    # Check for 'creators'
+    if "creators" not in json_record:
+        errors.append("'creators' field is missing.")
+    elif not isinstance(json_record["creators"], list) or not json_record["creators"]:
+        errors.append("'creators' should be a non-empty list.")
+    else:
+        for creator in json_record["creators"]:
+            if not isinstance(creator, dict) or "name" not in creator:
+                errors.append(
+                    "Each creator in 'creators' must be a dictionary with a 'name' key."
+                )
+
+    # Check for 'contributors'
+    if "contributors" in json_record:
+        if not isinstance(json_record["contributors"], list):
+            errors.append("'contributors' should be a list.")
+        else:
+            for contributor in json_record["contributors"]:
+                if not isinstance(contributor, dict) or "name" not in contributor:
+                    errors.append(
+                        "Each contributor must be a dictionary with a 'name' key."
+                    )
+
+    # Check for 'resourceType'; a missing or non-dict 'types' was reported above
+    if isinstance(json_record.get("types"), dict):
+        if "resourceType" not in json_record["types"]:
+            errors.append("'resourceType' field is missing in 'types'.")
+        elif not isinstance(json_record["types"]["resourceType"], str):
+            errors.append("'resourceType' should be a string.")
+
+    # Check for 'identifiers'
+    if "identifiers" in json_record:
+        if not isinstance(json_record["identifiers"], list):
+            errors.append("'identifiers' should be a list.")
+        else:
+            for identifier in json_record["identifiers"]:
+                if (
+                    not isinstance(identifier, dict)
+                    or "identifier" not in identifier
+                    or "identifierType" not in identifier
+                ):
+                    errors.append(
+                        "Each identifier must be a dictionary with 'identifier' and 'identifierType' keys."
+                    )
+
+    # Check for 'subjects'
+    if "subjects" in json_record:
+        if not isinstance(json_record["subjects"], list):
+            errors.append("'subjects' should be a list.")
+        else:
+            for subject in json_record["subjects"]:
+                if not isinstance(subject, dict) or "subject" not in subject:
+                    errors.append(
+                        "Each subject must be a dictionary with a 'subject' key."
+                    )
+
+    # Check for 'relatedIdentifiers'
+    if "relatedIdentifiers" in json_record:
+        if not isinstance(json_record["relatedIdentifiers"], list):
+            errors.append("'relatedIdentifiers' should be a list.")
+        else:
+            for related in json_record["relatedIdentifiers"]:
+                if not isinstance(related, dict) or "relatedIdentifier" not in related:
+                    errors.append(
+                        "Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key."
+                    )
+
+    # Check for 'rightsList'
+    if "rightsList" in json_record:
+        if not isinstance(json_record["rightsList"], list):
+            errors.append("'rightsList' should be a list.")
+        else:
+            for rights in json_record["rightsList"]:
+                if not isinstance(rights, dict) or "rights" not in rights:
+                    errors.append(
+                        "Each entry in 'rightsList' must be a dictionary with a 'rights' key."
+                    )
+
+    # Check for 'geoLocations'
+    if "geoLocations" in json_record:
+        if not isinstance(json_record["geoLocations"], list):
+            errors.append("'geoLocations' should be a list.")
+        else:
+            for location in json_record["geoLocations"]:
+                if not isinstance(location, dict):
+                    errors.append("Each entry in 'geoLocations' must be a dictionary.")
+                elif (
+                    "geoLocationPoint" not in location
+                    and "geoLocationBox" not in location
+                    and "geoLocationPlace" not in location
+                ):
+                    errors.append(
+                        "Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'."
+                    )
+
+    # Check for 'fundingReferences' (key lookup only applies to dictionary entries)
+    if "fundingReferences" in json_record:
+        if not isinstance(json_record["fundingReferences"], list):
+            errors.append("'fundingReferences' should be a list.")
+        else:
+            for funding in json_record["fundingReferences"]:
+                if not isinstance(funding, dict):
+                    errors.append("Each funding reference must be a dictionary.")
+                elif "funderName" not in funding:
+                    errors.append("Each funding reference must contain 'funderName'.")
+
+    # Report every collected problem in a single exception
+    if errors:
+        raise ValueError(f"Validation errors in metadata: {', '.join(errors)}")
+
+
 if __name__ == "__main__":
     # Read in from file for demo purposes