Reconstructed patch: adds DCAT (JSON-LD) and DCAT-AP (CKAN) harvesters.

NOTE(review): rebuilt from a whitespace-flattened copy in which `<...>` type
arguments had been stripped. Type arguments are restored on added lines only;
removed/context lines (e.g. `Promise;`) and the placement of blank context
lines are reproduced as found or inferred from the hunk counts, and the
`index` hashes of the new files predate the edits below. Regenerate against
the real tree before applying.

diff --git a/src/harvesters/base.ts b/src/harvesters/base.ts
index 8952b67..d49e139 100644
--- a/src/harvesters/base.ts
+++ b/src/harvesters/base.ts
@@ -14,7 +14,8 @@ export type BaseHarvesterConfig = {
 };
 
 export abstract class BaseHarvester<
-  SourceDatasetT extends { [k: string]: string } = any
+  SourceDatasetT extends { [k: string]: string } = any,
+  TargetDatasetT extends PortalJsCloudDataset = PortalJsCloudDataset
 > {
   protected config: BaseHarvesterConfig;
 
@@ -23,14 +24,12 @@ export abstract class BaseHarvester<
   }
 
   abstract getSourceDatasets(): Promise;
-  abstract mapSourceDatasetToTarget(
-    dataset: SourceDatasetT
-  ): PortalJsCloudDataset;
+  abstract mapSourceDatasetToTarget(dataset: SourceDatasetT): TargetDatasetT;
   async getTargetPreexistingDatasets(): Promise {
     return await getDatasetList();
   }
 
-  async upsertIntoTarget({ dataset }: { dataset: PortalJsCloudDataset }) {
+  async upsertIntoTarget({ dataset }: { dataset: TargetDatasetT }) {
     return await upsertDataset({
       dataset,
       dryRun: this.config.dryRun,
diff --git a/src/harvesters/ckan.ts b/src/harvesters/ckan.ts
index 51219e3..1a4a5bb 100644
--- a/src/harvesters/ckan.ts
+++ b/src/harvesters/ckan.ts
@@ -6,7 +6,9 @@ import { Harvester } from ".";
 import { getAllDatasets } from "../lib/ckan";
 
 @Harvester
-class CkanHarvester extends BaseHarvester {
+class CkanHarvester<
+  SourceDatasetT extends CkanPackage = CkanPackage
+> extends BaseHarvester<SourceDatasetT> {
   constructor(args: BaseHarvesterConfig) {
     super(args);
   }
@@ -28,7 +30,7 @@ class CkanHarvester extends Ba
       resources: (pkg.resources || []).map((r: any) => ({
         name: r.name,
         url: r.url,
-        format: r.format
+        format: r.format,
       })),
 
       language: pkg.language || "EN",
diff --git a/src/harvesters/dcat.ts b/src/harvesters/dcat.ts
new file mode 100644
index 0000000..3bc7899
--- /dev/null
+++ b/src/harvesters/dcat.ts
@@ -0,0 +1,190 @@
+import { env } from "../../config";
+import { BaseHarvester, BaseHarvesterConfig } from "./base";
+import { PortalJsCloudDataset } from "@/schemas/portaljs-cloud";
+import { Harvester } from ".";
+import {
+  DCATDataset,
+  DCATDistribution,
+  extractString,
+  extractAgentName,
+  extractStringArray,
+  extractDistributions,
+  normalizeLanguage,
+} from "../lib/dcat";
+
+// Namespace prefixes of the vocabularies read by this harvester.
+const DCT = "http://purl.org/dc/terms/";
+const DCAT = "http://www.w3.org/ns/dcat#";
+const OWL_VERSION_INFO = "http://www.w3.org/2002/07/owl#versionInfo";
+
+/** Dataset predicates copied into `extras` under a short, stable key. */
+const EXTRA_MAPPINGS: ReadonlyArray<{
+  predicate: string;
+  key: string;
+  isArray?: boolean;
+}> = [
+  { predicate: `${DCT}issued`, key: "issued" },
+  { predicate: `${DCT}modified`, key: "modified" },
+  { predicate: OWL_VERSION_INFO, key: "dcat_version" },
+  { predicate: `${DCT}accrualPeriodicity`, key: "frequency" },
+  { predicate: `${DCT}conformsTo`, key: "conforms_to", isArray: true },
+  { predicate: `${DCT}accessRights`, key: "access_rights" },
+  { predicate: `${DCT}provenance`, key: "provenance" },
+  { predicate: `${DCT}type`, key: "dcat_type" },
+  { predicate: `${DCT}spatial`, key: "spatial_uri" },
+  { predicate: `${DCT}publisher`, key: "publisher_uri" },
+];
+
+/**
+ * Keys the catch-all `extras` loop must ignore: JSON-LD keywords, predicates
+ * that already feed a dedicated field or a mapped extra, and the helper
+ * fields that `getSourceDatasets` attaches to every dataset.
+ */
+const SKIPPED_KEYS = new Set<string>([
+  "@id",
+  "@type",
+  "distributions",
+  "resolvedPublisherName",
+  `${DCAT}distribution`,
+  `${DCT}title`,
+  `${DCT}description`,
+  `${DCT}identifier`,
+  `${DCT}issued`,
+  `${DCT}modified`,
+  OWL_VERSION_INFO,
+  `${DCT}language`,
+  `${DCAT}landingPage`,
+  "http://xmlns.com/foaf/0.1/page",
+  `${DCT}accrualPeriodicity`,
+  `${DCT}conformsTo`,
+  `${DCT}accessRights`,
+  `${DCT}provenance`,
+  `${DCT}type`,
+  `${DCT}spatial`,
+  `${DCT}publisher`,
+  `${DCAT}contactPoint`,
+  `${DCT}creator`,
+  `${DCT}license`,
+]);
+
+/**
+ * Returns the list of JSON-LD nodes contained in a fetched document.
+ *
+ * Accepts either a top-level array of nodes or an object wrapping the nodes
+ * in an `@graph` array.
+ *
+ * @throws Error when the payload has neither shape.
+ */
+function toNodeList(payload: unknown): any[] {
+  if (Array.isArray(payload)) return payload;
+  const graph = (payload as { "@graph"?: unknown } | null)?.["@graph"];
+  if (Array.isArray(graph)) return graph;
+  throw new Error(
+    "DCAT JSON-LD payload is neither a node array nor an @graph document"
+  );
+}
+
+/**
+ * Harvests datasets from a DCAT catalogue served as JSON-LD with full-IRI
+ * predicates and maps them to PortalJS Cloud datasets.
+ */
+@Harvester
+class DCATHarvester extends BaseHarvester<DCATDataset> {
+  constructor(args: BaseHarvesterConfig) {
+    super(args);
+  }
+
+  /**
+   * Fetches the document at `config.source.url` and returns every node typed
+   * as `dcat:Dataset`, with its distributions and publisher name attached.
+   *
+   * @throws Error when the HTTP response is not OK or the payload is not a
+   *   list of JSON-LD nodes.
+   */
+  async getSourceDatasets(): Promise<DCATDataset[]> {
+    const res = await fetch(this.config.source.url);
+    if (!res.ok) {
+      throw new Error(
+        `Failed to fetch DCAT JSON-LD: ${res.status} ${res.statusText}`
+      );
+    }
+    const nodes = toNodeList(await res.json());
+
+    return nodes
+      .filter((node) => node?.["@type"]?.includes(`${DCAT}Dataset`))
+      .map((dataset) => ({
+        ...dataset,
+        distributions: extractDistributions(dataset, nodes),
+        resolvedPublisherName: extractAgentName(
+          dataset,
+          `${DCT}publisher`,
+          nodes
+        ),
+      }));
+  }
+
+  /**
+   * Maps one harvested DCAT dataset to the PortalJS Cloud dataset shape
+   * (field choices are based on the ckanext-dcat mappings).
+   */
+  mapSourceDatasetToTarget(pkg: DCATDataset): PortalJsCloudDataset {
+    const owner_org = env.PORTALJS_CLOUD_MAIN_ORG;
+
+    // One resource per distribution; downloadURL wins over accessURL.
+    const resources = (pkg.distributions || []).map(
+      (dist: DCATDistribution) => ({
+        name: extractString(dist, `${DCT}title`) || "Unnamed Resource",
+        url:
+          extractString(dist, `${DCAT}downloadURL`) ||
+          extractString(dist, `${DCAT}accessURL`) ||
+          "",
+        format:
+          extractString(dist, `${DCT}format`) ||
+          extractString(dist, `${DCAT}mediaType`) ||
+          "",
+        description: extractString(dist, `${DCT}description`) || "",
+        license_url: extractString(dist, `${DCT}license`) || "",
+      })
+    );
+
+    const extras: Array<{ key: string; value: string }> = [];
+    for (const { predicate, key, isArray = false } of EXTRA_MAPPINGS) {
+      const value = isArray
+        ? extractStringArray(pkg, predicate).join(", ")
+        : extractString(pkg, predicate);
+      if (value) extras.push({ key, value });
+    }
+
+    // Anything not consumed above is kept as an extra under its original key.
+    for (const key of Object.keys(pkg)) {
+      if (SKIPPED_KEYS.has(key)) continue;
+      const value = extractString(pkg, key) || JSON.stringify(pkg[key]);
+      if (value) extras.push({ key, value });
+    }
+
+    const identifier =
+      extractString(pkg, `${DCT}identifier`) ||
+      String(pkg["@id"] ?? "").split("/").pop() ||
+      "unknown";
+    const license = extractString(pkg, `${DCT}license`);
+
+    return {
+      owner_org,
+      name: `${owner_org}--${identifier}`,
+      title: extractString(pkg, `${DCT}title`) || "",
+      notes: extractString(pkg, `${DCT}description`) || "",
+      url: extractString(pkg, `${DCAT}landingPage`) || "",
+      language: normalizeLanguage(extractString(pkg, `${DCT}language`)),
+      author: extractString(pkg, `${DCT}creator`) || "",
+      maintainer: pkg.resolvedPublisherName || "",
+      license_id: license,
+      // Fall back to the first distribution's licence when the dataset has none.
+      license_url: license || resources[0]?.license_url || "",
+      contact_point: extractString(pkg, `${DCAT}contactPoint`) || "",
+      resources,
+      extras,
+    };
+  }
+}
+
+export { DCATHarvester };
diff --git a/src/harvesters/dcatap.ts b/src/harvesters/dcatap.ts
new file mode 100644
index 0000000..644ab93
--- /dev/null
+++ b/src/harvesters/dcatap.ts
@@ -0,0 +1,169 @@
+import { env } from "../../config";
+import { CkanHarvester } from "./ckan";
+import { Harvester } from ".";
+import { BaseHarvesterConfig } from "./base";
+import { CkanPackage } from "@/schemas/ckanPackage";
+import { PortalJsCloudDataset, CkanResource } from "@/schemas/portaljs-cloud";
+import { normalizeLanguage } from "../lib/dcat";
+
+/** CKAN resource extended with the extra fields this harvester reads. */
+export interface DCATAPResource extends CkanResource {
+  hash?: string;
+  mimetype?: string | null;
+  mimetype_inner?: string | null;
+  cache_url?: string | null;
+  cache_last_updated?: string | null;
+  datastore_active?: boolean;
+  created?: string;
+  last_modified?: string;
+  state?: string;
+  position?: number;
+  id?: string;
+  revision_id?: string;
+  url_type?: string;
+  resource_type?: string | null;
+  size?: number | string | null;
+  package_id?: string;
+}
+
+/** CKAN package extended with the extra fields this harvester reads. */
+export interface DCATAPCkanPackage extends CkanPackage {
+  license_title?: string;
+  license_id?: string;
+  license_url?: string;
+  maintainer?: string;
+  maintainer_email?: string;
+  author?: string;
+  author_email?: string;
+  metadata_created?: string;
+  metadata_modified?: string;
+  tags?: Array<{
+    name: string;
+    display_name?: string;
+    id?: string;
+    state?: string;
+  }>;
+  groups?: Array<{
+    name: string;
+    title?: string;
+    display_name?: string;
+    description?: string;
+    id?: string;
+  }>;
+  organization?: {
+    title?: string;
+    name?: string;
+    description?: string;
+    id?: string;
+  };
+  isopen?: boolean;
+  version?: string;
+  url?: string;
+  state?: string;
+  type?: string;
+  extras?: Array<{
+    key: string;
+    value: string;
+  }>;
+  // Narrows the inherited resource list to the extended resource type.
+  resources?: DCATAPResource[];
+}
+
+/** Target dataset extended with the extra fields this harvester emits. */
+export interface DCATAPPortalJsDataset extends PortalJsCloudDataset {
+  license_title?: string;
+  license_url?: string;
+  metadata_created?: string;
+  metadata_modified?: string;
+  state?: string;
+  private?: boolean;
+  isopen?: boolean;
+  type?: string;
+  extras?: Array<{
+    key: string;
+    value: string;
+  }>;
+  // Overrides the inherited resource list with the extended resource type.
+  resources?: DCATAPResource[];
+}
+
+/**
+ * CKAN harvester variant that carries the additional package and resource
+ * fields declared above through to the target dataset.
+ */
+@Harvester
+export class DCATAPHarvester extends CkanHarvester<DCATAPCkanPackage> {
+  constructor(args: BaseHarvesterConfig) {
+    super(args);
+  }
+
+  /** Maps one CKAN package to the extended PortalJS Cloud dataset shape. */
+  mapSourceDatasetToTarget(pkg: DCATAPCkanPackage): DCATAPPortalJsDataset {
+    const owner_org = env.PORTALJS_CLOUD_MAIN_ORG;
+
+    const resources = (pkg.resources || []).map((r) => ({
+      name: r.name || "",
+      url: r.url || "",
+      format: r.format || "",
+      description: r.description || "",
+      hash: r.hash || "",
+      mimetype: r.mimetype || "",
+      mimetype_inner: r.mimetype_inner || "",
+      // A size of 0 is kept; only a missing or empty size is omitted.
+      size: r.size != null && r.size !== "" ? String(r.size) : undefined,
+      created: r.created || "",
+      last_modified: r.last_modified || "",
+      id: r.id || "",
+      state: r.state || "active",
+      position: r.position !== undefined ? r.position : 0,
+    }));
+
+    // TODO: confirm how PortalJS handles tags and groups before harvesting
+    // `pkg.tags` / `pkg.groups`.
+
+    // Index the CKAN extras by key so single values can be looked up below.
+    const extras: Record<string, string> = {};
+    pkg.extras?.forEach((extra) => {
+      extras[extra.key] = extra.value;
+    });
+
+    return {
+      // Core metadata
+      owner_org,
+      name: `${owner_org}--${pkg.name}`,
+      title: pkg.title || "",
+      notes: pkg.notes || "",
+      url: pkg.url || "",
+      version: pkg.version || "",
+      type: pkg.type || "dataset",
+
+      // Temporal metadata
+      metadata_created: pkg.metadata_created || "",
+      metadata_modified: pkg.metadata_modified || "",
+
+      // Licensing and access
+      license_id: pkg.license_id || "",
+      license_title: pkg.license_title || "",
+      license_url: pkg.license_url || "",
+      private: pkg.private || false,
+      isopen: pkg.isopen || false,
+
+      // Attribution
+      author: pkg.author || "",
+      author_email: pkg.author_email || "",
+      maintainer: pkg.maintainer || "",
+      maintainer_email: pkg.maintainer_email || "",
+
+      resources,
+
+      // The package field wins over a `language` extra; values outside the
+      // target schema fall back to "EN".
+      language: normalizeLanguage(pkg.language || extras.language),
+
+      // TODO: not mapped yet (candidate sources are the extras or package
+      // fields): frequency, temporal_start, temporal_end, publisher_name,
+      // publisher_email, contact_name, contact_email, theme, conforms_to,
+      // extras.
+    };
+  }
+}
diff --git a/src/lib/dcat.ts b/src/lib/dcat.ts
new file mode 100644
index 0000000..acd28a7
--- /dev/null
+++ b/src/lib/dcat.ts
@@ -0,0 +1,95 @@
+// Types and helpers for DCAT JSON-LD in expanded form, where every predicate
+// maps to an array of nodes such as [{ "@value": "title" }] or [{ "@id": "..." }].
+
+export interface DCATDistribution {
+  "@id": string;
+  "@type": string[];
+  [key: string]: any;
+}
+
+export interface DCATDataset {
+  "@id": string;
+  "@type": string[];
+  [key: string]: any;
+}
+
+/** Language codes accepted by the target dataset schema. */
+export const SUPPORTED_LANGUAGES = ["EN", "FR", "ES", "DE", "IT"] as const;
+export type SupportedLanguage = (typeof SUPPORTED_LANGUAGES)[number];
+
+/**
+ * Normalises a raw language value to a supported code.
+ *
+ * Matching ignores case and surrounding whitespace; anything else, including
+ * non-string input, yields "EN".
+ */
+export function normalizeLanguage(raw: unknown): SupportedLanguage {
+  const code = typeof raw === "string" ? raw.trim().toUpperCase() : "";
+  return SUPPORTED_LANGUAGES.find((lang) => lang === code) ?? "EN";
+}
+
+/**
+ * Converts one JSON-LD value node to a string.
+ *
+ * Object nodes yield their `@value`, else their `@id`; primitive nodes are
+ * stringified as they are; null/undefined and empty nodes yield "".
+ */
+function nodeToString(node: unknown): string {
+  if (node == null) return "";
+  if (typeof node !== "object") return String(node);
+  const { "@value": value, "@id": id } = node as Record<string, unknown>;
+  const picked = value ?? id;
+  return picked == null ? "" : String(picked);
+}
+
+/** Returns the first value of `predicate` on `obj` as a string, or "". */
+export function extractString(obj: any, predicate: string): string {
+  const values = obj?.[predicate];
+  return Array.isArray(values) && values.length > 0
+    ? nodeToString(values[0])
+    : "";
+}
+
+/**
+ * Resolves the agent referenced by `predicate` on `obj` and returns its
+ * `foaf:name`, falling back to its `dct:title`, or "" when it cannot be
+ * resolved among `allObjects`.
+ */
+export function extractAgentName(
+  obj: any,
+  predicate: string,
+  allObjects: any[]
+): string {
+  const agentId = extractString(obj, predicate);
+  if (!agentId) return "";
+  const agent = allObjects.find((o) => o?.["@id"] === agentId);
+  if (!agent) return "";
+  return (
+    extractString(agent, "http://xmlns.com/foaf/0.1/name") ||
+    extractString(agent, "http://purl.org/dc/terms/title")
+  );
+}
+
+/** Returns every non-empty value of `predicate` on `obj` as a string. */
+export function extractStringArray(obj: any, predicate: string): string[] {
+  const values = obj?.[predicate];
+  return Array.isArray(values) ? values.map(nodeToString).filter(Boolean) : [];
+}
+
+/**
+ * Returns the nodes of `allObjects` that are referenced by the dataset's
+ * `dcat:distribution` predicate and typed as `dcat:Distribution`.
+ */
+export function extractDistributions(
+  dataset: DCATDataset,
+  allObjects: any[]
+): DCATDistribution[] {
+  const distIds = new Set(
+    extractStringArray(dataset, "http://www.w3.org/ns/dcat#distribution")
+  );
+  return allObjects.filter(
+    (obj) =>
+      distIds.has(obj?.["@id"]) &&
+      obj["@type"]?.includes("http://www.w3.org/ns/dcat#Distribution")
+  ) as DCATDistribution[];
+}
diff --git a/src/schemas/portaljs-cloud.d.ts b/src/schemas/portaljs-cloud.d.ts
index 588f9a9..81fae66 100644
--- a/src/schemas/portaljs-cloud.d.ts
+++ b/src/schemas/portaljs-cloud.d.ts
@@ -10,6 +10,7 @@ export interface PortalJsCloudDataset {
   author?: string;
   author_email?: string;
   maintainer?: string;
+  url?: string;
   maintainer_email?: string;
   language: "EN" | "FR" | "ES" | "DE" | "IT";
   coverage?: string;
@@ -19,6 +20,8 @@ export interface PortalJsCloudDataset {
   is_version_of?: string;
   contact_point?: string;
   resources?: CkanResource[];
+  license_url?: string;
+  extras?: Array<{ key: string; value: string }>;
 }
 
 export interface CkanResource {