From 2a98bcff87d8516ddfc1b23e7bb0f7c3e25ffa34 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Sun, 8 Jun 2025 20:09:17 -0400 Subject: [PATCH 1/4] feat: Metadata schema architecture to scope --- .../metadata-schema/variable_details.yaml | 204 ++++++++++++ scope-docs/metadata-schema/variables.yaml | 296 ++++++++++++++++++ scope-docs/metadata-schemas.qmd | 104 ++++++ scope-docs/metadata.qmd | 23 ++ 4 files changed, 627 insertions(+) create mode 100644 scope-docs/metadata-schema/variable_details.yaml create mode 100644 scope-docs/metadata-schema/variables.yaml create mode 100644 scope-docs/metadata-schemas.qmd diff --git a/scope-docs/metadata-schema/variable_details.yaml b/scope-docs/metadata-schema/variable_details.yaml new file mode 100644 index 00000000..0580c6ae --- /dev/null +++ b/scope-docs/metadata-schema/variable_details.yaml @@ -0,0 +1,204 @@ +schema_version: "0.1" +schema_date: "2025-01-01" +description: "Core variable details schema including active templateVariable development" + +variable_details_schema: + title: "CCHSFlow Variable Details Configuration" + description: "Schema for variable_details.csv, defining value-level attributes, recoding logic, and categorical value labels for variables." + version: "0.1" + id_column_name: "file_row_id" + expected_column_order: + - "file_row_id" + - "variable" + - "templateVariable" # Active field - currently in development + - "dummyVariable" + - "typeEnd" + - "typeStart" + - "databaseStart" # camelCase convention + - "variableStart" # camelCase convention + - "variableStartLabel" # camelCase convention + - "numValidCat" + - "recEnd" + - "catLabel" + - "catLabelLong" + - "units" + - "recStart" + - "variableStartShortLabel" # camelCase convention + - "notes" + + fields: + - name: "file_row_id" + title: "File Row Identifier" + description: "Unique identifier for the row within this CSV file. 
Generated using format: detail_{variable_name}_{sequence}" + type: "string" + constraints: + required: true + unique: true + pattern: "^detail_[a-zA-Z0-9_.]+_[0-9]{3}$" # Semantic ID pattern with sequence + - name: "variable" + title: "Variable Name" + description: "Canonical name of the variable this detail row pertains to. Foreign key to variables.csv." + type: "string" + constraints: + required: true + - name: "templateVariable" + title: "Template Variable Indicator" + description: "Indicates if this variable follows a template pattern or references another template variable" + type: "string" + constraints: + required: false + # Permissive validation - allowing both "Yes"/"No" and template variable names + - name: "dummyVariable" + title: "Dummy Variable Indicator" + description: "Indicates if this row defines a dummy variable created during recoding (eg, for a category of a categorical variable)." + type: "string" + constraints: + required: false + - name: "typeEnd" + title: "Target Data Type" + description: "The data type of the variable *after* recoding or as its final representation (eg, categorical, numeric)." + type: "string" + constraints: + required: false + - name: "typeStart" + title: "Source Data Type" + description: "The data type of the variable *before* recoding or in its original form." + type: "string" + constraints: + required: false + - name: "databaseStart" + title: "Original Database Name" + description: "Name of the original database or data source for this variable detail." + type: "string" + constraints: + required: false + - name: "variableStart" + title: "Original Variable Name or Source Value" + description: "Name of the original variable or specific source value being recoded." + type: "string" + constraints: + required: false + - name: "variableStartLabel" + title: "Original Variable Label" + description: "Label of the original variable in the source database." 
+ type: "string" + constraints: + required: false + - name: "numValidCat" + title: "Number of Valid Categories" + description: "For categorical variables, the number of distinct valid categories." + type: "integer" + constraints: + required: false + minimum: 0 + - name: "recEnd" + title: "Recoded Value (Target)" + description: "The target value after recoding. For categorical variables, this is the value being labelled." + type: "string" + constraints: + required: false + - name: "catLabel" + title: "Category Label (Short)" + description: "Short label for a specific category of a categorical variable." + type: "string" + constraints: + required: false + - name: "catLabelLong" + title: "Category Label (Long)" + description: "Long, descriptive label for a specific category of a categorical variable." + type: "string" + constraints: + required: false + - name: "units" + title: "Units" + description: "Units of measurement, if applicable to this specific variable detail or category." + type: "string" + constraints: + required: false + - name: "recStart" + title: "Recode From Value (Source)" + description: "The original value or range that is being recoded to 'recEnd'." + type: "string" + constraints: + required: false + - name: "variableStartShortLabel" + title: "Original Variable Short Label" + description: "Short label of the original variable in the source database." + type: "string" + constraints: + required: false + - name: "notes" + title: "Notes" + description: "Specific notes or comments related to this variable detail or recoding rule." 
+ type: "string" + constraints: + required: false + + missingValues: ["", "NA", "N/A"] + allow_additional_columns: true # Permissive during recodeflow development + extension_schema: null + +--- + +# Template System Schema (inst/metadata/schemas/core/templates.yaml) +schema_version: "0.1" +schema_date: "2025-01-01" +description: "Template variable system documentation and validation rules" + +template_system_schema: + title: "Template Variable System" + description: "Schema for template variable inheritance and validation in recodeflow development" + version: "0.1" + + # How to identify template definitions + template_definitions: + marker_field: "templateVariable" + marker_values: ["Yes"] + required_fields: + - "variable" # Template must have a name (serves as template identifier) + - "typeEnd" # Template must define output type + - "recStart" # Template must define source values + - "recEnd" # Template must define target values + + # How template inheritance works + template_inheritance: + reference_field: "templateVariable" + reference_pattern: "template_name" # References existing template by variable name + required_fields: + - "variable" # Using variable must have unique name + - "variableStart" # Using variable must define source mapping + inheritance_rules: + - "Template recoding rules (recStart/recEnd) are inherited" + - "Using variable defines its own source mapping (variableStart)" + - "Type information (typeEnd/typeStart) can be inherited or overridden" + + # Validation rules for template system + validation_rules: + template_existence: + description: "Referenced templates must exist in the same variable_details file" + rule: "If templateVariable != 'Yes' and templateVariable != 'No', then variable with that name and templateVariable = 'Yes' must exist" + + circular_references: + description: "Templates cannot reference other templates" + rule: "If templateVariable = 'Yes', then variable cannot reference another template" + + consistent_typing: + 
description: "Template usage should maintain type consistency" + rule: "Variables using templates should have compatible typeEnd values" + + # Examples for documentation + examples: + simple_template: + description: "Basic language template example" + template_definition: + variable: "lang" + templateVariable: "Yes" + typeEnd: "cat" + recStart: ["english", "french"] + recEnd: ["1", "2"] + + template_usage: + variable: "primary_lang" + templateVariable: "lang" + variableStart: "[PL]" + # Inherits: typeEnd="cat", recStart/recEnd mappings diff --git a/scope-docs/metadata-schema/variables.yaml b/scope-docs/metadata-schema/variables.yaml new file mode 100644 index 00000000..b9ab0cc6 --- /dev/null +++ b/scope-docs/metadata-schema/variables.yaml @@ -0,0 +1,296 @@ +schema_version: "0.1" +schema_date: "2025-01-01" +description: "Core variables schema describing the format and specifications of variables (variables.csv) within the recodeflow package." + +# CSV file format specification +csv_format: + encoding: "UTF-8" # Character encoding + bom: false # No UTF-8 BOM (Byte Order Mark) + delimiter: "," # Field delimiter + quote_char: '"' # Character used to quote fields + escape_char: '"' # Escape character (doubled quotes) + line_terminator: "\n" # Unix-style line endings (LF) + header_required: true # First row must contain column names + header_case_sensitive: true # Column names must match exactly + quote_when_needed: true # Quote fields containing delimiter, quotes, or newlines + trailing_delimiter: false # No comma after last field + blank_lines: "skip" # Skip blank lines + comment_char: null # No comment lines allowed + +variables_schema: + title: "cchsflow variables configuration" + description: "Schema for variables.csv, defining master variable attributes, types, and labels. Used for data processing and metadata generation within the cchsflow project." 
+ version: "0.1" + + id_column_name: "fileRowId" + expected_column_order: + - "fileRowId" + - "variable" + - "label" + - "labelLong" + - "subject" + - "section" + - "variableType" + - "databaseStart" + - "units" + - "variableStart" + - "notes" + - "description" + - "version" + - "lastUpdated" + - "harmonizationStatus" + - "reviewNotes" + + fields: + - name: "fileRowId" + title: "File row identifier" + description: "Unique identifier for the row within this CSV file. Generated using format: var_{variable_name}" + type: "string" + constraints: + required: true + unique: true + pattern: "^var_[a-zA-Z0-9_.]+$" # Semantic ID pattern + - name: "variable" + title: "Variable name" + description: "Canonical and unique name of the variable being described. Used as a key." + type: "string" + constraints: + required: true + unique: true + pattern: "^[a-zA-Z_][a-zA-Z0-9_.]*$" + - name: "label" + title: "Short label" + description: "A concise, human-readable label for the variable (eg, for chart axes, short displays)." + type: "string" + constraints: + required: true + - name: "labelLong" + title: "Long label" + description: "A longer, more descriptive human-readable label for the variable (eg, for full descriptions, codebook)." + type: "string" + constraints: + required: true + - name: "subject" + title: "Subject area" + description: "The general subject or thematic area the variable belongs to (eg, demographics, health behaviours)." + type: "string" + constraints: + required: false + - name: "section" + title: "Survey section" + description: "Specific section or module of the survey/dataset where the variable originates." + type: "string" + constraints: + required: false + - name: "variableType" + title: "Variable type" + description: "The nature or type of the variable indicating how it should be analyzed and processed." 
+ type: "string" + constraints: + required: true + enum: + - "Continuous" + - "Categorical" + - name: "databaseStart" + title: "Original database name" + description: "Name of the original database or data source from which this variable was derived or sourced." + type: "string" + constraints: + required: true + - name: "units" + title: "Measurement units" + description: "Units of measurement for the variable, if applicable (eg, kg, years, minutes/day)." + type: "string" + constraints: + required: false + - name: "variableStart" + title: "Source variables and transformation rules" + description: | + Specifies the source variable(s) used to create this variable. + This field supports multiple patterns for mapping source data to harmonized variables, + including simple references, database mappings, and derived variable calculations. + See patterns section below for formal specifications. + type: "string" + constraints: + required: true + patterns: + - name: "simple_reference" + pattern: "^\\[[A-Z][A-Z0-9_]*\\]$" + description: "Default pattern - references variable from any unspecified source" + example: "[ADL_005]" + - name: "database_mapping" + pattern: "^[a-zA-Z0-9_]+::[A-Z][A-Z0-9_]*$" + description: "Explicit database::variable mapping" + example: "cchs2001_p::RACA_6A" + - name: "derived_variable" + pattern: "^DerivedVar::\\[([A-Z][A-Z0-9_]*(,\\s*[A-Z][A-Z0-9_]*)*)\\]$" + description: "Derived variable calculation from multiple sources" + example: "DerivedVar::[HEIGHT, WEIGHT]" + - name: "multiple_sources" + pattern: "^([a-zA-Z0-9_]+::[A-Z][A-Z0-9_]*(,\\s*)?)+$" + description: "Multiple database::variable mappings" + example: "cchs2001_p::RACA_6A, cchs2003_p::RACC_6A" + - name: "combined_pattern" + pattern: "^([a-zA-Z0-9_]+::[A-Z][A-Z0-9_]*(,\\s*)?)+(,\\s*)?\\[[A-Z][A-Z0-9_]*\\]$" + description: "Multiple sources with default reference" + example: "cchs2001_p::RACA_6A, cchs2003_p::RACC_6A, [ADL_01]" + - name: "notes" + title: "Notes" + description: "General 
notes, comments, or annotations about the variable." + type: "string" + constraints: + required: false + - name: "description" + title: "Detailed description" + description: "A more detailed textual description or definition of the variable, potentially including operationalisation." + type: "string" + constraints: + required: false + - name: "version" + title: "Variable version" + description: "Semantic version of this variable definition (e.g., 1.0.0)" + type: "string" + constraints: + required: true + pattern: "^[0-9]+\\.[0-9]+\\.[0-9]+$" + - name: "lastUpdated" + title: "Last updated date" + description: "Date when this variable definition was last modified (ISO format: YYYY-MM-DD)" + type: "string" + format: "date" + constraints: + required: true + - name: "harmonizationStatus" + title: "Harmonization status" + description: "Current status of the variable in the harmonization process" + type: "string" + constraints: + required: true + enum: + - "development" + - "active" + - "not_harmonizable" + - "pending_review" + - name: "reviewNotes" + title: "Review notes" + description: "Brief notes about harmonization decisions, links to GitHub issues/discussions, or references to external documentation" + type: "string" + constraints: + required: false + + missingValues: ["", "NA", "N/A"] + + # Validation rules for cross-field dependencies + validation_rules: + cross_field: + - rule: "units_required_for_continuous" + condition: "variableType == 'Continuous'" + requirement: "units != null" + level: "warning" + message: "Continuous variable '{variable}' should have units specified" + - rule: "review_notes_for_not_harmonizable" + condition: "harmonizationStatus == 'not_harmonizable'" + requirement: "reviewNotes != null" + level: "warning" + message: "Variable '{variable}' marked as not_harmonizable should have reviewNotes explaining why" + - rule: "subject_section_mutual_exclusive" + condition: "subject != null AND section != null" + requirement: "section == 'N/A'" + 
level: "info" + message: "Variable '{variable}' has both subject and section; typically these are mutually exclusive" + + pattern_validation: + # Note: variableStart can contain mixed patterns (e.g., database::var + [var]) + # Pattern precedence to be determined in future versions + mixed_patterns_allowed: true + validation_message: "variableStart for '{variable}' does not match any known pattern: {value}" + + # Permissive for v0.1 - coordinate with recodeflow development + allow_additional_columns: true # Allow development flexibility + extension_schema: null + + # ============================================================================ +# PROJECT-SPECIFIC METADATA: CCHS (Canadian Community Health Survey) +# ============================================================================ +cchs_metadata: + title: "CCHS-specific configuration" + description: "Metadata specific to the Canadian Community Health Survey harmonization project" + version: "0.1" + + # Source database naming conventions + source_databases: + pattern: "cchs{YEAR}_{TYPE}" + description: "CCHS database naming convention" + year_formats: + - "YYYY" # e.g., cchs2001 + - "YYYY_YYYY" # e.g., cchs2007_2008 + type_suffixes: + - code: "p" + description: "Public Use Microdata File (PUMF)" + - code: "i" + description: "ICES data (temporary reference to institute)" + - code: "s" + description: "Shared file (contains variables)" + - code: "m" + description: "Master file" + examples: + - "cchs2001_p" + - "cchs2017_2018_i" + - "cchs2015_2016_s" + - "cchs2015_2016_m" + + # Field-specific enumerations for CCHS + field_enums: + subject: + description: "CCHS subject areas" + values: + - "ADL" + - "Age" + - "Alcohol" + - "BMI" + - "Chronic condition" + - "Diet" + - "Education" + - "Ethnicity" + - "Exercise" + - "Food security" + - "Health behaviour" + - "Health care use" + - "Health status" + - "Height" + - "Home ownership" + - "Household type" + - "Immigration" + - "Income" + - "Indigenous" + - "Life 
satisfaction" + - "Marital status" + - "Need" + - "Number" + - "Oral health" + - "Province" + - "Proxy" + - "Race" + - "Rec" + - "Sample" + - "Sex" + - "Sleep" + - "Smoking" + - "Vaccination" + - "Weight" + + section: + description: "CCHS survey sections" + values: + - "Demographics" + - "Health behaviour" + - "Health care use" + - "Health status" + - "N/A" + - "Sociodemographics" + + # Additional CCHS-specific metadata + notes: + - "Variables with '_A' suffix - To be confirmed (may be newer CCHS variable naming convention)" + - "DerivedVar patterns often combine multiple survey waves or create composite scores" + - "Some variables may not have entries from all survey years due to questionnaire changes" \ No newline at end of file diff --git a/scope-docs/metadata-schemas.qmd b/scope-docs/metadata-schemas.qmd new file mode 100644 index 00000000..5fc671cd --- /dev/null +++ b/scope-docs/metadata-schemas.qmd @@ -0,0 +1,104 @@ +--- +title: Metadata +format: + html: + embed-resources: true +--- + +# YAML Metadata Schema Specifications + +## Purpose & Rationale + +The YAML metadata schemas define the structure, constraints, and documentation for harmonized research data within the recodeflow ecosystem. These schemas serve as the single source of truth for data structure, enabling: + +- **Version control optimization**: Semantic row IDs and consistent column ordering minimize git diff noise +- **Data quality assurance**: Formal validation rules catch errors before data processing +- **Interoperability**: LinkML-ready structure supports future integration with semantic web standards +- **Documentation generation**: Schemas serve as both machine-readable specifications and human documentation + +Draft versions of the YAML schemas are in the `metadata-schema` folder: + +- [variables.yaml](/scope-docs/metadata-schema/variables.yaml) +- [variable_details.yaml](/scope-docs/metadata-schema/variable_details.yaml) + +## Core Schema Architecture + +### 1. 
**variables.yaml** (Variable Catalog) +Defines the master registry of harmonized variables with: +- Semantic row identifiers: `var_{variable_name}` +- Variable metadata: labels, types, units, harmonization status +- Transformation patterns: formal regex specifications for source mappings +- Version tracking: semantic versioning at row level + +### 2. **variable_details.yaml** (Value Mappings) +Specifies value-level transformations and recoding rules with: +- Hierarchical row identifiers: `detail_{variable_name}_{sequence}` +- Recoding specifications: interval notation `[a,b]`, NA categories `NA::a` +- Template system: reusable transformation patterns +- Conditional validation: context-aware rules (e.g., dummy variables for categoricals) + +## Key Design Principles + +### Schema-Driven Development +- **Declarative specifications**: Define what, not how +- **Progressive validation**: From permissive (development) to strict (production) +- **Extensibility**: Project-specific metadata sections (e.g., `cchs_metadata`) + +### Git-Native Architecture +- **Clean filenames**: No version numbers in files +- **Semantic versioning**: Git tags (v0.1, v1.0, v2.1.0) +- **Stable identifiers**: Prevent row reordering in diffs + +### Standards Alignment +- **Frictionless Data**: Table Schema-inspired field definitions +- **LinkML-ready**: Structured for future semantic web integration +- **DDI/Dublin Core**: Metadata elements align with established standards + +## Schema Components + +```yaml +# Core structure for both schemas +csv_format: # Explicit CSV specifications +variables_schema: # Main schema definition + fields: # Column definitions with types & constraints + validation_rules: # Cross-field dependencies + patterns: # Formal specifications (regex) +cchs_metadata: # Project-specific extensions +``` + +## Primary Use Cases + +### 1. 
**Data Validation** +- Pre-commit hooks validate CSV structure +- Type checking and constraint enforcement +- Pattern matching for complex fields + +### 2. **Documentation Generation** +- Automated codebooks from schema definitions +- API documentation for data access +- Variable catalogs for researchers + +### 3. **Transformation Specifications** +- Formal recoding rules for reproducibility +- Template-based transformations +- Traceable data lineage + +### 4. **Tool Integration** +- R package functions consume schemas directly +- Validation tools parse YAML specifications +- Future LinkML converters for RDF generation + +## Implementation Status + +- **Current**: v2.1.0 schemas in production for cchsflow +- **Active**: Template system development for reusable patterns +- **Planned**: LinkML conversion tools, multi-project inheritance + +## Benefits + +- **Reproducibility**: Versioned schemas ensure consistent data interpretation +- **Collaboration**: Clear specifications reduce miscommunication +- **Quality**: Automated validation catches errors early +- **Efficiency**: Reusable patterns reduce duplication +- **Governance**: Formal metadata supports compliance requirements + diff --git a/scope-docs/metadata.qmd b/scope-docs/metadata.qmd index 6eb587e0..e6a918d1 100644 --- a/scope-docs/metadata.qmd +++ b/scope-docs/metadata.qmd @@ -36,6 +36,8 @@ all variable metadata and should continue to do so. As much as possible, any new metadata that the library wishes to support should be added to the sheets rather than for example being hardcoded in the library. +The sheets themselves are formally specified by YAML schemas (see "Metadata Schema Architecture" below). + ## Dataset metadata Dataset metadata sits on top of variable metadata and is used to annotate an @@ -62,4 +64,25 @@ document. The library should enable users to easily create a data dictionary including information from the variable and dataset metadata mentioned above. 
+### Metadata Schema Architecture + +To support the metadata requirements outlined above, recodeflow implements +a YAML-based schema architecture that: + +- **Defines structure**: Formal specifications for variable and variable_details + CSV files including column names, types, constraints, and validation rules +- **Enables validation**: Schema-driven validation ensures data quality before + processing +- **Supports extensibility**: Project-specific metadata sections allow custom + fields while maintaining core compatibility +- **Facilitates interoperability**: LinkML-ready structure enables future + integration with semantic web standards and metadata exchange +- **Provides documentation**: Schemas serve as both machine specifications + and human documentation + +This architecture ensures that metadata remains the "source of truth" as +specified, while providing the structure needed for reliable software +implementation. +For detailed specifications of the YAML metadata schemas, see +[Metadata Schema Specifications](metadata-schemas.qmd). From 60fdec4e705813cb1849f141870eade67db56f1e Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Sun, 22 Jun 2025 14:12:35 -0400 Subject: [PATCH 2/4] feat: Implement three-file metadata architecture with DRY principles Introduces metadata_registry.yaml as central coordination layer, streamlined variables.yaml and variable_details.yaml schemas, and ecosystem_roadmap.md for future planning. Eliminates duplication while adding production-tested validation patterns from real-world cchsflow usage. 
--- scope-docs/ecosystem_roadmap.md | 211 +++++++++ .../metadata-schema/metadata_registry.yaml | 215 +++++++++ .../metadata-schema/variable_details.yaml | 419 +++++++++++------- scope-docs/metadata-schema/variables.yaml | 404 +++++++---------- 4 files changed, 858 insertions(+), 391 deletions(-) create mode 100644 scope-docs/ecosystem_roadmap.md create mode 100644 scope-docs/metadata-schema/metadata_registry.yaml diff --git a/scope-docs/ecosystem_roadmap.md b/scope-docs/ecosystem_roadmap.md new file mode 100644 index 00000000..1fbcf839 --- /dev/null +++ b/scope-docs/ecosystem_roadmap.md @@ -0,0 +1,211 @@ +# Recodeflow Metadata Ecosystem Roadmap + +## Purpose +This document captures future planning and ecosystem development ideas for the recodeflow metadata system. It serves as a lessons-learned repository and planning document for metadata architecture evolution. + +## Context +This roadmap emerged from work on PR #70 and extensive experience with Claude Code automation reducing variable harmonization work by ~90%. The current three-file architecture (metadata_registry.yaml, variables.yaml, variable_details.yaml) addresses immediate needs while positioning for future growth. 
+ +## Current State (2025-Q2) + +### Implemented Components +- **Core schemas**: variables.yaml and variable_details.yaml for CSV validation +- **Central registry**: Shared specifications to eliminate duplication (DRY principle) +- **Extension system**: Template variables for workflow efficiency +- **AI-friendly documentation**: Clear structure that prevents Claude Code stumbles + +### Proven Patterns +- **Task-specific file usage**: Registry for validation, schemas for layout assessment +- **Progressive documentation**: Over-documentation prevents AI errors, reduces human effort +- **Cross-file references**: Clear separation with explicit coordination + +## Future Development Areas + +### Database Metadata System +**Status**: Conceptual (discussed in PR #65, #43) + +**Vision**: Dublin Core compliant dataset-level metadata +- **Purpose**: Dataset documentation, cataloging, and provenance tracking +- **Standard**: Dublin Core with DCAT extensions +- **Implementation**: YAML sidecar files alongside data +- **Integration**: Automated data dictionary generation, rec_with_table() metadata + +**Fields identified**: +- title, description, creator, publisher, subject +- date_created, date_modified, version, license, contact_point + +**Implementation approach**: +- Functions: set_catalog(), get_catalog(), print.catalog(), summary.catalog() +- CSV import/export support for non-R users +- Flexible, modular metadata management + +### Extension Framework Evolution + +**Current**: Single extension (templateVariable) with ad-hoc documentation +**Future**: Systematic extension categories and management + +#### Extension Categories (Proposed) +1. **Workflow Efficiency** + - Template variables (implemented) + - Batch transformation patterns + - Variable inheritance systems + +2. **Data Integration** + - Complex source integration patterns + - Multi-database harmonization rules + - External system connectors + +3. 
**Validation Enhancement** + - Advanced data quality rules + - Cross-field validation logic + - Business rule enforcement + +4. **Project Management** + - Collaboration workflow support + - Version control integration + - Review and approval systems + +#### Extension Placement Framework +**Decision criteria for new extensions**: +- **variables.csv**: Extensions affecting variable-level metadata, definitions, cross-variable relationships +- **variable_details.csv**: Extensions affecting transformation rules, value mappings, row-level processing +- **Both files**: Extensions requiring coordination between variable definitions and transformations + +### Project-Specific Extensions + +**Current examples**: +- **cchsflow**: CCHS database patterns, health survey subjects, survey section categories +- **raiflow**: RAI assessment patterns, care classifications + +**Future pattern**: +- Core schema compatibility maintained across all projects +- Domain-specific enumerations and validation rules +- Shared template patterns across projects +- Extension inheritance model (never remove core rules, only add) + +### Validation System Evolution + +#### Current Implementation +- **Basic mode**: Minimal validation for Excel-first researchers +- **Full mode**: Comprehensive validation for automated processing +- **Progressive validation**: Support different technical comfort levels + +#### Future Enhancements +- **Cross-file validation**: Consistency checking between variables and variable_details +- **Template system validation**: Inheritance and reference validation +- **Real-time validation**: Integration with editing tools +- **Semantic validation**: Business logic beyond field constraints + +### AI Collaboration Optimization + +#### Lessons Learned +- **Clear documentation prevents AI errors**: Structure matters more than brevity +- **Examples accelerate understanding**: Concrete patterns > abstract descriptions +- **Reference relationships**: Cross-file coordination reduces 
confusion +- **Progressive complexity**: Start simple, add complexity as needed + +#### Future AI Integration +- **Schema-driven code generation**: Automated CSV reading/writing functions +- **Validation tool generation**: Auto-generate validation functions from schemas +- **Documentation automation**: Generate user guides from schema definitions +- **Pattern recognition**: AI-assisted extension development + +## Architecture Decisions + +### Three-File Approach: Validated +**Registry-first workflow** positions users well for both human and AI collaboration: +1. **Context before details**: Registry provides ecosystem understanding +2. **Task-specific entry points**: Validation, layout assessment, extension discovery +3. **Flexible consumption**: Users access what they need when they need it + +### DRY Implementation: Successful +**Shared specifications eliminate duplication without over-engineering**: +- CSV format, validation patterns, tier system centralized +- Individual schemas focus on field definitions +- Clear cross-references prevent maintenance issues + +### Extension Strategy: Evolving +**Template variables prove the extension concept**: +- Workflow efficiency gains demonstrated +- Clear placement criteria established +- Foundation for systematic extension development + +## Implementation Priorities + +### Short Term (Next 6 months) +1. **Complete database metadata implementation** + - Define database_metadata.yaml schema + - Implement basic catalog functions + - Integration with existing workflows + +2. **Extension documentation enhancement** + - Template variable usage guide + - Extension development guidelines + - Clear examples and patterns + +3. **Validation tool optimization** + - Performance testing with large datasets + - User experience refinement + - Error message improvement + +### Medium Term (6-18 months) +1. 
**Advanced extension categories** + - Data integration patterns + - Validation enhancement framework + - Project management features + +2. **Cross-project standardization** + - Shared extension patterns + - Compatibility testing + - Migration tools + +3. **AI integration expansion** + - Schema-driven automation + - Pattern recognition tools + - Documentation generation + +### Long Term (18+ months) +1. **Ecosystem standardization** + - Industry adoption patterns + - Standards body engagement + - Interoperability frameworks + +2. **Advanced collaboration features** + - Multi-user workflows + - Version control integration + - Review and approval systems + +## Success Metrics + +### Current Measures +- **AI productivity**: 90% reduction in variable harmonization effort (achieved) +- **Documentation clarity**: Reduced Claude Code errors (achieved) +- **DRY compliance**: Eliminated specification duplication (achieved) + +### Future Measures +- **Extension adoption**: Number of active extensions across projects +- **Cross-project compatibility**: Successful schema sharing between projects +- **User productivity**: Time reduction in metadata creation and validation +- **System reliability**: Reduced validation errors and data quality issues + +## Risk Management + +### Technical Risks +- **Complexity creep**: Monitor abstraction levels, maintain user focus +- **Performance impact**: Three-file reading overhead vs. single-file simplicity +- **Maintenance burden**: Cross-file consistency requirements + +### Adoption Risks +- **User confusion**: Three-file system vs. 
familiar single-file approach +- **Tool compatibility**: Validation tools may need significant updates +- **Migration effort**: Existing projects require schema updates + +### Mitigation Strategies +- **Incremental rollout**: Gradual introduction of new features +- **Backwards compatibility**: Maintain support for existing patterns +- **Clear documentation**: Comprehensive guides and examples +- **User feedback loops**: Regular assessment of pain points and benefits + +--- + +*This roadmap will be updated quarterly based on implementation experience and user feedback.* \ No newline at end of file diff --git a/scope-docs/metadata-schema/metadata_registry.yaml b/scope-docs/metadata-schema/metadata_registry.yaml new file mode 100644 index 00000000..af90070a --- /dev/null +++ b/scope-docs/metadata-schema/metadata_registry.yaml @@ -0,0 +1,215 @@ +schema_version: "1.0.0" +schema_date: "2025-06-22" +description: "Central registry for recodeflow metadata schemas and shared specifications." +purpose: "Single source of truth for shared specifications used across variables.yaml and variable_details.yaml schemas." + +# ============================================================================ +# SHARED SPECIFICATIONS - DRY compliance +# ============================================================================ + +shared_specifications: + csv_format: + description: "Standard CSV formatting rules used across all recodeflow metadata files." + encoding: "UTF-8" + bom: false + delimiter: "," + quote_char: '"' + escape_char: '"' + line_terminator: "\n" + header_required: true + header_case_sensitive: true + quote_when_needed: true + trailing_delimiter: false + blank_lines: "skip" + comment_char: null + + validation_patterns: + description: "Common regex patterns and validation rules used across schemas." 
+ patterns: + variable_name: "^[a-zA-Z_][a-zA-Z0-9_]*$" + semantic_version: "^[0-9]+\\.[0-9]+\\.[0-9]+$" + iso_date: "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" + + # variableStart transformation patterns (used in both schemas) + transformation_patterns: + simple_reference: + pattern: "^\\[[a-zA-Z][a-zA-Z0-9_]*\\]$" + description: "References variable from default/any source database (case-insensitive)." + example: "[ADL_005] or [adl_005]" + + database_mapping: + pattern: "^[a-zA-Z0-9_]+::[a-zA-Z][a-zA-Z0-9_]*$" + description: "Explicit database::variable mapping for single source (case-insensitive)." + example: "cchs2001_p::RACA_6A or cchs2001_p::raca_6a" + + derived_variable: + pattern: "^DerivedVar::\\[([a-zA-Z][a-zA-Z0-9_]*(,\\s*[a-zA-Z][a-zA-Z0-9_]*)*)\\]$" + description: "Computed variable from multiple inputs using derivation function." + example: "DerivedVar::[PAC_4A_cont, PAC_4B_cont]" + + multiple_database_mapping: + pattern: "^[a-zA-Z0-9_]+::[a-zA-Z][a-zA-Z0-9_]*(,\\s*[a-zA-Z0-9_]+::[a-zA-Z][a-zA-Z0-9_]*)*$" + description: "Variable exists in multiple databases with different names (case-insensitive)." + example: "cchs2001_p::RACA_6A, cchs2003_p::RACC_6A" + + mixed_pattern: + pattern: "^(\\[[a-zA-Z][a-zA-Z0-9_]*\\]|[a-zA-Z0-9_]+::[a-zA-Z][a-zA-Z0-9_]*|DerivedVar::\\[[a-zA-Z][a-zA-Z0-9_]+(,\\s*[a-zA-Z][a-zA-Z0-9_]*)*\\])(,\\s*(\\[[a-zA-Z][a-zA-Z0-9_]*\\]|[a-zA-Z0-9_]+::[a-zA-Z][a-zA-Z0-9_]*|DerivedVar::\\[[a-zA-Z][a-zA-Z0-9_]+(,\\s*[a-zA-Z][a-zA-Z0-9_]*)*\\]))*$" + description: "Complex patterns combining multiple transformation types (case-insensitive)." + example: "cchs2001_p::RACA_6A, cchs2003_p::RACC_6A, [ADL_01]" + + # recStart interval notation patterns + interval_notation: + simple_values: + pattern: "^[a-zA-Z0-9]+$" + description: "Single values (text or numeric)." + examples: ["1", "english", "male"] + + closed_intervals: + pattern: "^\\[[0-9]*\\.?[0-9]*,\\s*[0-9]*\\.?[0-9]*\\]$" + description: "Closed intervals [a,b] - includes both endpoints." 
+ examples: ["[1,3]", "[18.5,24.9]"] + + open_intervals: + pattern: "^\\([0-9]*\\.?[0-9]*,\\s*[0-9]*\\.?[0-9]*\\)$" + description: "Open intervals (a,b) - excludes both endpoints." + examples: ["(0,18.5)", "(65,120)"] + + half_open_intervals: + pattern: "^[\\[\\(][0-9]*\\.?[0-9]*,\\s*[0-9]*\\.?[0-9]*[\\]\\)]$" + description: "Half-open intervals [a,b) or (a,b]." + examples: ["[25,30)", "(18.5,25]"] + + complex_decimal_intervals: + pattern: "^\\[[-]?[0-9]*\\.?[0-9]*,\\s*[-]?[0-9]*\\.?[0-9]*[\\)\\]]$" + description: "Advanced intervals with negative decimals (Health Utility Index, complex scores)." + examples: ["[-0.359,1]", "[0.0487,0.1846)", "[-0.2231,-0.0872)"] + + # dummyVariable naming patterns + dummy_variable_patterns: + recommended_pattern: "^[a-zA-Z0-9_]+_(cat|cont)[0-9]+(_[0-9]+|_NA::[a-z])?$" + description: "Systematic naming for generated variables providing natural grouping." + examples: + categorical: ["age_cat4_1", "age_cat4_2", "smoking_cat3_1"] + continuous: ["bmi_cont1", "height_cont1"] + benefits: "Provides natural grouping and stable ordering for CSV files and git diffs." + + missing_data_standards: + description: "Standardized missing data handling across recodeflow system." + csv_metadata_fields: ["", "NA", "N/A"] + r_compliant_values: ["NA::a", "NA::b", "NA::c", "NA::d"] + usage_guidelines: | + - Use empty strings, "NA", or "N/A" for missing metadata fields + - Use tagged missing values (NA::a, etc.) for survey data patterns + + tier_system: + description: "Standard tier classification system used across schemas." + philosophy: "Users specify tier, system handles complexity." + tiers: + core: + description: "Essential fields required for basic functionality." + presence: "required" + validation_strictness: "strict" + + optional: + description: "Extensions for enhanced documentation and organization." 
+ presence: "recommended" + validation_strictness: "permissive" + + extension: + description: "Enhanced functionality fields for advanced features." + presence: "conditional" + validation_strictness: "permissive" + + versioning: + description: "Best practices for project management, transparency, and reproducibility." + presence: "recommended" + validation_strictness: "permissive" + +# ============================================================================ +# SCHEMA REGISTRY - Current implementation +# ============================================================================ + +schema_registry: + harmonization_schemas: + variables: + file: "variables.yaml" + purpose: "Define harmonized variable attributes, types, labels, and specifications." + target_csv: "variables.csv" + + variable_details: + file: "variable_details.yaml" + purpose: "Define value-level transformations, recoding logic, and categorical mappings." + target_csv: "variable_details.csv" + +# ============================================================================ +# EXTENSION REGISTRY - Current extensions +# ============================================================================ + +extension_registry: + template_variables: + description: "Reusable transformation patterns to avoid categorical structure duplication." + affects_schemas: ["variable_details"] + field_location: "templateVariable field in variable_details.csv" + status: "active" + values: ["Yes", "No", null, "", "{template_name}"] + usage_notes: | + Production-tested functionality since 2022.
Reduces duplication significantly: + - 8 variables × 132 categories = 1,056 rows reduced to 138 rows (87% reduction) + - Template definition: templateVariable = "Yes" + - Template usage: templateVariable = "{template_name}" + +# ============================================================================ +# USAGE GUIDANCE +# ============================================================================ + +usage_guidance: + validation_workflow: + description: "How validation tools should use these specifications." + csv_import_validation: "Use shared_specifications for format requirements and validation rules" + schema_validation: "Use individual schema files for field-specific validation" + cross_validation: "Check variables referenced in variable_details exist in variables.csv" + + file_relationships: + description: "How the three files work together." + registry_first: "Start with metadata_registry.yaml to understand shared specifications" + schema_specific: "Use variables.yaml or variable_details.yaml for detailed field requirements" + + task_specific_usage: + csv_validation: "metadata_registry.yaml + relevant schema file" + layout_assessment: "relevant schema file (variables.yaml or variable_details.yaml)" + extension_discovery: "metadata_registry.yaml extension_registry section" + pattern_validation: "Use transformation_patterns and interval_notation from registry" + + cross_validation_rules: + variable_references: + description: "Variables in variable_details must exist in variables.csv" + validation: "Check variable_details.variable against variables.variable" + + database_consistency: + description: "Database references should be consistent" + validation: "Check databaseStart values match across files" + + template_references: + description: "Template usage must reference existing template definitions" + validation: "templateVariable values must reference defined templates or be 'Yes'/'No'" + + implementation_examples: + dummyVariable_creation: + categorical: | + For age
categories: age_cat4_1, age_cat4_2, age_cat4_3, age_cat4_4 + Pattern: {variable}_{cat|cont}{num_categories}_{category_number} + continuous: | + For BMI: bmi_cont1, height_cont1, weight_cont1 + Pattern: {variable}_{cat|cont}{transformation_number} + + variableStart_usage: + simple: "[HEIGHT] - references HEIGHT variable from any database" + database_specific: "cchs2017_p::HWT_2 - specific database and variable" + derived: "DerivedVar::[HEIGHT_CM, WEIGHT_KG] - calculated from multiple variables" + complex: "cchs2001_p::VAR1, cchs2003_p::VAR2, [VAR3] - mixed sources" + + recStart_patterns: + categorical: "English, French, 1, 2, NA::a" + numeric_ranges: "[18.5,24.9], (0,18), [-0.359,1]" + missing_data: "NA::a (valid skip), NA::b (don't know/refusal)" + diff --git a/scope-docs/metadata-schema/variable_details.yaml b/scope-docs/metadata-schema/variable_details.yaml index 0580c6ae..013d4c0d 100644 --- a/scope-docs/metadata-schema/variable_details.yaml +++ b/scope-docs/metadata-schema/variable_details.yaml @@ -1,204 +1,317 @@ -schema_version: "0.1" -schema_date: "2025-01-01" -description: "Core variable details schema including active templateVariable development" +schema_version: "1.0.0" +schema_date: "2025-06-22" +description: "Variable details schema for recodeflow - defines transformation rules and recoding specifications for harmonizing data across multiple sources." +registry_file: "metadata_registry.yaml" + +# Note: Shared specifications (CSV format, tier system, validation patterns, etc.) +# are defined in metadata_registry.yaml to maintain DRY principles variable_details_schema: - title: "CCHSFlow Variable Details Configuration" - description: "Schema for variable_details.csv, defining value-level attributes, recoding logic, and categorical value labels for variables." 
- version: "0.1" - id_column_name: "file_row_id" + title: "Variable details configuration" + description: "Defines value-level transformations, recoding logic, and categorical mappings for data harmonization projects." + + id_column_name: "dummyVariable" + + # Column order based on cchsflow production + recodeflow extensions expected_column_order: - - "file_row_id" + # Core fields (positions 1-16) - cchsflow production compatibility - "variable" - - "templateVariable" # Active field - currently in development - "dummyVariable" - "typeEnd" + - "databaseStart" + - "variableStart" - "typeStart" - - "databaseStart" # camelCase convention - - "variableStart" # camelCase convention - - "variableStartLabel" # camelCase convention - - "numValidCat" - "recEnd" + - "numValidCat" - "catLabel" - "catLabelLong" - "units" - "recStart" - - "variableStartShortLabel" # camelCase convention + - "catStartLabel" + - "variableStartShortLabel" + - "variableStartLabel" - "notes" + # Extension fields (positions 17+) + - "templateVariable" + # Versioning fields (far right) + - "version" + - "lastUpdated" + - "harmonizationStatus" + - "reviewNotes" + # Field definitions organized by tier fields: - - name: "file_row_id" - title: "File Row Identifier" - description: "Unique identifier for the row within this CSV file. Generated using format: detail_{variable_name}_{sequence}" - type: "string" - constraints: - required: true - unique: true - pattern: "^detail_[a-zA-Z0-9_.]+_[0-9]{3}$" # Semantic ID pattern with sequence + # ============================================================================ + # CORE FIELDS - Essential for any recodeflow project + # ============================================================================ + - name: "variable" - title: "Variable Name" - description: "Canonical name of the variable this detail row pertains to. Foreign key to variables.csv." 
- type: "string" - constraints: - required: true - - name: "templateVariable" - title: "Template Variable Indicator" - description: "Indicates if this variable follows a template pattern or references another template variable" + title: "Variable name" + description: "Name of the harmonized variable being created." type: "string" + tier: "core" constraints: - required: false - # Permissive validation - allowing both "Yes"/"No" and template variable names + pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$" + foreign_key: "variables.csv:variable" + notes: | + This should match a variable name defined in your variables.csv file. + Use descriptive names that clearly indicate what the variable represents. + - name: "dummyVariable" - title: "Dummy Variable Indicator" - description: "Indicates if this row defines a dummy variable created during recoding (eg, for a category of a categorical variable)." + title: "Row identifier" + description: "Unique identifier for each transformation rule." type: "string" + tier: "core" constraints: - required: false + unique: true + pattern_reference: "See metadata_registry.yaml dummy_variable_patterns for naming guidelines" + notes: | + Primary identifier for variable detail rows. Provides stable ordering for CSV files and git diffs. + + Recommended pattern: {variable}_{cat|cont}{number}_{category} + Examples: + - Categorical: age_cat4_1, age_cat4_2, smoking_cat3_1 + - Continuous: bmi_cont1, height_cont1, weight_cont1 + + Benefits: Natural grouping, stable CSV ordering, meaningful git diffs. + For complete patterns and examples, see metadata_registry.yaml dummy_variable_patterns. + - name: "typeEnd" - title: "Target Data Type" - description: "The data type of the variable *after* recoding or as its final representation (eg, categorical, numeric)." + title: "Target data type" + description: "Type of the variable after harmonization." 
type: "string" + tier: "core" constraints: - required: false - - name: "typeStart" - title: "Source Data Type" - description: "The data type of the variable *before* recoding or in its original form." - type: "string" - constraints: - required: false + enum: ["cat", "cont"] + notes: | + - "cat" for categorical variables (factors with discrete levels) + - "cont" for continuous variables (numeric measurements) + - name: "databaseStart" - title: "Original Database Name" - description: "Name of the original database or data source for this variable detail." + title: "Source database" + description: "Name of the original database or data source." type: "string" - constraints: - required: false + tier: "core" + notes: | + Identifies which database this transformation rule applies to. + Examples: "cchs2017_p", "rai_hc_2019", "custom_survey_2024" + - name: "variableStart" - title: "Original Variable Name or Source Value" - description: "Name of the original variable or specific source value being recoded." + title: "Source variable name" + description: "Name of the original variable being transformed." type: "string" + tier: "core" constraints: - required: false - - name: "variableStartLabel" - title: "Original Variable Label" - description: "Label of the original variable in the source database." + pattern_reference: "See metadata_registry.yaml transformation_patterns for validation rules" + notes: | + Specifies how to find the source data for transformation. + Uses same transformation patterns as variables.yaml variableStart field. + + Supports multiple patterns (case-insensitive): + - Simple reference: [HEIGHT] or [height] + - Database-specific: cchs2017_p::HWT_2 or cchs2017_p::hwt_2 + - Derived variables: DerivedVar::[HEIGHT_CM, WEIGHT_KG] + - Multiple sources: cchs2017_p::VAR1, cchs2019_p::VAR2 + - Complex mixed: cchs2001_p::RACA_6A, cchs2003_p::RACC_6A, [ADL_01] + + For complete validation patterns, see metadata_registry.yaml transformation_patterns. 
+ + - name: "typeStart" + title: "Source data type" + description: "Type of the variable in its original form." type: "string" + tier: "core" constraints: - required: false - - name: "numValidCat" - title: "Number of Valid Categories" - description: "For categorical variables, the number of distinct valid categories." - type: "integer" - constraints: - required: false - minimum: 0 + enum: ["cat", "cont", "N/A"] + notes: | + Helps understand the transformation being performed. + Use "N/A" for derived variables or when type doesn't apply. + - name: "recEnd" - title: "Recoded Value (Target)" - description: "The target value after recoding. For categorical variables, this is the value being labelled." + title: "Target value" + description: "The harmonized value after transformation." + type: "string" + tier: "core" + notes: | + Defines what value this rule produces in the harmonized dataset. + Common patterns: + - Categorical codes: "1", "2", "3" + - Missing data: "NA::a", "NA::b" + - Functions: "Func::bmi_calculation" + - Copy original: "copy" + + - name: "numValidCat" + title: "Number of valid categories" + description: "Total count of valid (non-missing) categories for categorical variables." type: "string" + tier: "core" constraints: - required: false + pattern: "^([0-9]+|N/A)$" + notes: | + For categorical variables, specify the total number of meaningful categories. + Use "N/A" for continuous variables or when not applicable. + - name: "catLabel" - title: "Category Label (Short)" - description: "Short label for a specific category of a categorical variable." + title: "Category label" + description: "Short, display-friendly label for this category." type: "string" - constraints: - required: false + tier: "core" + notes: | + Brief labels suitable for charts, tables, and user interfaces. 
+ Examples: "Male", "High", "18-24 years" + - name: "catLabelLong" - title: "Category Label (Long)" - description: "Long, descriptive label for a specific category of a categorical variable." + title: "Detailed category label" + description: "Comprehensive description for documentation and codebooks." type: "string" - constraints: - required: false + tier: "core" + notes: | + Full descriptive labels for complete documentation. + Examples: "Body mass index 25.0-29.9 (overweight)", "Valid skip due to survey logic" + - name: "units" - title: "Units" - description: "Units of measurement, if applicable to this specific variable detail or category." + title: "Measurement units" + description: "Units of measurement for the variable." type: "string" - constraints: - required: false + tier: "core" + notes: | + Specify units for continuous variables to ensure proper interpretation. + Examples: "kg", "years", "cm", "minutes/day", "score (0-100)" + Leave blank for categorical variables. + - name: "recStart" - title: "Recode From Value (Source)" - description: "The original value or range that is being recoded to 'recEnd'." + title: "Source value or range" + description: "Original value or condition that triggers this transformation." type: "string" + tier: "core" constraints: - required: false + pattern_reference: "See metadata_registry.yaml interval_notation for validation rules" + notes: | + Defines what source data matches this transformation rule. + Enhanced interval notation based on real-world validation with 3,577 records. + + Supports comprehensive patterns: + - Single values: "1", "male", "english" + - Closed intervals: "[18.5,24.9]" (includes endpoints) + - Open intervals: "(0,18.5)" (excludes endpoints) + - Half-open: "[25,30)", "(18.5,25]" + - Complex decimals: "[-0.359,1]", "[0.0487,0.1846)" + - Missing data: "NA::a", "NA::b" + - Default case: "else" + + For complete interval notation patterns, see metadata_registry.yaml interval_notation. 
+ + - name: "catStartLabel" + title: "Source category label" + description: "Label describing the original category being transformed." + type: "string" + tier: "core" + notes: | + Documents what the source category represents in the original data. + Helpful for understanding transformations and maintaining documentation. + - name: "variableStartShortLabel" - title: "Original Variable Short Label" - description: "Short label of the original variable in the source database." + title: "Source variable short label" + description: "Brief label for the source variable." type: "string" - constraints: - required: false + tier: "core" + notes: | + Abbreviated description of the source variable for compact displays. + + - name: "variableStartLabel" + title: "Source variable label" + description: "Full descriptive label of the source variable." + type: "string" + tier: "core" + notes: | + Complete description of what the source variable measures or represents. + Should match the official documentation from the source database. + - name: "notes" - title: "Notes" - description: "Specific notes or comments related to this variable detail or recoding rule." + title: "Transformation notes" + description: "Additional comments or documentation for this transformation rule." type: "string" - constraints: - required: false - - missingValues: ["", "NA", "N/A"] - allow_additional_columns: true # Permissive during recodeflow development - extension_schema: null + tier: "core" + notes: | + Use for any special considerations, assumptions, or explanations needed + to understand or maintain this transformation rule. 
---- - -# Template System Schema (inst/metadata/schemas/core/templates.yaml) -schema_version: "0.1" -schema_date: "2025-01-01" -description: "Template variable system documentation and validation rules" - -template_system_schema: - title: "Template Variable System" - description: "Schema for template variable inheritance and validation in recodeflow development" - version: "0.1" - - # How to identify template definitions - template_definitions: - marker_field: "templateVariable" - marker_values: ["Yes"] - required_fields: - - "variable" # Template must have a name (serves as template identifier) - - "typeEnd" # Template must define output type - - "recStart" # Template must define source values - - "recEnd" # Template must define target values + # ============================================================================ + # EXTENSION FIELDS - Enhanced functionality + # ============================================================================ - # How template inheritance works - template_inheritance: - reference_field: "templateVariable" - reference_pattern: "template_name" # References existing template by variable name - required_fields: - - "variable" # Using variable must have unique name - - "variableStart" # Using variable must define source mapping - inheritance_rules: - - "Template recoding rules (recStart/recEnd) are inherited" - - "Using variable defines its own source mapping (variableStart)" - - "Type information (typeEnd/typeStart) can be inherited or overridden" - - # Validation rules for template system - validation_rules: - template_existence: - description: "Referenced templates must exist in the same variable_details file" - rule: "If templateVariable != 'Yes' and templateVariable != 'No', then variable with that name and templateVariable = 'Yes' must exist" - - circular_references: - description: "Templates cannot reference other templates" - rule: "If templateVariable = 'Yes', then variable cannot reference another template" + - name: 
"templateVariable" + title: "Template system indicator" + description: "Enables reusable transformation patterns to avoid duplication." + type: "string" + tier: "extension" + constraints: + enum: ["Yes", "No", null, ""] + usage_reference: "See metadata_registry.yaml extension_registry for implementation details" + default_value: "No" + notes: | + Reduces duplication for intended purposes: + - 8 variables × 132 categories = 1,056 rows reduced to 138 rows (87% reduction) + + Values: + - "Yes": This row defines a reusable template + - "No": Normal variable (not using templates) + - Template name: This variable extends the named template + + For complete usage guidance and examples, see metadata_registry.yaml extension_registry. + + # ============================================================================ + # VERSIONING FIELDS - Professional project management + # ============================================================================ - consistent_typing: - description: "Template usage should maintain type consistency" - rule: "Variables using templates should have compatible typeEnd values" + - name: "version" + title: "Version number" + description: "Semantic version of this variable detail definition." + type: "string" + tier: "versioning" + constraints: + pattern: "^[0-9]+\\.[0-9]+\\.[0-9]+$" + notes: | + Track changes to transformation rules using semantic versioning (e.g., 1.0.0). + Increment for changes: major.minor.patch + + - name: "lastUpdated" + title: "Last updated" + description: "Date when this transformation rule was last modified." + type: "string" + tier: "versioning" + format: "date" + constraints: + pattern: "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" + notes: | + Use ISO date format: YYYY-MM-DD + Helps track when changes were made for collaboration and maintenance. + + - name: "harmonizationStatus" + title: "Harmonization status" + description: "Current status of this transformation in your workflow." 
+ type: "string" + tier: "versioning" + constraints: + enum: ["development", "active", "not_harmonizable", "pending_review"] + notes: | + Track the progress of harmonization work: + - "development": Still being developed or tested + - "active": Ready for production use + - "not_harmonizable": Cannot be harmonized (document why in reviewNotes) + - "pending_review": Needs review before finalization + + - name: "reviewNotes" + title: "Review notes" + description: "Notes about harmonization decisions and review outcomes." + type: "string" + tier: "versioning" + notes: | + Document decisions, rationale, and any issues discovered during review. + Useful for team collaboration and future reference. - # Examples for documentation - examples: - simple_template: - description: "Basic language template example" - template_definition: - variable: "lang" - templateVariable: "Yes" - typeEnd: "cat" - recStart: ["english", "french"] - recEnd: ["1", "2"] - - template_usage: - variable: "primary_lang" - templateVariable: "lang" - variableStart: "[PL]" - # Inherits: typeEnd="cat", recStart/recEnd mappings + # Configuration options (schema-specific) + allow_additional_columns: true + extension_schema: null + + # Note: Missing data handling, validation modes, and extensions are defined in metadata_registry.yaml \ No newline at end of file diff --git a/scope-docs/metadata-schema/variables.yaml b/scope-docs/metadata-schema/variables.yaml index b9ab0cc6..ebc3af57 100644 --- a/scope-docs/metadata-schema/variables.yaml +++ b/scope-docs/metadata-schema/variables.yaml @@ -1,296 +1,224 @@ -schema_version: "0.1" -schema_date: "2025-01-01" -description: "Core variables schema describing the format and specifications of variables (variables.csv) within the recodeflow package." +schema_version: "1.0.0" +schema_date: "2025-06-22" +description: "Variables schema for recodeflow - defines structure and metadata for variables.csv files used in data harmonization projects." 
+registry_file: "metadata_registry.yaml" -# CSV file format specification -csv_format: - encoding: "UTF-8" # Character encoding - bom: false # No UTF-8 BOM (Byte Order Mark) - delimiter: "," # Field delimiter - quote_char: '"' # Character used to quote fields - escape_char: '"' # Escape character (doubled quotes) - line_terminator: "\n" # Unix-style line endings (LF) - header_required: true # First row must contain column names - header_case_sensitive: true # Column names must match exactly - quote_when_needed: true # Quote fields containing delimiter, quotes, or newlines - trailing_delimiter: false # No comma after last field - blank_lines: "skip" # Skip blank lines - comment_char: null # No comment lines allowed +# Note: Shared specifications (CSV format, tier system, validation patterns, etc.) +# are defined in metadata_registry.yaml to maintain DRY principles variables_schema: - title: "cchsflow variables configuration" - description: "Schema for variables.csv, defining master variable attributes, types, and labels. Used for data processing and metadata generation within the cchsflow project." - version: "0.1" + title: "Variables configuration" + description: "Defines master variable attributes, types, labels, and specifications for harmonization projects." 
- id_column_name: "fileRowId" + id_column_name: "variable" + + # Column order - core fields first, then optional, then versioning expected_column_order: - - "fileRowId" + # Core fields - essential for any project - "variable" - "label" - - "labelLong" + - "labelLong" + - "variableType" + - "databaseStart" + - "variableStart" + # Optional fields - enhanced documentation - "subject" - "section" - - "variableType" - - "databaseStart" - "units" - - "variableStart" - "notes" - - "description" + - "description" + # Extension fields - enhanced functionality + # Currently no extension fields for variables + # Versioning fields - professional workflows - "version" - "lastUpdated" - "harmonizationStatus" - "reviewNotes" + # Field definitions organized by tier fields: - - name: "fileRowId" - title: "File row identifier" - description: "Unique identifier for the row within this CSV file. Generated using format: var_{variable_name}" - type: "string" - constraints: - required: true - unique: true - pattern: "^var_[a-zA-Z0-9_.]+$" # Semantic ID pattern + # ============================================================================ + # CORE FIELDS - Essential for any recodeflow project + # ============================================================================ + - name: "variable" title: "Variable name" - description: "Canonical and unique name of the variable being described. Used as a key." + description: "Unique name of the harmonized variable you are creating." type: "string" + tier: "core" constraints: - required: true unique: true - pattern: "^[a-zA-Z_][a-zA-Z0-9_.]*$" + pattern: "^[a-zA-Z_][a-zA-Z0-9_]*$" + notes: | + Choose descriptive names that clearly indicate what the variable represents. + Naming rule: start with a letter (a leading underscore is accepted by the pattern but is not a syntactic R name), then use letters/numbers/underscores only.
+ Examples: age_group, bmi_category, smoking_status + - name: "label" title: "Short label" - description: "A concise, human-readable label for the variable (eg, for chart axes, short displays)." + description: "Brief, human-readable label for displays and charts." type: "string" - constraints: - required: true + tier: "core" + notes: | + Keep concise (under 20 characters) for use in charts, tables, and compact displays. + Examples: "Age group", "BMI category", "Income level" + - name: "labelLong" title: "Long label" - description: "A longer, more descriptive human-readable label for the variable (eg, for full descriptions, codebook)." + description: "Detailed description for documentation and codebooks." type: "string" - constraints: - required: true + tier: "core" + notes: | + Comprehensive description used in data dictionaries and documentation. + Include operational definitions and important context. + Examples: "Body mass index categories based on WHO classification", + "Age at time of interview, grouped into 10-year intervals" + + - name: "variableType" + title: "Variable type" + description: "Whether the variable represents categories or continuous measurements." + type: "string" + tier: "core" + constraints: + enum: ["Categorical", "Continuous"] + notes: | + Determines how rec_with_table() processes the variable: + - "Categorical": Discrete categories or groups (factors with levels) + - "Continuous": Numeric measurements or counts + + - name: "databaseStart" + title: "Source database(s)" + description: "Name(s) of the original database(s) containing this variable's source data." + type: "string" + tier: "core" + notes: | + Identifies which databases contain the source data for this harmonized variable. 
+ Examples: + - Single database: "cchs2017_p", "rai_hc_2019" + - Multiple databases: "cchs2017_p, cchs2019_p, cchs2021_p" + + - name: "variableStart" + title: "Source variable specification" + description: "How to find and combine source data to create this harmonized variable." + type: "string" + tier: "core" + constraints: + pattern_reference: "See metadata_registry.yaml transformation_patterns for validation rules" + notes: | + Specifies the transformation pattern for creating this variable. + Critical for data harmonization workflows - tells rec_with_table() how to find and transform source data. + + Supports multiple patterns (case-insensitive): + - Simple reference: [HEIGHT] or [height] + - Database-specific: cchs2017_p::HWT_2 or cchs2017_p::hwt_2 + - Derived variables: DerivedVar::[HEIGHT_CM, WEIGHT_KG] + - Multiple sources: cchs2017_p::VAR1, cchs2019_p::VAR2 + - Complex mixed: cchs2001_p::RACA_6A, cchs2003_p::RACC_6A, [ADL_01] + + For complete validation patterns and examples, see metadata_registry.yaml transformation_patterns. + + # ============================================================================ + # OPTIONAL FIELDS - Enhanced documentation and organization + # ============================================================================ + - name: "subject" title: "Subject area" - description: "The general subject or thematic area the variable belongs to (eg, demographics, health behaviours)." + description: "Thematic area or domain this variable belongs to." type: "string" - constraints: - required: false + tier: "optional" + notes: | + Groups variables by topic for better organization. + Examples: "Demographics", "Health behaviors", "Social determinants", "Physical measures" + - name: "section" title: "Survey section" - description: "Specific section or module of the survey/dataset where the variable originates." 
- type: "string" - constraints: - required: false - - name: "variableType" - title: "Variable type" - description: "The nature or type of the variable indicating how it should be analyzed and processed." + description: "Specific section or module where this variable originates." type: "string" - constraints: - required: true - enum: - - "Continuous" - - "Categorical" - - name: "databaseStart" - title: "Original database name" - description: "Name of the original database or data source from which this variable was derived or sourced." - type: "string" - constraints: - required: true + tier: "optional" + notes: | + Identifies the source questionnaire section or data collection module. + Examples: "Core demographics", "Health status", "Physical activity module" + - name: "units" title: "Measurement units" - description: "Units of measurement for the variable, if applicable (eg, kg, years, minutes/day)." + description: "Units of measurement for continuous variables." type: "string" - constraints: - required: false - - name: "variableStart" - title: "Source variables and transformation rules" - description: | - Specifies the source variable(s) used to create this variable. - This field supports multiple patterns for mapping source data to harmonized variables, - including simple references, database mappings, and derived variable calculations. - See patterns section below for formal specifications. 
- type: "string" - constraints: - required: true - patterns: - - name: "simple_reference" - pattern: "^\\[[A-Z][A-Z0-9_]*\\]$" - description: "Default pattern - references variable from any unspecified source" - example: "[ADL_005]" - - name: "database_mapping" - pattern: "^[a-zA-Z0-9_]+::[A-Z][A-Z0-9_]*$" - description: "Explicit database::variable mapping" - example: "cchs2001_p::RACA_6A" - - name: "derived_variable" - pattern: "^DerivedVar::\\[([A-Z][A-Z0-9_]*(,\\s*[A-Z][A-Z0-9_]*)*)\\]$" - description: "Derived variable calculation from multiple sources" - example: "DerivedVar::[HEIGHT, WEIGHT]" - - name: "multiple_sources" - pattern: "^([a-zA-Z0-9_]+::[A-Z][A-Z0-9_]*(,\\s*)?)+$" - description: "Multiple database::variable mappings" - example: "cchs2001_p::RACA_6A, cchs2003_p::RACC_6A" - - name: "combined_pattern" - pattern: "^([a-zA-Z0-9_]+::[A-Z][A-Z0-9_]*(,\\s*)?)+(,\\s*)?\\[[A-Z][A-Z0-9_]*\\]$" - description: "Multiple sources with default reference" - example: "cchs2001_p::RACA_6A, cchs2003_p::RACC_6A, [ADL_01]" + tier: "optional" + notes: | + Essential for continuous variables to ensure proper interpretation. + Examples: "kg", "years", "cm", "minutes/day", "score (0-100)" + Leave blank for categorical variables. + - name: "notes" - title: "Notes" - description: "General notes, comments, or annotations about the variable." - type: "string" - constraints: - required: false + title: "General notes" + description: "Additional comments or important information about this variable." + type: "string" + tier: "optional" + notes: | + Use for any special considerations, limitations, or usage notes. + Examples: methodology notes, data quality issues, interpretation guidance. + - name: "description" title: "Detailed description" - description: "A more detailed textual description or definition of the variable, potentially including operationalisation." + description: "Comprehensive definition including methodology and operational details." 
type: "string" - constraints: - required: false + tier: "optional" + notes: | + Full technical description including how the variable is constructed, + any assumptions made, and methodological considerations. + + # ============================================================================ + # VERSIONING FIELDS - Best-practice project management + # ============================================================================ + - name: "version" - title: "Variable version" - description: "Semantic version of this variable definition (e.g., 1.0.0)" + title: "Version number" + description: "Semantic version of this variable definition." type: "string" + tier: "versioning" constraints: - required: true pattern: "^[0-9]+\\.[0-9]+\\.[0-9]+$" + notes: | + Track changes using semantic versioning (e.g., 1.0.0). + Increment for changes: major.minor.patch + - name: "lastUpdated" - title: "Last updated date" - description: "Date when this variable definition was last modified (ISO format: YYYY-MM-DD)" + title: "Last updated" + description: "Date when this variable definition was last modified." type: "string" + tier: "versioning" format: "date" constraints: - required: true + pattern: "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" + notes: | + Use ISO date format: YYYY-MM-DD + Helps track when changes were made for collaboration and maintenance. + - name: "harmonizationStatus" title: "Harmonization status" - description: "Current status of the variable in the harmonization process" + description: "Current status of this variable in your harmonization workflow." 
type: "string" - constraints: - required: true - enum: - - "development" - - "active" - - "not_harmonizable" - - "pending_review" + tier: "versioning" + constraints: + enum: ["development", "active", "not_harmonizable", "pending_review"] + notes: | + Track progress of harmonization work: + - "development": Still being developed or tested + - "active": Ready for production use + - "not_harmonizable": Cannot be harmonized across sources + - "pending_review": Needs review before finalization + - name: "reviewNotes" title: "Review notes" - description: "Brief notes about harmonization decisions, links to GitHub issues/discussions, or references to external documentation" + description: "Notes about harmonization decisions and review outcomes." type: "string" - constraints: - required: false - - missingValues: ["", "NA", "N/A"] + tier: "versioning" + notes: | + Document decisions, rationale, and any issues discovered during review. + Useful for team collaboration and future reference. - # Validation rules for cross-field dependencies - validation_rules: - cross_field: - - rule: "units_required_for_continuous" - condition: "variableType == 'Continuous'" - requirement: "units != null" - level: "warning" - message: "Continuous variable '{variable}' should have units specified" - - rule: "review_notes_for_not_harmonizable" - condition: "harmonizationStatus == 'not_harmonizable'" - requirement: "reviewNotes != null" - level: "warning" - message: "Variable '{variable}' marked as not_harmonizable should have reviewNotes explaining why" - - rule: "subject_section_mutual_exclusive" - condition: "subject != null AND section != null" - requirement: "section == 'N/A'" - level: "info" - message: "Variable '{variable}' has both subject and section; typically these are mutually exclusive" - - pattern_validation: - # Note: variableStart can contain mixed patterns (e.g., database::var + [var]) - # Pattern precedence to be determined in future versions - mixed_patterns_allowed: true - 
validation_message: "variableStart for '{variable}' does not match any known pattern: {value}" - - # Permissive for v0.1 - coordinate with recodeflow development - allow_additional_columns: true # Allow development flexibility + # Configuration options (schema-specific) + allow_additional_columns: true extension_schema: null - - # ============================================================================ -# PROJECT-SPECIFIC METADATA: CCHS (Canadian Community Health Survey) -# ============================================================================ -cchs_metadata: - title: "CCHS-specific configuration" - description: "Metadata specific to the Canadian Community Health Survey harmonization project" - version: "0.1" - - # Source database naming conventions - source_databases: - pattern: "cchs{YEAR}_{TYPE}" - description: "CCHS database naming convention" - year_formats: - - "YYYY" # e.g., cchs2001 - - "YYYY_YYYY" # e.g., cchs2007_2008 - type_suffixes: - - code: "p" - description: "Public Use Microdata File (PUMF)" - - code: "i" - description: "ICES data (temporary reference to institute)" - - code: "s" - description: "Shared file (contains variables)" - - code: "m" - description: "Master file" - examples: - - "cchs2001_p" - - "cchs2017_2018_i" - - "cchs2015_2016_s" - - "cchs2015_2016_m" - - # Field-specific enumerations for CCHS - field_enums: - subject: - description: "CCHS subject areas" - values: - - "ADL" - - "Age" - - "Alcohol" - - "BMI" - - "Chronic condition" - - "Diet" - - "Education" - - "Ethnicity" - - "Exercise" - - "Food security" - - "Health behaviour" - - "Health care use" - - "Health status" - - "Height" - - "Home ownership" - - "Household type" - - "Immigration" - - "Income" - - "Indigenous" - - "Life satisfaction" - - "Marital status" - - "Need" - - "Number" - - "Oral health" - - "Province" - - "Proxy" - - "Race" - - "Rec" - - "Sample" - - "Sex" - - "Sleep" - - "Smoking" - - "Vaccination" - - "Weight" - - section: - description: "CCHS survey 
sections" - values: - - "Demographics" - - "Health behaviour" - - "Health care use" - - "Health status" - - "N/A" - - "Sociodemographics" - # Additional CCHS-specific metadata - notes: - - "Variables with '_A' suffix - To be confirmed (may be newer CCHS variable naming convention)" - - "DerivedVar patterns often combine multiple survey waves or create composite scores" - - "Some variables may not have entries from all survey years due to questionnaire changes" \ No newline at end of file + # Note: Missing data handling, validation modes, and extensions are defined in metadata_registry.yaml \ No newline at end of file From 12de9680d0cb685bce108f421308a23550b8c153 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Sun, 22 Jun 2025 14:22:00 -0400 Subject: [PATCH 3/4] Add database metadata schema with Dublin Core compliance - Implements Dublin Core standard with 10 core fields from PRs #65 and #43 - Follows three-file architecture with registry reference for DRY principles - Includes recodeflow-specific extensions for workflow integration - Supports metadata file naming conventions and validation rules --- .../metadata-schema/database_metadata.yaml | 266 ++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 scope-docs/metadata-schema/database_metadata.yaml diff --git a/scope-docs/metadata-schema/database_metadata.yaml b/scope-docs/metadata-schema/database_metadata.yaml new file mode 100644 index 00000000..bf0e5ee4 --- /dev/null +++ b/scope-docs/metadata-schema/database_metadata.yaml @@ -0,0 +1,266 @@ +schema_version: "1.0.0" +schema_date: "2025-06-22" +description: "Database metadata schema for recodeflow - defines Dublin Core compliant dataset-level metadata for databases and data collections." 
+registry_file: "metadata_registry.yaml" + +# Note: YAML format specifications are defined in metadata_registry.yaml to maintain DRY principles + +database_metadata_schema: + title: "Database metadata configuration" + description: "Defines dataset-level metadata following Dublin Core standards for database documentation and cataloging." + + standard: "Dublin Core with DCAT extensions" + target_format: "YAML metadata files" + + # Field definitions following Dublin Core standard + fields: + # ============================================================================ + # CORE DUBLIN CORE FIELDS - Essential dataset documentation + # ============================================================================ + + - name: "title" + title: "Dataset title" + description: "Name of the dataset." + type: "string" + tier: "core" + dublin_core_element: "dc:title" + constraints: + required: true + notes: | + Provide a clear, concise name for the dataset. + Examples: "Health Survey 2024", "Primary Biliary Cirrhosis (PBC) Data Set" + + - name: "description" + title: "Dataset description" + description: "Detailed explanation of the dataset." + type: "string" + tier: "core" + dublin_core_element: "dc:description" + constraints: + required: true + notes: | + Comprehensive description of the dataset including purpose, scope, and methodology. + Should be sufficient for users to understand if the dataset meets their needs. + + - name: "creator" + title: "Dataset creator" + description: "Person or organization responsible for creating the data." + type: "array" + tier: "core" + dublin_core_element: "dc:creator" + constraints: + required: true + item_structure: + name: "Creator name" + affiliation: "Creator affiliation (optional)" + orcid: "ORCID identifier (optional)" + notes: | + Attribution of data origin and responsibility. 
+ Examples: "Mayo Clinic", "RecodeFlow Team", "Statistics Canada" + + - name: "publisher" + title: "Dataset publisher" + description: "Organization publishing the data." + type: "string" + tier: "core" + dublin_core_element: "dc:publisher" + constraints: + required: true + notes: | + Identifies the official data publisher or distributing organization. + Examples: "Public Health Agency", "Mayo Clinic", "CRAN" + + - name: "subject" + title: "Dataset subject" + description: "Topics covered by the dataset." + type: "array" + tier: "core" + dublin_core_element: "dc:subject" + constraints: + required: true + notes: | + Categorize dataset's thematic content using relevant keywords or controlled vocabularies. + Examples: ["primary biliary cirrhosis", "clinical study", "medical research"] + + - name: "date_created" + title: "Creation date" + description: "Dataset creation date." + type: "string" + tier: "core" + dublin_core_element: "dc:date" + format: "date" + constraints: + required: true + pattern: "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" + notes: | + Use ISO date format: YYYY-MM-DD + Track dataset's initial creation date. + + - name: "date_modified" + title: "Last modification date" + description: "Date when dataset was last modified." + type: "string" + tier: "optional" + dublin_core_element: "dcterms:modified" + format: "date" + constraints: + pattern: "^[0-9]{4}-[0-9]{2}-[0-9]{2}$" + notes: | + Use ISO date format: YYYY-MM-DD + Track most recent updates to the dataset. + + - name: "version" + title: "Dataset version" + description: "Version number of the dataset." + type: "string" + tier: "optional" + dublin_core_element: "dcterms:hasVersion" + constraints: + pattern: "^[0-9]+\\.[0-9]+\\.[0-9]+$" + notes: | + Track dataset iterations using semantic versioning (e.g., 1.0.0). + Increment for changes: major.minor.patch + + - name: "license" + title: "Licensing information" + description: "Licensing and usage rights information." 
+ type: "string" + tier: "core" + dublin_core_element: "dc:rights" + constraints: + required: true + notes: | + Specify usage and distribution rights clearly. + Examples: "CC-BY 4.0", "Open Source", "Restricted - Contact Publisher" + + - name: "contact_point" + title: "Dataset contact" + description: "Contact information for dataset inquiries." + type: "string" + tier: "core" + dublin_core_element: "dcat:contactPoint" + constraints: + required: true + notes: | + Provide communication channel for questions about the dataset. + Examples: "support@example.org", "researcher@institution.edu" + + # ============================================================================ + # EXTENDED DUBLIN CORE / DCAT FIELDS - Enhanced metadata + # ============================================================================ + + - name: "type" + title: "Dataset type" + description: "Type or nature of the dataset." + type: "string" + tier: "optional" + dublin_core_element: "dc:type" + constraints: + enum: ["Dataset", "Survey", "Clinical Trial", "Administrative Data", "Registry"] + notes: | + Classify the nature of the data collection. + + - name: "format" + title: "Dataset format" + description: "Physical or digital manifestation of the dataset." + type: "string" + tier: "optional" + dublin_core_element: "dc:format" + notes: | + Describe the format and structure of the data. + Examples: "Tabular data", "CSV files", "R data frames" + + - name: "identifier" + title: "Dataset identifier" + description: "Unique identifier for the dataset." + type: "array" + tier: "optional" + dublin_core_element: "dc:identifier" + item_structure: + type: "Identifier type" + value: "Identifier value" + notes: | + Provide unique identifiers for referencing the dataset. + Examples: DOI, package name, institutional ID + + - name: "source" + title: "Dataset source" + description: "Source or origin of the dataset." 
+ type: "string" + tier: "optional" + dublin_core_element: "dc:source" + notes: | + Reference to the original source or related datasets. + Examples: URLs, publications, parent datasets + + - name: "language" + title: "Dataset language" + description: "Language(s) used in the dataset." + type: "string" + tier: "optional" + dublin_core_element: "dc:language" + constraints: + pattern: "^[a-z]{2}(-[A-Z]{2})?$" + notes: | + Use ISO 639-1 language codes, optionally followed by an ISO 3166-1 region code (e.g., "en", "fr", "en-CA"). + + - name: "relation" + title: "Related resources" + description: "Relationships to other datasets or resources." + type: "array" + tier: "optional" + dublin_core_element: "dc:relation" + item_structure: + type: "Relationship type" + identifier: "Related resource identifier" + description: "Description of relationship" + notes: | + Document connections to related datasets, publications, or projects. + + - name: "coverage" + title: "Dataset coverage" + description: "Spatial or temporal coverage of the dataset." + type: "object" + tier: "optional" + dublin_core_element: "dc:coverage" + structure: + spatial: "Geographic coverage" + temporal: "Time period coverage" + notes: | + Specify the scope of data collection in space and time. + + # ============================================================================ + # RECODEFLOW-SPECIFIC EXTENSIONS - Integration metadata + # ============================================================================ + + - name: "recodeflow_integration" + title: "Recodeflow integration metadata" + description: "Metadata specific to recodeflow usage and integration." + type: "object" + tier: "extension" + structure: + variables_file: "Associated variables.csv file" + variable_details_file: "Associated variable_details.csv file" + harmonization_notes: "Notes about harmonization approach" + rec_with_table_compatible: "Boolean indicating compatibility" + notes: | + Integration metadata for recodeflow workflow compatibility.
+ Links database metadata to associated variable definition files. + + # Usage patterns + usage_patterns: + metadata_files: + description: "YAML files alongside data files for metadata documentation." + naming_convention: "{dataset_name}_metadata.yaml" + examples: ["pbc_metadata.yaml", "cchs2017_metadata.yaml"] + + # Validation and quality + validation_notes: | + - All required Dublin Core fields must be present + - Date fields must follow ISO 8601 format (YYYY-MM-DD) + - Language codes must follow ISO 639-1 standard + - Contact points should be valid email addresses or URLs + - Version numbers should follow semantic versioning when provided + + # Note: Missing data handling, validation modes, and shared specifications + # are defined in metadata_registry.yaml \ No newline at end of file From 23d303a02e016d438240c0ff80fa851f0b0eaa25 Mon Sep 17 00:00:00 2001 From: Doug Manuel Date: Mon, 23 Jun 2025 10:03:09 -0400 Subject: [PATCH 4/4] Refactor metadata schema architecture with database extensions and usage separation Major changes: - Clean metadata_registry.yaml: removed usage guidance, enhanced extension registry - Add CCHS examples with validation rules and complete variable definitions - Create separate metadata_schema_usage_guide.md for implementation guidance - Register haven::tagged_na() integration and template system extensions - Add cross-validation requirements for schema consistency Architecture improvements: - Clear separation between registry coordination and usage documentation - Database-specific extensions with proper validation rules and examples - Production statistics for template system (87% reduction validated) - Comprehensive CCHS patterns based on 3,577 real records --- .../metadata-schema/metadata_registry.yaml | 86 ++++------ .../metadata_schema_usage_guide.md | 82 ++++++++++ .../variable_details_cchs_example.yaml | 148 ++++++++++++++++++ .../variables_cchs_example.yaml | 109 +++++++++++++ 4 files changed, 371 insertions(+), 54 deletions(-) 
create mode 100644 scope-docs/metadata-schema/metadata_schema_usage_guide.md create mode 100644 scope-docs/metadata-schema/variable_details_cchs_example.yaml create mode 100644 scope-docs/metadata-schema/variables_cchs_example.yaml diff --git a/scope-docs/metadata-schema/metadata_registry.yaml b/scope-docs/metadata-schema/metadata_registry.yaml index af90070a..a5cafc9d 100644 --- a/scope-docs/metadata-schema/metadata_registry.yaml +++ b/scope-docs/metadata-schema/metadata_registry.yaml @@ -140,6 +140,11 @@ schema_registry: file: "variable_details.yaml" purpose: "Define value-level transformations, recoding logic, and categorical mappings." target_csv: "variable_details.csv" + + cross_validation_requirements: + variable_consistency: "variable_details.variable must exist in variables.variable" + database_consistency: "databaseStart values must match between schemas" + template_consistency: "templateVariable references must be valid" # ============================================================================ # EXTENSION REGISTRY - Current extensions @@ -152,64 +157,37 @@ extension_registry: field_location: "templateVariable field in variable_details.csv" status: "active" values: ["Yes", "No", null, "", ""] - usage_notes: | - Production-tested functionality since 2022. Reduces duplication significantly: - - 8 variables × 132 categories = 1,056 rows reduced to 138 rows (87% reduction) - - Template definition: templateVariable = "Yes" - - Template usage: templateVariable = "" + + tagged_missing_data: + description: "Integration with haven::tagged_na() for survey missing data patterns." 
+ affects_schemas: ["variables", "variable_details"] + field_location: "recEnd, recStart fields" + status: "active" + standard_codes: ["NA::a", "NA::b", "NA::c", "NA::d"] # ============================================================================ -# USAGE GUIDANCE +# DATABASE-SPECIFIC EXTENSIONS - Project-specific metadata # ============================================================================ -usage_guidance: - validation_workflow: - description: "How validation tools should use these specifications." - csv_import_validation: "Use shared_specifications for format requirements and validation rules" - schema_validation: "Use individual schema files for field-specific validation" - cross_validation: "Check variables referenced in variable_details exist in variables.csv" - - file_relationships: - description: "How the three files work together." - registry_first: "Start with metadata_registry.yaml to understand shared specifications" - schema_specific: "Use variables.yaml or variable_details.yaml for detailed field requirements" +database_specific_extensions: + description: "Framework for database/project-specific schema extensions and validation rules." 
+ versioning_strategy: | + Database-specific extensions use independent versioning from core schemas: + - Core schemas (variables.yaml, variable_details.yaml): version 1.0.0 + - Database extensions (e.g., variables_cchs.yaml): version 2.2.0+ + - Registry coordination: metadata_registry.yaml version 1.0.0 - task_specific_usage: - csv_validation: "metadata_registry.yaml + relevant schema file" - layout_assessment: "relevant schema file (variables.yaml or variable_details.yaml)" - extension_discovery: "metadata_registry.yaml extension_registry section" - pattern_validation: "Use transformation_patterns and interval_notation from registry" - - cross_validation_rules: - variable_references: - description: "Variables in variable_details must exist in variables.csv" - validation: "Check variable_details.variable against variables.variable" - - database_consistency: - description: "Database references should be consistent" - validation: "Check databaseStart values match across files" + extension_mechanism: + description: "Projects can create database-specific extensions as separate YAML files." + note: "Implementation patterns may vary based on project needs and validation tool requirements." 
- template_references: - description: "Template usage must reference existing template definitions" - validation: "templateVariable values must reference defined templates or be 'Yes'/'No'" - - implementation_examples: - dummyVariable_creation: - categorical: | - For age categories: age_cat4_1, age_cat4_2, age_cat4_3, age_cat4_4 - Pattern: {variable}_{cat|cont}{num_categories}_{category_number} - continuous: | - For BMI: bmi_cont1, height_cont1, weight_cont1 - Pattern: {variable}_{cat|cont}{transformation_number} - - variableStart_usage: - simple: "[HEIGHT] - references HEIGHT variable from any database" - database_specific: "cchs2017_p::HWT_2 - specific database and variable" - derived: "DerivedVar::[HEIGHT_CM, WEIGHT_KG] - calculated from multiple variables" - complex: "cchs2001_p::VAR1, cchs2003_p::VAR2, [VAR3] - mixed sources" - - recStart_patterns: - categorical: "English, French, 1, 2, NA::a" - numeric_ranges: "[18.5,24.9], (0,18), [-0.359,1]" - missing_data: "NA::a (valid skip), NA::b (don't know/refusal)" + supported_databases: + cchs: + description: "Canadian Community Health Survey extensions" + files: ["variables_cchs_example.yaml", "variable_details_cchs_example.yaml"] + version: "2.2.0" + status: "production" + +# Note: Usage guidance and implementation examples are documented separately +# in the metadata schema documentation and individual schema files. diff --git a/scope-docs/metadata-schema/metadata_schema_usage_guide.md b/scope-docs/metadata-schema/metadata_schema_usage_guide.md new file mode 100644 index 00000000..a34edcec --- /dev/null +++ b/scope-docs/metadata-schema/metadata_schema_usage_guide.md @@ -0,0 +1,82 @@ +# Metadata Schema Usage Guide + +This guide provides practical guidance for using the recodeflow metadata schema system. 
+ +## Validation Workflow + +### How validation tools should use these specifications: + +- **CSV import validation**: Use `shared_specifications` from metadata_registry.yaml for format requirements and validation rules +- **Schema validation**: Use individual schema files (variables.yaml or variable_details.yaml) for field-specific validation +- **Cross validation**: Check variables referenced in variable_details exist in variables.csv + +## File Relationships + +### How the three files work together: + +1. **Start with registry**: Begin with metadata_registry.yaml to understand shared specifications +2. **Schema-specific details**: Use variables.yaml or variable_details.yaml for detailed field requirements +3. **Database extensions**: Use database-specific files (e.g., variables_cchs_example.yaml) for project customizations + +## Task-Specific Usage + +- **CSV validation**: metadata_registry.yaml + relevant schema file +- **Layout assessment**: relevant schema file (variables.yaml or variable_details.yaml) +- **Extension discovery**: metadata_registry.yaml extension_registry section +- **Pattern validation**: Use transformation_patterns and interval_notation from registry + +## Cross-Validation Rules + +### Variable references +- **Description**: Variables in variable_details must exist in variables.csv +- **Validation**: Check variable_details.variable against variables.variable + +### Database consistency +- **Description**: Database references should be consistent +- **Validation**: Check databaseStart values match across files + +### Template references +- **Description**: Template usage must reference existing template definitions +- **Validation**: templateVariable values must reference defined templates or be 'Yes'/'No' + +## Implementation Examples + +### dummyVariable Creation + +**Categorical variables:** +``` +For age categories: age_cat4_1, age_cat4_2, age_cat4_3, age_cat4_4 +Pattern: {variable}_{cat|cont}{num_categories}_{category_number} +``` + 
+**Continuous variables:** +``` +For BMI: bmi_cont1, height_cont1, weight_cont1 +Pattern: {variable}_{cat|cont}{transformation_number} +``` + +### variableStart Usage Patterns + +- **Simple**: `[HEIGHT]` - references HEIGHT variable from any database +- **Database specific**: `cchs2017_p::HWT_2` - specific database and variable +- **Derived**: `DerivedVar::[HEIGHT_CM, WEIGHT_KG]` - calculated from multiple variables +- **Complex**: `cchs2001_p::VAR1, cchs2003_p::VAR2, [VAR3]` - mixed sources + +### recStart Patterns + +- **Categorical**: `English, French, 1, 2, NA::a` +- **Numeric ranges**: `[18.5,24.9], (0,18), [-0.359,1]` +- **Missing data**: `NA::a (valid skip), NA::b (don't know/refusal)` + +## Working with Database-Specific Extensions + +### CCHS Example +- Use `variables_cchs_example.yaml` and `variable_details_cchs_example.yaml` as templates +- Follow validation rules defined in these files +- Maintain version consistency (CCHS extensions use v2.2.0+) + +### Creating New Database Extensions +1. Create `variables_{database}_example.yaml` and `variable_details_{database}_example.yaml` +2. Reference `registry_file: "metadata_registry.yaml"` in both files +3. Define database-specific validation rules and field extensions +4. Add entry to `supported_databases` section in metadata_registry.yaml \ No newline at end of file diff --git a/scope-docs/metadata-schema/variable_details_cchs_example.yaml b/scope-docs/metadata-schema/variable_details_cchs_example.yaml new file mode 100644 index 00000000..38758b04 --- /dev/null +++ b/scope-docs/metadata-schema/variable_details_cchs_example.yaml @@ -0,0 +1,148 @@ +schema_version: "2.2.0" +schema_date: "2025-06-22" +description: "CCHS example showing project-specific extensions to variable_details schema." 
+registry_file: "metadata_registry.yaml" + +# Example: Canadian Community Health Survey (CCHS) project-specific metadata +# This demonstrates how projects can extend the core variable_details schema +# with specialized transformation patterns and validation rules. + +cchs_project_metadata: + title: "CCHS variable details configuration example" + description: "Project-specific metadata for CCHS transformation rules and recoding patterns." + + # Enhanced missing data categories for CCHS + missing_data_categories: + description: "CCHS-specific missing data patterns with haven::tagged_na() integration." + categories: + - code: "NA::a" + description: "Valid skip due to survey logic." + haven_equivalent: "haven::tagged_na('a')" + context: "Conditional questions, routing logic" + frequency: "Common" + + - code: "NA::b" + description: "Don't know, refusal, not stated." + haven_equivalent: "haven::tagged_na('b')" + context: "Missing responses, participant refusal" + frequency: "Common" + + - code: "NA::c" + description: "Not applicable (specific conditions)." + haven_equivalent: "haven::tagged_na('c')" + context: "Condition-specific non-applicability" + frequency: "Rare" + + # Real-world interval notation examples + interval_notation_examples: + description: "Complex interval patterns validated with 3,577 CCHS records." + + standard_intervals: + - pattern: "[18.5,24.9]" + description: "BMI normal weight range (closed interval)." + + - pattern: "(0,18.5)" + description: "BMI underweight range (open interval)." + + - pattern: "[25,30)" + description: "BMI overweight range (half-open interval)." + + complex_decimal_intervals: + - pattern: "[-0.359,1]" + description: "Health Utility Index range with negative decimals." + context: "Utility-based health status measurement" + + - pattern: "[0.0487,0.1846)" + description: "Complex calculated score range." 
+        context: "Derived health indicators with high precision"
+
+  # Naming patterns for the Func:: references used by derived variables
+  function_patterns:
+    description: "Common function naming patterns in CCHS recoding."
+    examples:
+      - pattern: "Func::{variable}_fun"
+        description: "Simple transformation function."
+        example: "Func::bmi_cat_fun"
+
+      - pattern: "Func::{variable}_der_fun{n}"  # {n} is the sequence number, e.g. the trailing 2 in the example
+        description: "Derived variable function with sequence number."
+        example: "Func::smoking_der_fun2"
+
+  # CCHS-specific validation rules, keyed by the field (or code set) they constrain
+  validation_rules:
+    recStart:
+      description: "CCHS interval notation must use validated patterns."
+      constraint: "required"
+      validation: "pattern_reference"
+      pattern_reference: "metadata_registry.yaml interval_notation"
+      additional_patterns:
+        cchs_complex_decimals: "^\\[[-]?[0-9]*\\.?[0-9]+,\\s*[-]?[0-9]*\\.?[0-9]+[\\)\\]]$"  # decimal point is optional so integer bounds match, e.g. the documented "[-0.359,1]"
+      error_message: "recStart must use valid CCHS interval notation"
+
+    databaseStart:
+      description: "Database naming must follow CCHS pattern."
+      constraint: "required"
+      validation: "pattern"
+      pattern: "^cchs[0-9]{4}(_[0-9]{4})?_[pism]$"  # 4-digit year, optional second 4-digit year, then one of p/i/s/m
+      error_message: "databaseStart must follow cchs{YEAR}_{TYPE} pattern"
+
+    missing_data_codes:
+      description: "Missing data must use CCHS haven::tagged_na() integration."
+      validation: "enum"
+      allowed_values: ["NA::a", "NA::b", "NA::c", "NA::d"]  # NOTE(review): "NA::d" has no entry in missing_data_categories (only a, b, c) - confirm
+      error_message: "Use CCHS-approved missing data codes with haven::tagged_na()"
+
+# ============================================================================
+# COMPLETE VARIABLE DETAIL EXAMPLES - rows written against the CCHS rules in this file
+# ============================================================================
+
+example_variable_details:
+  age_group_example:  # a single row is shown although numValidCat is "4"
+    - variable: "age_group"
+      dummyVariable: "age_group_cat4_1"
+      typeEnd: "cat"
+      databaseStart: "cchs2017_p"  # matches the validation_rules.databaseStart pattern
+      variableStart: "[AGE_12]"
+      typeStart: "cont"
+      recEnd: "1"
+      numValidCat: "4"
+      catLabel: "18-34 years"
+      catLabelLong: "Young adults aged 18 to 34 years"
+      units: ""
+      recStart: "[18,35)"  # half-open: 18 included, 35 excluded
+      catStartLabel: "Age 18-34"  # NOTE(review): catStartLabel is not in the v0.1 expected_column_order of variable_details.yaml - confirm
+      notes: "Standard CCHS age grouping for young adults"
+
+  bmi_category_example:  # two rows: the first valid category and the NA::a row
+    - variable: "bmi_category"
+      dummyVariable: "bmi_cat4_1"  # NOTE(review): age_group uses {variable}_cat4_1; the prefix here is "bmi", not "bmi_category" - confirm
+      typeEnd: "cat"
+      databaseStart: "cchs2017_p"
+      variableStart: "DerivedVar::[HEIGHT_M, WEIGHT_KG]"  # DerivedVar:: form lists the source variables
+      typeStart: "cont"
+      recEnd: "1"
+      numValidCat: "4"
+      catLabel: "Underweight"
+      catLabelLong: "Body mass index less than 18.5 (underweight)"
+      units: "kg/m²"
+      recStart: "(0,18.5)"  # open interval: 0 and 18.5 both excluded
+      catStartLabel: "BMI < 18.5"
+      notes: "WHO classification for underweight BMI category"
+
+    - variable: "bmi_category"
+      dummyVariable: "bmi_cat4_NA_a"
+      typeEnd: "cat"
+      databaseStart: "cchs2017_p"
+      variableStart: "DerivedVar::[HEIGHT_M, WEIGHT_KG]"
+      typeStart: "cont"
+      recEnd: "NA::a"  # the missing-data code is carried through unchanged (recStart is also "NA::a")
+      numValidCat: "4"
+      catLabel: "Valid skip"
+      catLabelLong: "BMI calculation not applicable - valid skip"
+      units: "kg/m²"
+      recStart: "NA::a"
+      catStartLabel: "Missing - valid skip"
+      notes: "Height or weight missing due to survey logic"
+
+# Note: this file demonstrates project-specific extensions only.
+# Core schema specifications are defined in metadata_registry.yaml.
\ No newline at end of file
diff --git a/scope-docs/metadata-schema/variables_cchs_example.yaml b/scope-docs/metadata-schema/variables_cchs_example.yaml
new file mode 100644
index 00000000..49b68b08
--- /dev/null
+++ b/scope-docs/metadata-schema/variables_cchs_example.yaml
@@ -0,0 +1,109 @@
+schema_version: "2.2.0"
+schema_date: "2025-06-22"
+description: "CCHS example showing project-specific extensions to variables schema."
+registry_file: "metadata_registry.yaml"  # the closing note of this file says core schema specifications are defined there
+
+# Example: Canadian Community Health Survey (CCHS) project-specific metadata.
+# Shows how a project can extend the core variables schema with
+# specialized metadata while maintaining compatibility with recodeflow.
+
+cchs_project_metadata:
+  title: "CCHS variables configuration example"
+  description: "Project-specific metadata for Canadian Community Health Survey harmonization."
+
+  # Naming convention for CCHS source databases: cchs{YEAR}_{TYPE}
+  source_databases:
+    pattern: "cchs{YEAR}_{TYPE}"  # {YEAR} takes a form from year_formats, {TYPE} a code from type_suffixes
+    description: "CCHS database naming convention for survey cycles."
+    examples:
+      - "cchs2017_p"  # suffix p: Public Use Microdata File
+      - "cchs2019_i"  # suffix i: ICES data
+      - "cchs2021_s"  # suffix s: Shared file
+
+    year_formats:
+      - "YYYY"  # single year, e.g. cchs2017
+      - "YYYY_YYYY"  # multi-year cycle, e.g. cchs2007_2008
+
+    type_suffixes:  # the four codes here are the letters accepted by [pism] in validation_rules.databaseStart
+      - code: "p"
+        description: "Public Use Microdata File (PUMF)"
+      - code: "i"
+        description: "ICES data (continuous variables)"
+      - code: "s"
+        description: "Shared file"
+      - code: "m"
+        description: "Master file"
+
+  # CCHS-specific validation rules, keyed by the field they constrain
+  validation_rules:
+    subject:
+      description: "CCHS subject areas - mandatory enumeration for all variables."
+      constraint: "required"
+      validation: "enum"
+      allowed_values:  # eight subject areas; the example_variables entries use "Demographics" and "Physical measures"
+        - "Demographics"
+        - "Health status"
+        - "Health behaviors"
+        - "Health care use"
+        - "Chronic conditions"
+        - "Physical measures"
+        - "Mental health"
+        - "Social determinants"
+      error_message: "subject must be one of the CCHS-approved subject areas"
+
+    section:
+      description: "CCHS survey modules - mandatory for organization."
+      constraint: "required"
+      validation: "enum"
+      allowed_values:  # "Chronic conditions" and "Health behaviors" also appear in the subject list
+        - "Core demographics"
+        - "General health"
+        - "Chronic conditions"
+        - "Health behaviors"
+        - "Optional content"
+      error_message: "section must specify CCHS survey module"
+
+    databaseStart:
+      description: "Database naming must follow CCHS pattern."
+      constraint: "required"
+      validation: "pattern"
+      pattern: "^cchs[0-9]{4}(_[0-9]{4})?_[pism]$"  # 4-digit year, optional second year, then one type_suffixes code
+      examples: ["cchs2017_p", "cchs2007_2008_i"]  # one single-year and one multi-year name
+      error_message: "databaseStart must follow cchs{YEAR}_{TYPE} pattern"
+
+  # Pointer to the missing-data definitions kept in variable_details_cchs_example.yaml
+  missing_data_reference:
+    description: "CCHS uses haven::tagged_na() integration."
+    reference_file: "variable_details_cchs_example.yaml"
+    standard_codes: ["NA::a", "NA::b"]  # both are marked frequency "Common" in reference_file
+    extended_codes: ["NA::c", "NA::d"]  # NOTE(review): reference_file describes NA::c but has no NA::d category - confirm
+
+# ============================================================================
+# COMPLETE VARIABLE EXAMPLES - entries written against the CCHS rules in this file
+# ============================================================================
+
+example_variables:
+  - variable: "age_group"
+    label: "Age group"
+    labelLong: "Age group categories based on CCHS standard groupings"
+    variableType: "Categorical"
+    databaseStart: "cchs2017_p"  # matches the validation_rules.databaseStart pattern
+    variableStart: "[AGE_12]"
+    subject: "Demographics"  # one of validation_rules.subject.allowed_values
+    section: "Core demographics"  # one of validation_rules.section.allowed_values
+    units: ""
+    notes: "Standard CCHS age grouping for demographic analysis"
+
+  - variable: "bmi_category"
+    label: "BMI category"
+    labelLong: "Body mass index categories using WHO classification standards"
+    variableType: "Categorical"
+    databaseStart: "cchs2017_p"
+    variableStart: "DerivedVar::[HEIGHT_M, WEIGHT_KG]"  # DerivedVar:: form lists the source variables
+    subject: "Physical measures"  # one of validation_rules.subject.allowed_values
+    section: "General health"  # one of validation_rules.section.allowed_values
+    units: ""  # NOTE(review): the bmi_category rows in variable_details_cchs_example.yaml use "kg/m²" - confirm which is intended
+    notes: "Derived from height and weight measurements using standard BMI formula"
+
+# Note: this file demonstrates project-specific extensions only.
+# Core schema specifications are defined in metadata_registry.yaml.
\ No newline at end of file