Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
c7a944a
chore: update submodules
andhreljaKern Oct 24, 2025
f63a8ba
perf(alembic): add etl task table
andhreljaKern Oct 24, 2025
7b5fcbe
chore: update submodules
andhreljaKern Oct 24, 2025
4f57597
perf(alembic): update etl task table
andhreljaKern Oct 24, 2025
aa9f193
chore: update submodules
andhreljaKern Oct 24, 2025
c939fac
perf(alembic): add org_id column
andhreljaKern Oct 24, 2025
4d25841
chore: update submodules
andhreljaKern Oct 24, 2025
217832c
perf(alembic): update etl task table
andhreljaKern Oct 24, 2025
c218594
chore: update submodules
andhreljaKern Oct 26, 2025
cb64bf8
perf: add file_size_bytes to etl_task
andhreljaKern Oct 26, 2025
bd1b633
chore: update submodules
andhreljaKern Oct 27, 2025
92e89cd
perf: add split_config
andhreljaKern Oct 27, 2025
5b18a3c
chore: update submodules
andhreljaKern Oct 27, 2025
97b01bc
perf(etl): fkey alignment
andhreljaKern Oct 27, 2025
7a40c81
chore: update submodules
andhreljaKern Oct 28, 2025
79b05b1
perf: task cancellation
andhreljaKern Oct 28, 2025
2b4ad4e
chore: update submodules
andhreljaKern Oct 29, 2025
6264d28
fix: update submodules merge conflict
andhreljaKern Oct 30, 2025
1aff7a8
perf: add cache_config
andhreljaKern Oct 30, 2025
e3a19db
perf: align /notify to etl provider
andhreljaKern Oct 30, 2025
94af481
chore: update submodules
andhreljaKern Oct 30, 2025
fb71eb2
chore: update submodules
andhreljaKern Oct 30, 2025
0b7c896
perf: update minio_upload for execute_etl
andhreljaKern Oct 30, 2025
5e0aafe
fix: markdown_file update after etl_task creation
andhreljaKern Oct 30, 2025
709d22d
chore: merge dev
andhreljaKern Oct 30, 2025
b07fc7d
perf: add etl task table
andhreljaKern Oct 30, 2025
41c9573
perf: disable CLEANSE as default
andhreljaKern Oct 30, 2025
2554a43
chore: update submodules
andhreljaKern Oct 30, 2025
52f715e
perf: standard cache config keys
andhreljaKern Oct 30, 2025
e6ec935
Merge remote-tracking branch 'origin/dev' into cognition-etl-provider
JWittmeyer Nov 3, 2025
356f38e
Merge with dev
JWittmeyer Nov 3, 2025
2770d25
Alembic new table & submodule fix import
JWittmeyer Nov 3, 2025
49b20d4
Merge with dev
JWittmeyer Nov 12, 2025
b70404e
tmp commit
JWittmeyer Nov 12, 2025
a86ef30
Tmp doc almost
JWittmeyer Nov 14, 2025
0e1f1d8
No cache tmp doc working
JWittmeyer Nov 17, 2025
726a31f
Submodule update
JWittmeyer Nov 18, 2025
e94eb89
Ensure complete
JWittmeyer Nov 19, 2025
356d8ae
chore: update submodules
andhreljaKern Nov 22, 2025
2b237e0
perf(alembic): etl original file name
andhreljaKern Nov 22, 2025
30f54d6
chore: update submodules
andhreljaKern Nov 23, 2025
d210b57
perf(alembic): update etl content
andhreljaKern Nov 23, 2025
1fe4c2b
perf(alembic): full_admin table
andhreljaKern Nov 23, 2025
72ad373
chore: update submodules
andhreljaKern Nov 25, 2025
777bc39
perf: tmp doc etl task metadata
andhreljaKern Nov 25, 2025
cc2cb78
Merge with dev
JWittmeyer Nov 25, 2025
8f546fa
Conversion method + alembic
JWittmeyer Nov 26, 2025
b643bd0
Hotfix default value
JWittmeyer Nov 26, 2025
8e4a0b2
tmp doc changes
JWittmeyer Nov 26, 2025
8646787
perf: minio upload minor enhancement
andhreljaKern Nov 26, 2025
725f674
chore: update submodules
andhreljaKern Nov 26, 2025
38b2375
submodule change
JWittmeyer Nov 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions alembic/versions/31c4968699ad_add_etl_content_to_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""add etl content to records

Revision ID: 31c4968699ad
Revises: 9d5fb67e29f7
Create Date: 2025-11-23 23:08:27.327070

"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "31c4968699ad"
down_revision = "9d5fb67e29f7"
branch_labels = None
depends_on = None


def upgrade():
    """Add etl_task.original_file_name and a text ``content`` column to
    every integration source table."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "etl_task",
        sa.Column("original_file_name", sa.String(), nullable=True),
        schema="global",
    )
    # Each integration source table gains the same nullable content column.
    for table_name in ("github_file", "github_issue", "pdf", "sharepoint"):
        op.add_column(
            table_name,
            sa.Column("content", sa.String(), nullable=True),
            schema="integration",
        )
    # ### end Alembic commands ###


def downgrade():
    """Revert: drop the integration ``content`` columns and
    etl_task.original_file_name."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Drop in reverse order of creation.
    for table_name in ("sharepoint", "pdf", "github_issue", "github_file"):
        op.drop_column(table_name, "content", schema="integration")
    op.drop_column("etl_task", "original_file_name", schema="global")
    # ### end Alembic commands ###
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
"""remove cognition project fields for new etl

Revision ID: 64874114490b
Revises: c4218a7d06e0
Create Date: 2025-11-25 15:49:30.097610

"""

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
import json
import uuid
from submodules.model.enums import LLMProvider

# revision identifiers, used by Alembic.
revision = "64874114490b"
down_revision = "c4218a7d06e0"
branch_labels = None
depends_on = None


def upgrade():
    """Detach ETL configuration from individual cognition projects.

    Adds project.useable_etl_configurations, removes the project binding
    from etl_config_preset (unique name constraint, project_id index,
    foreign key, and column), migrates each project's legacy
    llm_config/tokenizer settings into org-level presets via
    ``__conversion_helper``, then drops the now-obsolete project columns.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "project",
        sa.Column("useable_etl_configurations", sa.JSON(), nullable=True),
        schema="cognition",
    )
    op.drop_constraint(
        "etl_config_preset_name_key",
        "etl_config_preset",
        schema="cognition",
        type_="unique",
    )
    op.drop_index(
        "ix_cognition_etl_config_preset_project_id",
        table_name="etl_config_preset",
        schema="cognition",
    )
    # FK must be dropped before its column can go.
    op.drop_constraint(
        "etl_config_preset_project_id_fkey",
        "etl_config_preset",
        schema="cognition",
        type_="foreignkey",
    )
    op.drop_column("etl_config_preset", "project_id", schema="cognition")
    # Must run before the two drops below: the helper reads
    # project.llm_config and project.tokenizer to build the presets.
    __conversion_helper()
    op.drop_column("project", "tokenizer", schema="cognition")
    op.drop_column("project", "llm_config", schema="cognition")
    # ### end Alembic commands ###


def downgrade():
    """Restore the pre-preset schema.

    Recreates the project binding on etl_config_preset (column, FK,
    index, unique name constraint) and the legacy project columns
    (llm_config, tokenizer). Data migrated by ``upgrade`` is NOT moved
    back; the recreated columns come back empty.
    """
    op.drop_column("project", "useable_etl_configurations", schema="cognition")
    # Column must exist before the FK and index referencing it.
    op.add_column(
        "etl_config_preset",
        sa.Column("project_id", postgresql.UUID(), autoincrement=False, nullable=True),
        schema="cognition",
    )
    op.create_foreign_key(
        "etl_config_preset_project_id_fkey",
        "etl_config_preset",
        "project",
        ["project_id"],
        ["id"],
        source_schema="cognition",
        referent_schema="cognition",
        ondelete="CASCADE",
    )
    op.create_index(
        "ix_cognition_etl_config_preset_project_id",
        "etl_config_preset",
        ["project_id"],
        unique=False,
        schema="cognition",
    )
    op.create_unique_constraint(
        "etl_config_preset_name_key", "etl_config_preset", ["name"], schema="cognition"
    )
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "project",
        sa.Column(
            "llm_config",
            postgresql.JSON(astext_type=sa.Text()),
            autoincrement=False,
            nullable=True,
        ),
        schema="cognition",
    )
    op.add_column(
        "project",
        sa.Column("tokenizer", sa.VARCHAR(), autoincrement=False, nullable=True),
        schema="cognition",
    )
    # ### end Alembic commands ###


def __conversion_helper():
    """Migrate legacy per-project ETL settings into etl_config_preset rows.

    For every cognition project that still carries an llm_config or a
    tokenizer and allows file uploads: build an equivalent preset record,
    insert it into cognition.etl_config_preset, and point the project's
    useable_etl_configurations at the new preset (marked as default).
    """
    connection = op.get_bind()
    # Wrapped in sa.text() to match insert_sql/update_sql below — SQLAlchemy
    # 1.4/2.0 no longer accept raw SQL strings in Connection.execute().
    select_sql = sa.text(
        """
    SELECT id,organization_id, llm_config, tokenizer, name, created_by, created_at
    FROM cognition.project
    WHERE (llm_config IS NOT NULL OR tokenizer IS NOT NULL)
        AND allow_file_upload = true;
    """
    )
    # .mappings() yields dict-like rows so the row["..."] access below works
    # on SQLAlchemy 1.4 as well as 2.0 (plain Row dropped string indexing).
    result = connection.execute(select_sql).mappings()

    # Step 2: Process each row
    for row in result:

        print(f"Converting project {row['name']} (ID: {row['id']})")

        converted_object = __convert_object(row)
        insert_sql = sa.text(
            """
            INSERT INTO cognition.etl_config_preset(
                id,
                organization_id,
                name,
                description,
                created_at,
                created_by,
                etl_config,
                add_config
            )
            VALUES (
                :id,
                :organization_id,
                :name,
                :description,
                :created_at,
                :created_by,
                :etl_config,
                :add_config
            )
            """
        )
        connection.execute(insert_sql, converted_object)

        # Step 4: Update the project row with useable_etl_configurations
        update_sql = sa.text(
            """
            UPDATE cognition.project
            SET useable_etl_configurations = :config_list
            WHERE id = :project_id
            """
        )
        connection.execute(
            update_sql,
            {
                # The migrated preset becomes the project's default config.
                "config_list": json.dumps(
                    [{"id": converted_object["id"], "isDefault": True}]
                ),
                "project_id": row["id"],
            },
        )


def __convert_object(row):
    """Translate one legacy project row into an etl_config_preset record.

    Maps the old llm_config["extraction"] section onto the new pdf
    extraction config (PDF2MD / AZURE_DI / VISION), the old
    llm_config["transformation"] section onto a COMMON_ETL transformation
    config, and wraps everything into a dict ready for the preset INSERT.

    Fixes two crashes in the original: llm_config may be NULL (the SELECT
    also admits tokenizer-only rows) and "extractor" may be missing, which
    made ``.get("extractor").lower()`` raise AttributeError.
    """
    # llm_config may be NULL when the project only had a tokenizer set.
    legacy_llm_config = row["llm_config"] or {}

    pdf_extraction = {}
    extraction_data = legacy_llm_config.get("extraction", {})
    # Normalize a missing/None extractor to "" so .lower() is safe.
    extractor = extraction_data.get("extractor") or ""
    if extractor == "pdf2markdown":
        pdf_extraction["extractor"] = "PDF2MD"
    elif extractor.lower() == "azure_di":
        pdf_extraction["azureDiApiBase"] = extraction_data.get("azureDiApiBase", "")
        pdf_extraction["azureDiEnvVarId"] = extraction_data.get("azureDiEnvVarId", "")
        pdf_extraction["extractor"] = "AZURE_DI"
    elif extractor.lower() in ("gpt", "vision", "gpt-4"):
        pdf_extraction["overwriteVisionPrompt"] = extraction_data.get(
            "overwriteVisionPrompt", False
        )
        pdf_extraction["llmIdentifier"] = LLMProvider.from_string(
            extraction_data.get("llmIdentifier", "")
        ).value
        pdf_extraction["extractor"] = "VISION"
        # Remaining extraction keys become the vision extractor's llm config.
        llm_config = extraction_data.copy()
        llm_config.pop("extractor", None)
        llm_config.pop("overwriteVisionPrompt", None)
        llm_config.pop("llmIdentifier", None)
        pdf_extraction["llmConfig"] = llm_config

    transformation_data = legacy_llm_config.get("transformation", {})
    transformation_config = {}
    transformation_config["llmIdentifier"] = LLMProvider.from_string(
        transformation_data.get("llmIdentifier", "")
    ).value
    transformation_config["type"] = "COMMON_ETL"
    # Remaining transformation keys become the transformation llm config.
    llm_config = transformation_data.copy()
    llm_config.pop("llmIdentifier", None)
    transformation_config["llmConfig"] = llm_config

    # add pdf & add llm migration
    converted_object = {
        "extraction": {"default": {"extractor": "LANGCHAIN"}, "pdf": pdf_extraction},
        "tokenizer": row["tokenizer"],
        "transformation": transformation_config,
    }
    final_object = {
        "id": str(uuid.uuid4()),
        "organization_id": str(row["organization_id"]),
        "name": row["name"] + " - migrated etl config",
        "description": "ETL configuration migrated from old project settings",
        "created_at": row["created_at"].isoformat(),
        "created_by": str(row["created_by"]),
        "etl_config": json.dumps(converted_object),
        "add_config": json.dumps({}),
    }

    return final_object
87 changes: 87 additions & 0 deletions alembic/versions/9d5fb67e29f7_config_sets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Config sets'


Revision ID: 9d5fb67e29f7
Revises: f428a22ecdb3
Create Date: 2025-11-03 15:28:47.686657

"""

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "9d5fb67e29f7"
down_revision = "f428a22ecdb3"
branch_labels = None
depends_on = None


def upgrade():
    """Create cognition.etl_config_preset and its foreign-key indexes."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "etl_config_preset",
        sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("project_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("name", sa.String(), nullable=True),
        sa.Column("description", sa.String(), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=True),
        sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("etl_config", sa.JSON(), nullable=True),
        sa.Column("add_config", sa.JSON(), nullable=True),
        sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="SET NULL"),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(
            ["project_id"], ["cognition.project.id"], ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("name"),
        schema="cognition",
    )
    # One non-unique lookup index per FK column.
    for fk_column in ("created_by", "organization_id", "project_id"):
        op.create_index(
            op.f(f"ix_cognition_etl_config_preset_{fk_column}"),
            "etl_config_preset",
            [fk_column],
            unique=False,
            schema="cognition",
        )
    # ### end Alembic commands ###


def downgrade():
    """Drop the etl_config_preset indexes and table."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Indexes first, in reverse order of creation, then the table itself.
    for fk_column in ("project_id", "organization_id", "created_by"):
        op.drop_index(
            op.f(f"ix_cognition_etl_config_preset_{fk_column}"),
            table_name="etl_config_preset",
            schema="cognition",
        )
    op.drop_table("etl_config_preset", schema="cognition")
    # ### end Alembic commands ###
Loading