Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
c7a944a
chore: update submodules
andhreljaKern Oct 24, 2025
f63a8ba
perf(alembic): add etl task table
andhreljaKern Oct 24, 2025
7b5fcbe
chore: update submodules
andhreljaKern Oct 24, 2025
4f57597
perf(alembic): update etl task table
andhreljaKern Oct 24, 2025
aa9f193
chore: update submodules
andhreljaKern Oct 24, 2025
c939fac
perf(alembic): add org_id column
andhreljaKern Oct 24, 2025
4d25841
chore: update submodules
andhreljaKern Oct 24, 2025
217832c
perf(alembic): update etl task table
andhreljaKern Oct 24, 2025
c218594
chore: update submodules
andhreljaKern Oct 26, 2025
cb64bf8
perf: add file_size_bytes to etl_task
andhreljaKern Oct 26, 2025
bd1b633
chore: update submodules
andhreljaKern Oct 27, 2025
92e89cd
perf: add split_config
andhreljaKern Oct 27, 2025
5b18a3c
chore: update submodules
andhreljaKern Oct 27, 2025
97b01bc
perf(etl): fkey alignment
andhreljaKern Oct 27, 2025
7a40c81
chore: update submodules
andhreljaKern Oct 28, 2025
79b05b1
perf: task cancellation
andhreljaKern Oct 28, 2025
2b4ad4e
chore: update submodules
andhreljaKern Oct 29, 2025
6264d28
fix: update submodules merge conflict
andhreljaKern Oct 30, 2025
1aff7a8
perf: add cache_config
andhreljaKern Oct 30, 2025
e3a19db
perf: align /notify to etl provider
andhreljaKern Oct 30, 2025
94af481
chore: update submodules
andhreljaKern Oct 30, 2025
fb71eb2
chore: update submodules
andhreljaKern Oct 30, 2025
0b7c896
perf: update minio_upload for execute_etl
andhreljaKern Oct 30, 2025
5e0aafe
fix: markdown_file update after etl_task creation
andhreljaKern Oct 30, 2025
709d22d
chore: merge dev
andhreljaKern Oct 30, 2025
b07fc7d
perf: add etl task table
andhreljaKern Oct 30, 2025
41c9573
perf: disable CLEANSE as default
andhreljaKern Oct 30, 2025
2554a43
chore: update submodules
andhreljaKern Oct 30, 2025
52f715e
perf: standard cache config keys
andhreljaKern Oct 30, 2025
e6ec935
Merge remote-tracking branch 'origin/dev' into cognition-etl-provider
JWittmeyer Nov 3, 2025
356f38e
Merge with dev
JWittmeyer Nov 3, 2025
2770d25
Alembic new table & submodule fix import
JWittmeyer Nov 3, 2025
49b20d4
Merge with dev
JWittmeyer Nov 12, 2025
b70404e
tmp commit
JWittmeyer Nov 12, 2025
a86ef30
Tmp doc almost
JWittmeyer Nov 14, 2025
0e1f1d8
No cache tmp doc working
JWittmeyer Nov 17, 2025
726a31f
Submodule update
JWittmeyer Nov 18, 2025
e94eb89
Ensure complete
JWittmeyer Nov 19, 2025
356d8ae
chore: update submodules
andhreljaKern Nov 22, 2025
2b237e0
perf(alembic): etl original file name
andhreljaKern Nov 22, 2025
30f54d6
chore: update submodules
andhreljaKern Nov 23, 2025
d210b57
perf(alembic): update etl content
andhreljaKern Nov 23, 2025
1fe4c2b
perf(alembic): full_admin table
andhreljaKern Nov 23, 2025
72ad373
chore: update submodules
andhreljaKern Nov 25, 2025
777bc39
perf: tmp doc etl task metadata
andhreljaKern Nov 25, 2025
cc2cb78
Merge with dev
JWittmeyer Nov 25, 2025
8f546fa
Conversion method + alembic
JWittmeyer Nov 26, 2025
b643bd0
Hotfix default value
JWittmeyer Nov 26, 2025
8e4a0b2
tmp doc changes
JWittmeyer Nov 26, 2025
8646787
perf: minio upload minor enhancement
andhreljaKern Nov 26, 2025
725f674
chore: update submodules
andhreljaKern Nov 26, 2025
38b2375
submodule change
JWittmeyer Nov 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions alembic/versions/31c4968699ad_add_etl_content_to_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"""add etl content to records

Revision ID: 31c4968699ad
Revises: 9d5fb67e29f7
Create Date: 2025-11-23 23:08:27.327070

"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "31c4968699ad"
down_revision = "9d5fb67e29f7"
branch_labels = None
depends_on = None


def upgrade():
    """Add etl_task.original_file_name and a text ``content`` column to
    every integration source table."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "etl_task",
        sa.Column("original_file_name", sa.String(), nullable=True),
        schema="global",
    )
    # Each integration source table gains the same nullable content column.
    for table_name in ("github_file", "github_issue", "pdf", "sharepoint"):
        op.add_column(
            table_name,
            sa.Column("content", sa.String(), nullable=True),
            schema="integration",
        )
    # ### end Alembic commands ###


def downgrade():
    """Revert: drop the integration ``content`` columns and
    etl_task.original_file_name."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Drop in reverse order of creation.
    for table_name in ("sharepoint", "pdf", "github_issue", "github_file"):
        op.drop_column(table_name, "content", schema="integration")
    op.drop_column("etl_task", "original_file_name", schema="global")
    # ### end Alembic commands ###
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
"""remove cognition project fields for new etl

Revision ID: 64874114490b
Revises: c4218a7d06e0
Create Date: 2025-11-25 15:49:30.097610

"""

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
import json
import uuid
from submodules.model.enums import LLMProvider

# revision identifiers, used by Alembic.
revision = "64874114490b"
down_revision = "c4218a7d06e0"
branch_labels = None
depends_on = None


def upgrade():
    """Detach ETL configuration from individual cognition projects.

    Adds project.useable_etl_configurations, removes the project binding
    from etl_config_preset (unique name constraint, project_id index,
    foreign key, and column), migrates each project's legacy
    llm_config/tokenizer settings into org-level presets via
    ``__conversion_helper``, then drops the now-obsolete project columns.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "project",
        sa.Column("useable_etl_configurations", sa.JSON(), nullable=True),
        schema="cognition",
    )
    op.drop_constraint(
        "etl_config_preset_name_key",
        "etl_config_preset",
        schema="cognition",
        type_="unique",
    )
    op.drop_index(
        "ix_cognition_etl_config_preset_project_id",
        table_name="etl_config_preset",
        schema="cognition",
    )
    # FK must be dropped before its column can go.
    op.drop_constraint(
        "etl_config_preset_project_id_fkey",
        "etl_config_preset",
        schema="cognition",
        type_="foreignkey",
    )
    op.drop_column("etl_config_preset", "project_id", schema="cognition")
    # Must run before the two drops below: the helper reads
    # project.llm_config and project.tokenizer to build the presets.
    __conversion_helper()
    op.drop_column("project", "tokenizer", schema="cognition")
    op.drop_column("project", "llm_config", schema="cognition")
    # ### end Alembic commands ###


def downgrade():
    """Restore the pre-preset schema.

    Recreates the project binding on etl_config_preset (column, FK,
    index, unique name constraint) and the legacy project columns
    (llm_config, tokenizer). Data migrated by ``upgrade`` is NOT moved
    back; the recreated columns come back empty.
    """
    op.drop_column("project", "useable_etl_configurations", schema="cognition")
    # Column must exist before the FK and index referencing it.
    op.add_column(
        "etl_config_preset",
        sa.Column("project_id", postgresql.UUID(), autoincrement=False, nullable=True),
        schema="cognition",
    )
    op.create_foreign_key(
        "etl_config_preset_project_id_fkey",
        "etl_config_preset",
        "project",
        ["project_id"],
        ["id"],
        source_schema="cognition",
        referent_schema="cognition",
        ondelete="CASCADE",
    )
    op.create_index(
        "ix_cognition_etl_config_preset_project_id",
        "etl_config_preset",
        ["project_id"],
        unique=False,
        schema="cognition",
    )
    op.create_unique_constraint(
        "etl_config_preset_name_key", "etl_config_preset", ["name"], schema="cognition"
    )
    # ### commands auto generated by Alembic - please adjust! ###
    op.add_column(
        "project",
        sa.Column(
            "llm_config",
            postgresql.JSON(astext_type=sa.Text()),
            autoincrement=False,
            nullable=True,
        ),
        schema="cognition",
    )
    op.add_column(
        "project",
        sa.Column("tokenizer", sa.VARCHAR(), autoincrement=False, nullable=True),
        schema="cognition",
    )
    # ### end Alembic commands ###


def __conversion_helper():
    """Migrate legacy per-project ETL settings into etl_config_preset rows.

    For every cognition project that still carries an llm_config or a
    tokenizer and allows file uploads: build an equivalent preset record,
    insert it into cognition.etl_config_preset, and point the project's
    useable_etl_configurations at the new preset (marked as default).
    """
    connection = op.get_bind()
    # Wrapped in sa.text() to match insert_sql/update_sql below — SQLAlchemy
    # 1.4/2.0 no longer accept raw SQL strings in Connection.execute().
    select_sql = sa.text(
        """
    SELECT id,organization_id, llm_config, tokenizer, name, created_by, created_at
    FROM cognition.project
    WHERE (llm_config IS NOT NULL OR tokenizer IS NOT NULL)
        AND allow_file_upload = true;
    """
    )
    # .mappings() yields dict-like rows so the row["..."] access below works
    # on SQLAlchemy 1.4 as well as 2.0 (plain Row dropped string indexing).
    result = connection.execute(select_sql).mappings()

    # Step 2: Process each row
    for row in result:

        print(f"Converting project {row['name']} (ID: {row['id']})")

        converted_object = __convert_object(row)
        insert_sql = sa.text(
            """
            INSERT INTO cognition.etl_config_preset(
                id,
                organization_id,
                name,
                description,
                created_at,
                created_by,
                etl_config,
                add_config
            )
            VALUES (
                :id,
                :organization_id,
                :name,
                :description,
                :created_at,
                :created_by,
                :etl_config,
                :add_config
            )
            """
        )
        connection.execute(insert_sql, converted_object)

        # Step 4: Update the project row with useable_etl_configurations
        update_sql = sa.text(
            """
            UPDATE cognition.project
            SET useable_etl_configurations = :config_list
            WHERE id = :project_id
            """
        )
        connection.execute(
            update_sql,
            {
                # The migrated preset becomes the project's default config.
                "config_list": json.dumps(
                    [{"id": converted_object["id"], "isDefault": True}]
                ),
                "project_id": row["id"],
            },
        )


def __convert_object(row):
    """Translate one legacy project row into an etl_config_preset record.

    Maps the old llm_config["extraction"] section onto the new pdf
    extraction config (PDF2MD / AZURE_DI / VISION), the old
    llm_config["transformation"] section onto a COMMON_ETL transformation
    config, and wraps everything into a dict ready for the preset INSERT.

    Fixes two crashes in the original: llm_config may be NULL (the SELECT
    also admits tokenizer-only rows) and "extractor" may be missing, which
    made ``.get("extractor").lower()`` raise AttributeError.
    """
    # llm_config may be NULL when the project only had a tokenizer set.
    legacy_llm_config = row["llm_config"] or {}

    pdf_extraction = {}
    extraction_data = legacy_llm_config.get("extraction", {})
    # Normalize a missing/None extractor to "" so .lower() is safe.
    extractor = extraction_data.get("extractor") or ""
    if extractor == "pdf2markdown":
        pdf_extraction["extractor"] = "PDF2MD"
    elif extractor.lower() == "azure_di":
        pdf_extraction["azureDiApiBase"] = extraction_data.get("azureDiApiBase", "")
        pdf_extraction["azureDiEnvVarId"] = extraction_data.get("azureDiEnvVarId", "")
        pdf_extraction["extractor"] = "AZURE_DI"
    elif extractor.lower() in ("gpt", "vision", "gpt-4"):
        pdf_extraction["overwriteVisionPrompt"] = extraction_data.get(
            "overwriteVisionPrompt", False
        )
        pdf_extraction["llmIdentifier"] = LLMProvider.from_string(
            extraction_data.get("llmIdentifier", "")
        ).value
        pdf_extraction["extractor"] = "VISION"
        # Remaining extraction keys become the vision extractor's llm config.
        llm_config = extraction_data.copy()
        llm_config.pop("extractor", None)
        llm_config.pop("overwriteVisionPrompt", None)
        llm_config.pop("llmIdentifier", None)
        pdf_extraction["llmConfig"] = llm_config

    transformation_data = legacy_llm_config.get("transformation", {})
    transformation_config = {}
    transformation_config["llmIdentifier"] = LLMProvider.from_string(
        transformation_data.get("llmIdentifier", "")
    ).value
    transformation_config["type"] = "COMMON_ETL"
    # Remaining transformation keys become the transformation llm config.
    llm_config = transformation_data.copy()
    llm_config.pop("llmIdentifier", None)
    transformation_config["llmConfig"] = llm_config

    # add pdf & add llm migration
    converted_object = {
        "extraction": {"default": {"extractor": "LANGCHAIN"}, "pdf": pdf_extraction},
        "tokenizer": row["tokenizer"],
        "transformation": transformation_config,
    }
    final_object = {
        "id": str(uuid.uuid4()),
        "organization_id": str(row["organization_id"]),
        "name": row["name"] + " - migrated etl config",
        "description": "ETL configuration migrated from old project settings",
        "created_at": row["created_at"].isoformat(),
        "created_by": str(row["created_by"]),
        "etl_config": json.dumps(converted_object),
        "add_config": json.dumps({}),
    }

    return final_object
87 changes: 87 additions & 0 deletions alembic/versions/9d5fb67e29f7_config_sets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
"""Config sets'


Revision ID: 9d5fb67e29f7
Revises: f428a22ecdb3
Create Date: 2025-11-03 15:28:47.686657

"""

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = "9d5fb67e29f7"
down_revision = "f428a22ecdb3"
branch_labels = None
depends_on = None


def upgrade():
    """Create cognition.etl_config_preset and its foreign-key indexes."""
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table(
        "etl_config_preset",
        sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
        sa.Column("organization_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("project_id", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("name", sa.String(), nullable=True),
        sa.Column("description", sa.String(), nullable=True),
        sa.Column("created_at", sa.DateTime(), nullable=True),
        sa.Column("created_by", postgresql.UUID(as_uuid=True), nullable=True),
        sa.Column("etl_config", sa.JSON(), nullable=True),
        sa.Column("add_config", sa.JSON(), nullable=True),
        sa.ForeignKeyConstraint(["created_by"], ["user.id"], ondelete="SET NULL"),
        sa.ForeignKeyConstraint(
            ["organization_id"], ["organization.id"], ondelete="CASCADE"
        ),
        sa.ForeignKeyConstraint(
            ["project_id"], ["cognition.project.id"], ondelete="CASCADE"
        ),
        sa.PrimaryKeyConstraint("id"),
        sa.UniqueConstraint("name"),
        schema="cognition",
    )
    # One non-unique lookup index per FK column.
    for fk_column in ("created_by", "organization_id", "project_id"):
        op.create_index(
            op.f(f"ix_cognition_etl_config_preset_{fk_column}"),
            "etl_config_preset",
            [fk_column],
            unique=False,
            schema="cognition",
        )
    # ### end Alembic commands ###


def downgrade():
    """Drop the etl_config_preset indexes and table."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Indexes first, in reverse order of creation, then the table itself.
    for fk_column in ("project_id", "organization_id", "created_by"):
        op.drop_index(
            op.f(f"ix_cognition_etl_config_preset_{fk_column}"),
            table_name="etl_config_preset",
            schema="cognition",
        )
    op.drop_table("etl_config_preset", schema="cognition")
    # ### end Alembic commands ###
Loading