From 59e755732c844a161065cd96e5a01625ed7e8ddf Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 17 Sep 2024 15:47:57 +0200 Subject: [PATCH 01/88] fix: finally got pg_replication tests working as is --- tests/postgres/docker-compose.yml | 6 +- tests/postgres/postgresql.conf | 798 ++++++++++++++++++++++++++++++ 2 files changed, 803 insertions(+), 1 deletion(-) create mode 100644 tests/postgres/postgresql.conf diff --git a/tests/postgres/docker-compose.yml b/tests/postgres/docker-compose.yml index aa0a2c5d7..3b901a5ca 100644 --- a/tests/postgres/docker-compose.yml +++ b/tests/postgres/docker-compose.yml @@ -1,4 +1,3 @@ -version: "3.7" services: db: env_file: postgres.env @@ -6,9 +5,14 @@ services: context: postgres dockerfile: Dockerfile container_name: dlt_postgres_db + command: + - postgres + - -c + - config_file=/etc/postgresql/postgresql.conf restart: unless-stopped volumes: - db_home:/var/lib/postgresql/data + - ./postgresql.conf:/etc/postgresql/postgresql.conf:ro ports: - 5432:5432 diff --git a/tests/postgres/postgresql.conf b/tests/postgres/postgresql.conf new file mode 100644 index 000000000..bbb13e1db --- /dev/null +++ b/tests/postgres/postgresql.conf @@ -0,0 +1,798 @@ +# ----------------------------- +# PostgreSQL configuration file +# ----------------------------- +# +# This file consists of lines of the form: +# +# name = value +# +# (The "=" is optional.) Whitespace may be used. Comments are introduced with +# "#" anywhere on a line. The complete list of parameter names and allowed +# values can be found in the PostgreSQL documentation. +# +# The commented-out settings shown in this file represent the default values. +# Re-commenting a setting is NOT sufficient to revert it to the default value; +# you need to reload the server. +# +# This file is read on server startup and when the server receives a SIGHUP +# signal. If you edit the file on a running system, you have to SIGHUP the +# server for the changes to take effect, run "pg_ctl reload", or execute +# "SELECT pg_reload_conf()". Some parameters, which are marked below, +# require a server shutdown and restart to take effect. +# +# Any parameter can also be given as a command-line option to the server, e.g., +# "postgres -c log_connections=on". Some parameters can be changed at run time +# with the "SET" SQL command. +# +# Memory units: B = bytes Time units: us = microseconds +# kB = kilobytes ms = milliseconds +# MB = megabytes s = seconds +# GB = gigabytes min = minutes +# TB = terabytes h = hours +# d = days + + +#------------------------------------------------------------------------------ +# FILE LOCATIONS +#------------------------------------------------------------------------------ + +# The default values of these variables are driven from the -D command-line +# option or PGDATA environment variable, represented here as ConfigDir. + +#data_directory = 'ConfigDir' # use data in another directory + # (change requires restart) +#hba_file = 'ConfigDir/pg_hba.conf' # host-based authentication file + # (change requires restart) +#ident_file = 'ConfigDir/pg_ident.conf' # ident configuration file + # (change requires restart) + +# If external_pid_file is not explicitly set, no extra PID file is written. 
+#external_pid_file = '' # write an extra PID file + # (change requires restart) + + +#------------------------------------------------------------------------------ +# CONNECTIONS AND AUTHENTICATION +#------------------------------------------------------------------------------ + +# - Connection Settings - + +listen_addresses = '*' + # comma-separated list of addresses; + # defaults to 'localhost'; use '*' for all + # (change requires restart) +#port = 5432 # (change requires restart) +#max_connections = 100 # (change requires restart) +#superuser_reserved_connections = 3 # (change requires restart) +#unix_socket_directories = '/tmp' # comma-separated list of directories + # (change requires restart) +#unix_socket_group = '' # (change requires restart) +#unix_socket_permissions = 0777 # begin with 0 to use octal notation + # (change requires restart) +#bonjour = off # advertise server via Bonjour + # (change requires restart) +#bonjour_name = '' # defaults to the computer name + # (change requires restart) + +# - TCP settings - +# see "man tcp" for details + +#tcp_keepalives_idle = 0 # TCP_KEEPIDLE, in seconds; + # 0 selects the system default +#tcp_keepalives_interval = 0 # TCP_KEEPINTVL, in seconds; + # 0 selects the system default +#tcp_keepalives_count = 0 # TCP_KEEPCNT; + # 0 selects the system default +#tcp_user_timeout = 0 # TCP_USER_TIMEOUT, in milliseconds; + # 0 selects the system default + +#client_connection_check_interval = 0 # time between checks for client + # disconnection while running queries; + # 0 for never + +# - Authentication - + +#authentication_timeout = 1min # 1s-600s +#password_encryption = scram-sha-256 # scram-sha-256 or md5 +#db_user_namespace = off + +# GSSAPI using Kerberos +#krb_server_keyfile = 'FILE:${sysconfdir}/krb5.keytab' +#krb_caseins_users = off + +# - SSL - + +#ssl = off +#ssl_ca_file = '' +#ssl_cert_file = 'server.crt' +#ssl_crl_file = '' +#ssl_crl_dir = '' +#ssl_key_file = 'server.key' +#ssl_ciphers = 'HIGH:MEDIUM:+3DES:!aNULL' # allowed SSL ciphers +#ssl_prefer_server_ciphers = on +#ssl_ecdh_curve = 'prime256v1' +#ssl_min_protocol_version = 'TLSv1.2' +#ssl_max_protocol_version = '' +#ssl_dh_params_file = '' +#ssl_passphrase_command = '' +#ssl_passphrase_command_supports_reload = off + + +#------------------------------------------------------------------------------ +# RESOURCE USAGE (except WAL) +#------------------------------------------------------------------------------ + +# - Memory - + +#shared_buffers = 32MB # min 128kB + # (change requires restart) +#huge_pages = try # on, off, or try + # (change requires restart) +#huge_page_size = 0 # zero for system default + # (change requires restart) +#temp_buffers = 8MB # min 800kB +#max_prepared_transactions = 0 # zero disables the feature + # (change requires restart) +# Caution: it is not advisable to set max_prepared_transactions nonzero unless +# you actively intend to use prepared transactions. 
+#work_mem = 4MB # min 64kB +#hash_mem_multiplier = 1.0 # 1-1000.0 multiplier on hash table work_mem +#maintenance_work_mem = 64MB # min 1MB +#autovacuum_work_mem = -1 # min 1MB, or -1 to use maintenance_work_mem +#logical_decoding_work_mem = 64MB # min 64kB +#max_stack_depth = 2MB # min 100kB +#shared_memory_type = mmap # the default is the first option + # supported by the operating system: + # mmap + # sysv + # windows + # (change requires restart) +#dynamic_shared_memory_type = posix # the default is the first option + # supported by the operating system: + # posix + # sysv + # windows + # mmap + # (change requires restart) +#min_dynamic_shared_memory = 0MB # (change requires restart) + +# - Disk - + +#temp_file_limit = -1 # limits per-process temp file space + # in kilobytes, or -1 for no limit + +# - Kernel Resources - + +#max_files_per_process = 1000 # min 64 + # (change requires restart) + +# - Cost-Based Vacuum Delay - + +#vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables) +#vacuum_cost_page_hit = 1 # 0-10000 credits +#vacuum_cost_page_miss = 2 # 0-10000 credits +#vacuum_cost_page_dirty = 20 # 0-10000 credits +#vacuum_cost_limit = 200 # 1-10000 credits + +# - Background Writer - + +#bgwriter_delay = 200ms # 10-10000ms between rounds +#bgwriter_lru_maxpages = 100 # max buffers written/round, 0 disables +#bgwriter_lru_multiplier = 2.0 # 0-10.0 multiplier on buffers scanned/round +#bgwriter_flush_after = 0 # measured in pages, 0 disables + +# - Asynchronous Behavior - + +#backend_flush_after = 0 # measured in pages, 0 disables +#effective_io_concurrency = 1 # 1-1000; 0 disables prefetching +#maintenance_io_concurrency = 10 # 1-1000; 0 disables prefetching +#max_worker_processes = 8 # (change requires restart) +#max_parallel_workers_per_gather = 2 # limited by max_parallel_workers +#max_parallel_maintenance_workers = 2 # limited by max_parallel_workers +#max_parallel_workers = 8 # number of max_worker_processes that + # can be used in parallel operations +#parallel_leader_participation = on +#old_snapshot_threshold = -1 # 1min-60d; -1 disables; 0 is immediate + # (change requires restart) + + +#------------------------------------------------------------------------------ +# WRITE-AHEAD LOG +#------------------------------------------------------------------------------ + +# - Settings - + +wal_level = logical # minimal, replica, or logical + # (change requires restart) +#fsync = on # flush data to disk for crash safety + # (turning this off can cause + # unrecoverable data corruption) +#synchronous_commit = on # synchronization level; + # off, local, remote_write, remote_apply, or on +#wal_sync_method = fsync # the default is the first option + # supported by the operating system: + # open_datasync + # fdatasync (default on Linux and FreeBSD) + # fsync + # fsync_writethrough + # open_sync +#full_page_writes = on # recover from partial page writes +#wal_log_hints = off # also do full page writes of non-critical updates + # (change requires restart) +#wal_compression = off # enable compression of full-page writes +#wal_init_zero = on # zero-fill new WAL files +#wal_recycle = on # recycle WAL files +#wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers + # (change requires restart) +#wal_writer_delay = 200ms # 1-10000 milliseconds +#wal_writer_flush_after = 1MB # measured in pages, 0 disables +#wal_skip_threshold = 2MB + +#commit_delay = 0 # range 0-100000, in microseconds +#commit_siblings = 5 # range 1-1000 + +# - Checkpoints - + +#checkpoint_timeout = 5min # range 30s-1d 
+#checkpoint_completion_target = 0.9 # checkpoint target duration, 0.0 - 1.0 +#checkpoint_flush_after = 0 # measured in pages, 0 disables +#checkpoint_warning = 30s # 0 disables +#max_wal_size = 1GB +#min_wal_size = 80MB + +# - Archiving - + +#archive_mode = off # enables archiving; off, on, or always + # (change requires restart) +#archive_command = '' # command to use to archive a logfile segment + # placeholders: %p = path of file to archive + # %f = file name only + # e.g. 'test ! -f /mnt/server/archivedir/%f && cp %p /mnt/server/archivedir/%f' +#archive_timeout = 0 # force a logfile segment switch after this + # number of seconds; 0 disables + +# - Archive Recovery - + +# These are only used in recovery mode. + +#restore_command = '' # command to use to restore an archived logfile segment + # placeholders: %p = path of file to restore + # %f = file name only + # e.g. 'cp /mnt/server/archivedir/%f %p' +#archive_cleanup_command = '' # command to execute at every restartpoint +#recovery_end_command = '' # command to execute at completion of recovery + +# - Recovery Target - + +# Set these only when performing a targeted recovery. + +#recovery_target = '' # 'immediate' to end recovery as soon as a + # consistent state is reached + # (change requires restart) +#recovery_target_name = '' # the named restore point to which recovery will proceed + # (change requires restart) +#recovery_target_time = '' # the time stamp up to which recovery will proceed + # (change requires restart) +#recovery_target_xid = '' # the transaction ID up to which recovery will proceed + # (change requires restart) +#recovery_target_lsn = '' # the WAL LSN up to which recovery will proceed + # (change requires restart) +#recovery_target_inclusive = on # Specifies whether to stop: + # just after the specified recovery target (on) + # just before the recovery target (off) + # (change requires restart) +#recovery_target_timeline = 'latest' # 'current', 'latest', or timeline ID + # (change requires restart) +#recovery_target_action = 'pause' # 'pause', 'promote', 'shutdown' + # (change requires restart) + + +#------------------------------------------------------------------------------ +# REPLICATION +#------------------------------------------------------------------------------ + +# - Sending Servers - + +# Set these on the primary and on any standby that will send replication data. + +#max_wal_senders = 10 # max number of walsender processes + # (change requires restart) +#max_replication_slots = 10 # max number of replication slots + # (change requires restart) +#wal_keep_size = 0 # in megabytes; 0 disables +#max_slot_wal_keep_size = -1 # in megabytes; -1 disables +#wal_sender_timeout = 60s # in milliseconds; 0 disables +#track_commit_timestamp = off # collect timestamp of transaction commit + # (change requires restart) + +# - Primary Server - + +# These settings are ignored on a standby server. + +#synchronous_standby_names = '' # standby servers that provide sync rep + # method to choose sync standbys, number of sync standbys, + # and comma-separated list of application_name + # from standby(s); '*' = all +#vacuum_defer_cleanup_age = 0 # number of xacts by which cleanup is delayed + +# - Standby Servers - + +# These settings are ignored on a primary server. 
+ +#primary_conninfo = '' # connection string to sending server +#primary_slot_name = '' # replication slot on sending server +#promote_trigger_file = '' # file name whose presence ends recovery +#hot_standby = on # "off" disallows queries during recovery + # (change requires restart) +#max_standby_archive_delay = 30s # max delay before canceling queries + # when reading WAL from archive; + # -1 allows indefinite delay +#max_standby_streaming_delay = 30s # max delay before canceling queries + # when reading streaming WAL; + # -1 allows indefinite delay +#wal_receiver_create_temp_slot = off # create temp slot if primary_slot_name + # is not set +#wal_receiver_status_interval = 10s # send replies at least this often + # 0 disables +#hot_standby_feedback = off # send info from standby to prevent + # query conflicts +#wal_receiver_timeout = 60s # time that receiver waits for + # communication from primary + # in milliseconds; 0 disables +#wal_retrieve_retry_interval = 5s # time to wait before retrying to + # retrieve WAL after a failed attempt +#recovery_min_apply_delay = 0 # minimum delay for applying changes during recovery + +# - Subscribers - + +# These settings are ignored on a publisher. + +#max_logical_replication_workers = 4 # taken from max_worker_processes + # (change requires restart) +#max_sync_workers_per_subscription = 2 # taken from max_logical_replication_workers + + +#------------------------------------------------------------------------------ +# QUERY TUNING +#------------------------------------------------------------------------------ + +# - Planner Method Configuration - + +#enable_async_append = on +#enable_bitmapscan = on +#enable_gathermerge = on +#enable_hashagg = on +#enable_hashjoin = on +#enable_incremental_sort = on +#enable_indexscan = on +#enable_indexonlyscan = on +#enable_material = on +#enable_memoize = on +#enable_mergejoin = on +#enable_nestloop = on +#enable_parallel_append = on +#enable_parallel_hash = on +#enable_partition_pruning = on +#enable_partitionwise_join = off +#enable_partitionwise_aggregate = off +#enable_seqscan = on +#enable_sort = on +#enable_tidscan = on + +# - Planner Cost Constants - + +#seq_page_cost = 1.0 # measured on an arbitrary scale +#random_page_cost = 4.0 # same scale as above +#cpu_tuple_cost = 0.01 # same scale as above +#cpu_index_tuple_cost = 0.005 # same scale as above +#cpu_operator_cost = 0.0025 # same scale as above +#parallel_setup_cost = 1000.0 # same scale as above +#parallel_tuple_cost = 0.1 # same scale as above +#min_parallel_table_scan_size = 8MB +#min_parallel_index_scan_size = 512kB +#effective_cache_size = 4GB + +#jit_above_cost = 100000 # perform JIT compilation if available + # and query more expensive than this; + # -1 disables +#jit_inline_above_cost = 500000 # inline small functions if query is + # more expensive than this; -1 disables +#jit_optimize_above_cost = 500000 # use expensive JIT optimizations if + # query is more expensive than this; + # -1 disables + +# - Genetic Query Optimizer - + +#geqo = on +#geqo_threshold = 12 +#geqo_effort = 5 # range 1-10 +#geqo_pool_size = 0 # selects default based on effort +#geqo_generations = 0 # selects default based on effort +#geqo_selection_bias = 2.0 # range 1.5-2.0 +#geqo_seed = 0.0 # range 0.0-1.0 + +# - Other Planner Options - + +#default_statistics_target = 100 # range 1-10000 +#constraint_exclusion = partition # on, off, or partition +#cursor_tuple_fraction = 0.1 # range 0.0-1.0 +#from_collapse_limit = 8 +#jit = on # allow JIT compilation 
+#join_collapse_limit = 8 # 1 disables collapsing of explicit + # JOIN clauses +#plan_cache_mode = auto # auto, force_generic_plan or + # force_custom_plan + + +#------------------------------------------------------------------------------ +# REPORTING AND LOGGING +#------------------------------------------------------------------------------ + +# - Where to Log - + +#log_destination = 'stderr' # Valid values are combinations of + # stderr, csvlog, syslog, and eventlog, + # depending on platform. csvlog + # requires logging_collector to be on. + +# This is used when logging to stderr: +#logging_collector = off # Enable capturing of stderr and csvlog + # into log files. Required to be on for + # csvlogs. + # (change requires restart) + +# These are only used if logging_collector is on: +#log_directory = 'log' # directory where log files are written, + # can be absolute or relative to PGDATA +#log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log' # log file name pattern, + # can include strftime() escapes +#log_file_mode = 0600 # creation mode for log files, + # begin with 0 to use octal notation +#log_rotation_age = 1d # Automatic rotation of logfiles will + # happen after that time. 0 disables. +#log_rotation_size = 10MB # Automatic rotation of logfiles will + # happen after that much log output. + # 0 disables. +#log_truncate_on_rotation = off # If on, an existing log file with the + # same name as the new log file will be + # truncated rather than appended to. + # But such truncation only occurs on + # time-driven rotation, not on restarts + # or size-driven rotation. Default is + # off, meaning append to existing files + # in all cases. + +# These are relevant when logging to syslog: +#syslog_facility = 'LOCAL0' +#syslog_ident = 'postgres' +#syslog_sequence_numbers = on +#syslog_split_messages = on + +# This is only relevant when logging to eventlog (Windows): +# (change requires restart) +#event_source = 'PostgreSQL' + +# - When to Log - + +#log_min_messages = warning # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # info + # notice + # warning + # error + # log + # fatal + # panic + +#log_min_error_statement = error # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # info + # notice + # warning + # error + # log + # fatal + # panic (effectively off) + +#log_min_duration_statement = -1 # -1 is disabled, 0 logs all statements + # and their durations, > 0 logs only + # statements running at least this number + # of milliseconds + +#log_min_duration_sample = -1 # -1 is disabled, 0 logs a sample of statements + # and their durations, > 0 logs only a sample of + # statements running at least this number + # of milliseconds; + # sample fraction is determined by log_statement_sample_rate + +#log_statement_sample_rate = 1.0 # fraction of logged statements exceeding + # log_min_duration_sample to be logged; + # 1.0 logs all such statements, 0.0 never logs + + +#log_transaction_sample_rate = 0.0 # fraction of transactions whose statements + # are logged regardless of their duration; 1.0 logs all + # statements from all transactions, 0.0 never logs + +# - What to Log - + +#debug_print_parse = off +#debug_print_rewritten = off +#debug_print_plan = off +#debug_pretty_print = on +#log_autovacuum_min_duration = -1 # log autovacuum activity; + # -1 disables, 0 logs all actions and + # their durations, > 0 logs only + # actions running at least this number + # of milliseconds. 
+#log_checkpoints = off +#log_connections = off +#log_disconnections = off +#log_duration = off +#log_error_verbosity = default # terse, default, or verbose messages +#log_hostname = off +#log_line_prefix = '%m [%p] ' # special values: + # %a = application name + # %u = user name + # %d = database name + # %r = remote host and port + # %h = remote host + # %b = backend type + # %p = process ID + # %P = process ID of parallel group leader + # %t = timestamp without milliseconds + # %m = timestamp with milliseconds + # %n = timestamp with milliseconds (as a Unix epoch) + # %Q = query ID (0 if none or not computed) + # %i = command tag + # %e = SQL state + # %c = session ID + # %l = session line number + # %s = session start timestamp + # %v = virtual transaction ID + # %x = transaction ID (0 if none) + # %q = stop here in non-session + # processes + # %% = '%' + # e.g. '<%u%%%d> ' +#log_lock_waits = off # log lock waits >= deadlock_timeout +#log_recovery_conflict_waits = off # log standby recovery conflict waits + # >= deadlock_timeout +#log_parameter_max_length = -1 # when logging statements, limit logged + # bind-parameter values to N bytes; + # -1 means print in full, 0 disables +#log_parameter_max_length_on_error = 0 # when logging an error, limit logged + # bind-parameter values to N bytes; + # -1 means print in full, 0 disables +#log_statement = 'none' # none, ddl, mod, all +#log_replication_commands = off +#log_temp_files = -1 # log temporary files equal or larger + # than the specified size in kilobytes; + # -1 disables, 0 logs all temp files +#log_timezone = 'GMT' + + +#------------------------------------------------------------------------------ +# PROCESS TITLE +#------------------------------------------------------------------------------ + +#cluster_name = '' # added to process titles if nonempty + # (change requires restart) +#update_process_title = on + + +#------------------------------------------------------------------------------ +# STATISTICS +#------------------------------------------------------------------------------ + +# - Query and Index Statistics Collector - + +#track_activities = on +#track_activity_query_size = 1024 # (change requires restart) +#track_counts = on +#track_io_timing = off +#track_wal_io_timing = off +#track_functions = none # none, pl, all +#stats_temp_directory = 'pg_stat_tmp' + + +# - Monitoring - + +#compute_query_id = auto +#log_statement_stats = off +#log_parser_stats = off +#log_planner_stats = off +#log_executor_stats = off + + +#------------------------------------------------------------------------------ +# AUTOVACUUM +#------------------------------------------------------------------------------ + +#autovacuum = on # Enable autovacuum subprocess? 'on' + # requires track_counts to also be on. 
+#autovacuum_max_workers = 3 # max number of autovacuum subprocesses + # (change requires restart) +#autovacuum_naptime = 1min # time between autovacuum runs +#autovacuum_vacuum_threshold = 50 # min number of row updates before + # vacuum +#autovacuum_vacuum_insert_threshold = 1000 # min number of row inserts + # before vacuum; -1 disables insert + # vacuums +#autovacuum_analyze_threshold = 50 # min number of row updates before + # analyze +#autovacuum_vacuum_scale_factor = 0.2 # fraction of table size before vacuum +#autovacuum_vacuum_insert_scale_factor = 0.2 # fraction of inserts over table + # size before insert vacuum +#autovacuum_analyze_scale_factor = 0.1 # fraction of table size before analyze +#autovacuum_freeze_max_age = 200000000 # maximum XID age before forced vacuum + # (change requires restart) +#autovacuum_multixact_freeze_max_age = 400000000 # maximum multixact age + # before forced vacuum + # (change requires restart) +#autovacuum_vacuum_cost_delay = 2ms # default vacuum cost delay for + # autovacuum, in milliseconds; + # -1 means use vacuum_cost_delay +#autovacuum_vacuum_cost_limit = -1 # default vacuum cost limit for + # autovacuum, -1 means use + # vacuum_cost_limit + + +#------------------------------------------------------------------------------ +# CLIENT CONNECTION DEFAULTS +#------------------------------------------------------------------------------ + +# - Statement Behavior - + +#client_min_messages = notice # values in order of decreasing detail: + # debug5 + # debug4 + # debug3 + # debug2 + # debug1 + # log + # notice + # warning + # error +#search_path = '"$user", public' # schema names +#row_security = on +#default_table_access_method = 'heap' +#default_tablespace = '' # a tablespace name, '' uses the default +#default_toast_compression = 'pglz' # 'pglz' or 'lz4' +#temp_tablespaces = '' # a list of tablespace names, '' uses + # only default tablespace +#check_function_bodies = on +#default_transaction_isolation = 'read committed' +#default_transaction_read_only = off +#default_transaction_deferrable = off +#session_replication_role = 'origin' +#statement_timeout = 0 # in milliseconds, 0 is disabled +#lock_timeout = 0 # in milliseconds, 0 is disabled +#idle_in_transaction_session_timeout = 0 # in milliseconds, 0 is disabled +#idle_session_timeout = 0 # in milliseconds, 0 is disabled +#vacuum_freeze_table_age = 150000000 +#vacuum_freeze_min_age = 50000000 +#vacuum_failsafe_age = 1600000000 +#vacuum_multixact_freeze_table_age = 150000000 +#vacuum_multixact_freeze_min_age = 5000000 +#vacuum_multixact_failsafe_age = 1600000000 +#bytea_output = 'hex' # hex, escape +#xmlbinary = 'base64' +#xmloption = 'content' +#gin_pending_list_limit = 4MB + +# - Locale and Formatting - + +#datestyle = 'iso, mdy' +#intervalstyle = 'postgres' +#timezone = 'GMT' +#timezone_abbreviations = 'Default' # Select the set of available time zone + # abbreviations. Currently, there are + # Default + # Australia (historical usage) + # India + # You can create your own file in + # share/timezonesets/. +#extra_float_digits = 1 # min -15, max 3; any value >0 actually + # selects precise output mode +#client_encoding = sql_ascii # actually, defaults to database + # encoding + +# These settings are initialized by initdb, but they can be changed. 
+#lc_messages = 'C' # locale for system error message + # strings +#lc_monetary = 'C' # locale for monetary formatting +#lc_numeric = 'C' # locale for number formatting +#lc_time = 'C' # locale for time formatting + +# default configuration for text search +#default_text_search_config = 'pg_catalog.simple' + +# - Shared Library Preloading - + +#local_preload_libraries = '' +#session_preload_libraries = '' +#shared_preload_libraries = '' # (change requires restart) +#jit_provider = 'llvmjit' # JIT library to use + +# - Other Defaults - + +#dynamic_library_path = '$libdir' +#extension_destdir = '' # prepend path when loading extensions + # and shared objects (added by Debian) +#gin_fuzzy_search_limit = 0 + + +#------------------------------------------------------------------------------ +# LOCK MANAGEMENT +#------------------------------------------------------------------------------ + +#deadlock_timeout = 1s +#max_locks_per_transaction = 64 # min 10 + # (change requires restart) +#max_pred_locks_per_transaction = 64 # min 10 + # (change requires restart) +#max_pred_locks_per_relation = -2 # negative values mean + # (max_pred_locks_per_transaction + # / -max_pred_locks_per_relation) - 1 +#max_pred_locks_per_page = 2 # min 0 + + +#------------------------------------------------------------------------------ +# VERSION AND PLATFORM COMPATIBILITY +#------------------------------------------------------------------------------ + +# - Previous PostgreSQL Versions - + +#array_nulls = on +#backslash_quote = safe_encoding # on, off, or safe_encoding +#escape_string_warning = on +#lo_compat_privileges = off +#quote_all_identifiers = off +#standard_conforming_strings = on +#synchronize_seqscans = on + +# - Other Platforms and Clients - + +#transform_null_equals = off + + +#------------------------------------------------------------------------------ +# ERROR HANDLING +#------------------------------------------------------------------------------ + +#exit_on_error = off # terminate session on any error? +#restart_after_crash = on # reinitialize after backend crash? +#data_sync_retry = off # retry or panic on failure to fsync + # data? + # (change requires restart) +#recovery_init_sync_method = fsync # fsync, syncfs (Linux 5.8+) + + +#------------------------------------------------------------------------------ +# CONFIG FILE INCLUDES +#------------------------------------------------------------------------------ + +# These options allow settings to be loaded from files other than the +# default postgresql.conf. Note that these are directives, not variable +# assignments, so they can usefully be given more than once. + +#include_dir = '...' # include files ending in '.conf' from + # a directory, e.g., 'conf.d' +#include_if_exists = '...' # include file only if it exists +#include = '...' 
# include file + + +#------------------------------------------------------------------------------ +# CUSTOMIZED OPTIONS +#------------------------------------------------------------------------------ + +# Add settings for extensions here \ No newline at end of file From 79220b7ea3a1ff2b83c00d0419e8a37f405a6c49 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 17 Sep 2024 16:40:23 +0200 Subject: [PATCH 02/88] feat: got decoderbufs to run and compile in docker --- tests/postgres/check-replication.sh | 10 ++++++++++ tests/postgres/postgres/Dockerfile | 21 +++++++++++++++++++++ tests/postgres/postgresql.conf | 2 +- 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100755 tests/postgres/check-replication.sh diff --git a/tests/postgres/check-replication.sh b/tests/postgres/check-replication.sh new file mode 100755 index 000000000..2e51147d1 --- /dev/null +++ b/tests/postgres/check-replication.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -e + +docker exec dlt_postgres_db \ + psql -x -U loader -d dlt_data \ + -c "select *, + pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) as replicationSlotLag, + pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn)) as confirmedLag + from pg_replication_slots;" \ No newline at end of file diff --git a/tests/postgres/postgres/Dockerfile b/tests/postgres/postgres/Dockerfile index 1dfd569b5..e7f9aa73c 100644 --- a/tests/postgres/postgres/Dockerfile +++ b/tests/postgres/postgres/Dockerfile @@ -1,2 +1,23 @@ FROM postgres:14 + +# Install dependencies required to build decoderbufs +RUN apt-get update +RUN apt-get install -f -y \ + software-properties-common \ + build-essential \ + pkg-config \ + git + +RUN apt-get install -f -y \ + postgresql-server-dev-14 \ + libprotobuf-c-dev && \ + rm -rf /var/lib/apt/lists/* + +ARG decoderbufs_version=v1.7.0.Final +RUN git clone https://github.com/debezium/postgres-decoderbufs -b $decoderbufs_version --single-branch && \ + cd postgres-decoderbufs && \ + make && make install && \ + cd .. 
&& \ + rm -rf postgres-decoderbufs + COPY 01_init.sql /docker-entrypoint-initdb.d/ \ No newline at end of file diff --git a/tests/postgres/postgresql.conf b/tests/postgres/postgresql.conf index bbb13e1db..a85b40a37 100644 --- a/tests/postgres/postgresql.conf +++ b/tests/postgres/postgresql.conf @@ -720,7 +720,7 @@ wal_level = logical # minimal, replica, or logical #local_preload_libraries = '' #session_preload_libraries = '' -#shared_preload_libraries = '' # (change requires restart) +shared_preload_libraries = 'decoderbufs' # (change requires restart) #jit_provider = 'llvmjit' # JIT library to use # - Other Defaults - From 9de083555e459d9b452e243eb00fdb23c05ebb93 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 17 Sep 2024 18:59:13 +0200 Subject: [PATCH 03/88] chore: updated protobuf to latest compatible version --- pyproject.toml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a1a431d54..f3bd6f85d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,7 @@ pytest-mock = "^3.12.0" twisted = "22.10.0" pytest-forked = "^1.6.0" pendulum = "^3.0.0" +types-protobuf = "^5.27.0.20240907" [tool.poetry.group.sql_database.dependencies] sqlalchemy = ">=1.4" @@ -54,6 +55,10 @@ connectorx = ">=0.3.1" [tool.poetry.group.pg_replication.dependencies] psycopg2-binary = ">=2.9.9" +[tool.poetry.group.pg_legacy_replication.dependencies] +psycopg2-binary = ">=2.9.9" +protobuf = ">=4.25" + [tool.poetry.group.google_sheets.dependencies] google-api-python-client = "^2.78.0" @@ -116,4 +121,4 @@ requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" [tool.black] -include = '.*py$' +include = '.*py$' \ No newline at end of file From 75a0f7f0ab7c8f425eb5a5bba26aa341131a27ca Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 18 Sep 2024 12:47:21 +0200 Subject: [PATCH 04/88] chore: copying all files from pg_replication; format-lint is reformatting other sources -_- --- sources/pg_legacy_replication/README.md | 79 ++ sources/pg_legacy_replication/__init__.py | 103 +++ sources/pg_legacy_replication/decoders.py | 427 +++++++++ sources/pg_legacy_replication/exceptions.py | 14 + sources/pg_legacy_replication/helpers.py | 787 ++++++++++++++++ .../pg_logicaldec_pb2.py | 40 + .../pg_legacy_replication/requirements.txt | 3 + sources/pg_legacy_replication/schema_types.py | 125 +++ sources/pg_legacy_replication_pipeline.py | 292 ++++++ tests/pg_legacy_replication/__init__.py | 0 tests/pg_legacy_replication/cases.py | 94 ++ tests/pg_legacy_replication/conftest.py | 44 + .../test_pg_replication.py | 868 ++++++++++++++++++ tests/pg_legacy_replication/utils.py | 52 ++ 14 files changed, 2928 insertions(+) create mode 100644 sources/pg_legacy_replication/README.md create mode 100644 sources/pg_legacy_replication/__init__.py create mode 100644 sources/pg_legacy_replication/decoders.py create mode 100644 sources/pg_legacy_replication/exceptions.py create mode 100644 sources/pg_legacy_replication/helpers.py create mode 100644 sources/pg_legacy_replication/pg_logicaldec_pb2.py create mode 100644 sources/pg_legacy_replication/requirements.txt create mode 100644 sources/pg_legacy_replication/schema_types.py create mode 100644 sources/pg_legacy_replication_pipeline.py create mode 100644 tests/pg_legacy_replication/__init__.py create mode 100644 tests/pg_legacy_replication/cases.py create mode 100644 tests/pg_legacy_replication/conftest.py create mode 100644 tests/pg_legacy_replication/test_pg_replication.py create mode 100644 
tests/pg_legacy_replication/utils.py diff --git a/sources/pg_legacy_replication/README.md b/sources/pg_legacy_replication/README.md new file mode 100644 index 000000000..f34fcd4d6 --- /dev/null +++ b/sources/pg_legacy_replication/README.md @@ -0,0 +1,79 @@ +# Postgres replication +[Postgres](https://www.postgresql.org/) is one of the most popular relational database management systems. This verified source uses Postgres' replication functionality to efficiently process changes in tables (a process often referred to as _Change Data Capture_ or CDC). It uses [logical decoding](https://www.postgresql.org/docs/current/logicaldecoding.html) and the standard built-in `pgoutput` [output plugin](https://www.postgresql.org/docs/current/logicaldecoding-output-plugin.html). + +Resources that can be loaded using this verified source are: + +| Name | Description | +|----------------------|-------------------------------------------------| +| replication_resource | Load published messages from a replication slot | + +## Initialize the pipeline + +```bash +dlt init pg_replication duckdb +``` + +This uses `duckdb` as destination, but you can choose any of the supported [destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/). + +## Add `sql_database` source + +```bash +dlt init sql_database duckdb +``` + +This source depends on the [sql_database](../sql_database/README.md) verified source internally to perform initial loads. This step can be skipped if you don't do initial loads. +## Set up user + +The Postgres user needs to have the `LOGIN` and `REPLICATION` attributes assigned: + +```sql +CREATE ROLE replication_user WITH LOGIN REPLICATION; +``` + +It also needs `CREATE` privilege on the database: + +```sql +GRANT CREATE ON DATABASE dlt_data TO replication_user; +``` + +### Set up RDS +1. You must enable replication for RDS Postgres instance via **Parameter Group**: https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_PostgreSQL.Replication.ReadReplicas.html +2. `WITH LOGIN REPLICATION;` does not work on RDS, instead do: +```sql +GRANT rds_replication TO replication_user; +``` +3. Do not fallback to non SSL connection by setting connection parameters: +```toml +sources.pg_replication.credentials="postgresql://loader:password@host.rds.amazonaws.com:5432/dlt_data?sslmode=require&connect_timeout=300" +``` + + +## Add credentials +1. Open `.dlt/secrets.toml`. +2. Enter your Postgres credentials: + + ```toml + [sources.pg_replication] + credentials="postgresql://replication_user:<>@localhost:5432/dlt_data" + ``` +3. Enter credentials for your chosen destination as per the [docs](https://dlthub.com/docs/dlt-ecosystem/destinations/). + +## Run the pipeline + +1. Install the necessary dependencies by running the following command: + + ```bash + pip install -r requirements.txt + ``` + +1. Now the pipeline can be run by using the command: + + ```bash + python pg_replication_pipeline.py + ``` + +1. 
To make sure that everything is loaded as expected, use the command: + + ```bash + dlt pipeline pg_replication_pipeline show + ``` \ No newline at end of file diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py new file mode 100644 index 000000000..74482e226 --- /dev/null +++ b/sources/pg_legacy_replication/__init__.py @@ -0,0 +1,103 @@ +"""Replicates postgres tables in batch using logical decoding.""" + +from typing import Dict, Sequence, Optional, Iterable, Union + +import dlt + +from dlt.common.typing import TDataItem +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.extract.items import DataItemWithMeta +from dlt.sources.credentials import ConnectionStringCredentials + +from .helpers import advance_slot, get_max_lsn, ItemGenerator + + +@dlt.resource( + name=lambda args: args["slot_name"] + "_" + args["pub_name"], + standalone=True, +) +def replication_resource( + slot_name: str, + pub_name: str, + credentials: ConnectionStringCredentials = dlt.secrets.value, + include_columns: Optional[Dict[str, Sequence[str]]] = None, + columns: Optional[Dict[str, TTableSchemaColumns]] = None, + target_batch_size: int = 1000, + flush_slot: bool = True, +) -> Iterable[Union[TDataItem, DataItemWithMeta]]: + """Resource yielding data items for changes in one or more postgres tables. + + - Relies on a replication slot and publication that publishes DML operations + (i.e. `insert`, `update`, and/or `delete`). Helper `init_replication` can be + used to set this up. + - Maintains LSN of last consumed message in state to track progress. + - At start of the run, advances the slot upto last consumed message in previous run. + - Processes in batches to limit memory usage. + + Args: + slot_name (str): Name of the replication slot to consume replication messages from. + pub_name (str): Name of the publication that publishes DML operations for the table(s). + credentials (ConnectionStringCredentials): Postgres database credentials. + include_columns (Optional[Dict[str, Sequence[str]]]): Maps table name(s) to + sequence of names of columns to include in the generated data items. + Any column not in the sequence is excluded. If not provided, all columns + are included. For example: + ``` + include_columns={ + "table_x": ["col_a", "col_c"], + "table_y": ["col_x", "col_y", "col_z"], + } + ``` + columns (Optional[Dict[str, TTableHintTemplate[TAnySchemaColumns]]]): Maps + table name(s) to column hints to apply on the replicated table(s). For example: + ``` + columns={ + "table_x": {"col_a": {"data_type": "complex"}}, + "table_y": {"col_y": {"precision": 32}}, + } + ``` + target_batch_size (int): Desired number of data items yielded in a batch. + Can be used to limit the data items in memory. Note that the number of + data items yielded can be (far) greater than `target_batch_size`, because + all messages belonging to the same transaction are always processed in + the same batch, regardless of the number of messages in the transaction + and regardless of the value of `target_batch_size`. The number of data + items can also be smaller than `target_batch_size` when the replication + slot is exhausted before a batch is full. + flush_slot (bool): Whether processed messages are discarded from the replication + slot. Recommended value is True. 
Be careful when setting False—not flushing + can eventually lead to a “disk full” condition on the server, because + the server retains all the WAL segments that might be needed to stream + the changes via all of the currently open replication slots. + + Yields: + Data items for changes published in the publication. + """ + # start where we left off in previous run + start_lsn = dlt.current.resource_state().get("last_commit_lsn", 0) + if flush_slot: + advance_slot(start_lsn, slot_name, credentials) + + # continue until last message in replication slot + options = {"publication_names": pub_name, "proto_version": "1"} + upto_lsn = get_max_lsn(slot_name, options, credentials) + if upto_lsn is None: + return + + # generate items in batches + while True: + gen = ItemGenerator( + credentials=credentials, + slot_name=slot_name, + options=options, + upto_lsn=upto_lsn, + start_lsn=start_lsn, + target_batch_size=target_batch_size, + include_columns=include_columns, + columns=columns, + ) + yield from gen + if gen.generated_all: + dlt.current.resource_state()["last_commit_lsn"] = gen.last_commit_lsn + break + start_lsn = gen.last_commit_lsn diff --git a/sources/pg_legacy_replication/decoders.py b/sources/pg_legacy_replication/decoders.py new file mode 100644 index 000000000..c2707b46a --- /dev/null +++ b/sources/pg_legacy_replication/decoders.py @@ -0,0 +1,427 @@ +# flake8: noqa +# file copied from https://raw.githubusercontent.com/dgea005/pypgoutput/master/src/pypgoutput/decoders.py +# we do this instead of importing `pypgoutput` because it depends on `psycopg2`, which causes errors when installing on macOS + +import io +from abc import ABC, abstractmethod +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from typing import List, Optional, Union + +# integer byte lengths +INT8 = 1 +INT16 = 2 +INT32 = 4 +INT64 = 8 + + +def convert_pg_ts(_ts_in_microseconds: int) -> datetime: + ts = datetime(2000, 1, 1, 0, 0, 0, 0, tzinfo=timezone.utc) + return ts + timedelta(microseconds=_ts_in_microseconds) + + +def convert_bytes_to_int(_in_bytes: bytes) -> int: + return int.from_bytes(_in_bytes, byteorder="big", signed=True) + + +def convert_bytes_to_utf8(_in_bytes: Union[bytes, bytearray]) -> str: + return (_in_bytes).decode("utf-8") + + +@dataclass(frozen=True) +class ColumnData: + # col_data_category is NOT the type. 
it means null value/toasted(not sent)/text formatted + col_data_category: Optional[str] + col_data_length: Optional[int] = None + col_data: Optional[str] = None + + def __repr__(self) -> str: + return f"[col_data_category='{self.col_data_category}', col_data_length={self.col_data_length}, col_data='{self.col_data}']" + + +@dataclass(frozen=True) +class ColumnType: + """https://www.postgresql.org/docs/12/catalog-pg-attribute.html""" + + part_of_pkey: int + name: str + type_id: int + atttypmod: int + + +@dataclass(frozen=True) +class TupleData: + n_columns: int + column_data: List[ColumnData] + + def __repr__(self) -> str: + return f"n_columns: {self.n_columns}, data: {self.column_data}" + + +class PgoutputMessage(ABC): + def __init__(self, buffer: bytes): + self.buffer: io.BytesIO = io.BytesIO(buffer) + self.byte1: str = self.read_utf8(1) + self.decode_buffer() + + @abstractmethod + def decode_buffer(self) -> None: + """Decoding is implemented for each message type""" + + @abstractmethod + def __repr__(self) -> str: + """Implemented for each message type""" + + def read_int8(self) -> int: + return convert_bytes_to_int(self.buffer.read(INT8)) + + def read_int16(self) -> int: + return convert_bytes_to_int(self.buffer.read(INT16)) + + def read_int32(self) -> int: + return convert_bytes_to_int(self.buffer.read(INT32)) + + def read_int64(self) -> int: + return convert_bytes_to_int(self.buffer.read(INT64)) + + def read_utf8(self, n: int = 1) -> str: + return convert_bytes_to_utf8(self.buffer.read(n)) + + def read_timestamp(self) -> datetime: + # 8 chars -> int64 -> timestamp + return convert_pg_ts(_ts_in_microseconds=self.read_int64()) + + def read_string(self) -> str: + output = bytearray() + while (next_char := self.buffer.read(1)) != b"\x00": + output += next_char + return convert_bytes_to_utf8(output) + + def read_tuple_data(self) -> TupleData: + """ + TupleData + Int16 Number of columns. + Next, one of the following submessages appears for each column (except generated columns): + Byte1('n') Identifies the data as NULL value. + Or + Byte1('u') Identifies unchanged TOASTed value (the actual value is not sent). + Or + Byte1('t') Identifies the data as text formatted value. + Int32 Length of the column value. + Byten The value of the column, in text format. (A future release might support additional formats.) n is the above length. + """ + # TODO: investigate what happens with the generated columns + column_data = list() + n_columns = self.read_int16() + for column in range(n_columns): + col_data_category = self.read_utf8() + if col_data_category in ("n", "u"): + # "n"=NULL, "t"=TOASTed + column_data.append(ColumnData(col_data_category=col_data_category)) + elif col_data_category == "t": + # t = tuple + col_data_length = self.read_int32() + col_data = self.read_utf8(col_data_length) + column_data.append( + ColumnData( + col_data_category=col_data_category, + col_data_length=col_data_length, + col_data=col_data, + ) + ) + return TupleData(n_columns=n_columns, column_data=column_data) + + +class Begin(PgoutputMessage): + """ + https://pgpedia.info/x/xlogrecptr.html + https://www.postgresql.org/docs/14/datatype-pg-lsn.html + + byte1 Byte1('B') Identifies the message as a begin message. + lsn Int64 The final LSN of the transaction. + commit_tx_ts Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). + tx_xid Int32 Xid of the transaction. 
+ """ + + byte1: str + lsn: int + commit_ts: datetime + tx_xid: int + + def decode_buffer(self) -> None: + if self.byte1 != "B": + raise ValueError("first byte in buffer does not match Begin message") + self.lsn = self.read_int64() + self.commit_ts = self.read_timestamp() + self.tx_xid = self.read_int64() + + def __repr__(self) -> str: + return ( + f"BEGIN \n\tbyte1: '{self.byte1}', \n\tLSN: {self.lsn}, " + f"\n\tcommit_ts {self.commit_ts}, \n\ttx_xid: {self.tx_xid}" + ) + + +class Commit(PgoutputMessage): + """ + byte1: Byte1('C') Identifies the message as a commit message. + flags: Int8 Flags; currently unused (must be 0). + lsn_commit: Int64 The LSN of the commit. + lsn: Int64 The end LSN of the transaction. + Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). + """ + + byte1: str + flags: int + lsn_commit: int + lsn: int + commit_ts: datetime + + def decode_buffer(self) -> None: + if self.byte1 != "C": + raise ValueError("first byte in buffer does not match Commit message") + self.flags = self.read_int8() + self.lsn_commit = self.read_int64() + self.lsn = self.read_int64() + self.commit_ts = self.read_timestamp() + + def __repr__(self) -> str: + return ( + f"COMMIT \n\tbyte1: {self.byte1}, \n\tflags {self.flags}, \n\tlsn_commit: {self.lsn_commit}" + f"\n\tLSN: {self.lsn}, \n\tcommit_ts {self.commit_ts}" + ) + + +class Origin: + """ + Byte1('O') Identifies the message as an origin message. + Int64 The LSN of the commit on the origin server. + String Name of the origin. + Note that there can be multiple Origin messages inside a single transaction. + This seems to be what origin means: https://www.postgresql.org/docs/12/replication-origins.html + """ + + pass + + +class Relation(PgoutputMessage): + """ + Byte1('R') Identifies the message as a relation message. + Int32 ID of the relation. + String Namespace (empty string for pg_catalog). + String Relation name. + Int8 Replica identity setting for the relation (same as relreplident in pg_class). + # select relreplident from pg_class where relname = 'test_table'; + # from reading the documentation and looking at the tables this is not int8 but a single character + # background: https://www.postgresql.org/docs/10/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY + Int16 Number of columns. + Next, the following message part appears for each column (except generated columns): + Int8 Flags for the column. Currently can be either 0 for no flags or 1 which marks the column as part of the key. + String Name of the column. + Int32 ID of the column's data type. + Int32 Type modifier of the column (atttypmod). 
+ """ + + byte1: str + relation_id: int + namespace: str + relation_name: str + replica_identity_setting: str + n_columns: int + columns: List[ColumnType] + + def decode_buffer(self) -> None: + if self.byte1 != "R": + raise ValueError("first byte in buffer does not match Relation message") + self.relation_id = self.read_int32() + self.namespace = self.read_string() + self.relation_name = self.read_string() + self.replica_identity_setting = self.read_utf8() + self.n_columns = self.read_int16() + self.columns = list() + + for column in range(self.n_columns): + part_of_pkey = self.read_int8() + col_name = self.read_string() + data_type_id = self.read_int32() + # TODO: check on use of signed / unsigned + # check with select oid from pg_type where typname = ; timestamp == 1184, int4 = 23 + col_modifier = self.read_int32() + self.columns.append( + ColumnType( + part_of_pkey=part_of_pkey, + name=col_name, + type_id=data_type_id, + atttypmod=col_modifier, + ) + ) + + def __repr__(self) -> str: + return ( + f"RELATION \n\tbyte1: '{self.byte1}', \n\trelation_id: {self.relation_id}" + f",\n\tnamespace/schema: '{self.namespace}',\n\trelation_name: '{self.relation_name}'" + f",\n\treplica_identity_setting: '{self.replica_identity_setting}',\n\tn_columns: {self.n_columns} " + f",\n\tcolumns: {self.columns}" + ) + + +class PgType: + """ + Renamed to PgType not to collide with "type" + + Byte1('Y') Identifies the message as a type message. + Int32 ID of the data type. + String Namespace (empty string for pg_catalog). + String Name of the data type. + """ + + pass + + +class Insert(PgoutputMessage): + """ + Byte1('I') Identifies the message as an insert message. + Int32 ID of the relation corresponding to the ID in the relation message. + Byte1('N') Identifies the following TupleData message as a new tuple. + TupleData TupleData message part representing the contents of new tuple. + """ + + byte1: str + relation_id: int + new_tuple_byte: str + new_tuple: TupleData + + def decode_buffer(self) -> None: + if self.byte1 != "I": + raise ValueError( + f"first byte in buffer does not match Insert message (expected 'I', got '{self.byte1}'" + ) + self.relation_id = self.read_int32() + self.new_tuple_byte = self.read_utf8() + self.new_tuple = self.read_tuple_data() + + def __repr__(self) -> str: + return ( + f"INSERT \n\tbyte1: '{self.byte1}', \n\trelation_id: {self.relation_id} " + f"\n\tnew tuple byte: '{self.new_tuple_byte}', \n\tnew_tuple: {self.new_tuple}" + ) + + +class Update(PgoutputMessage): + """ + Byte1('U') Identifies the message as an update message. + Int32 ID of the relation corresponding to the ID in the relation message. + Byte1('K') Identifies the following TupleData submessage as a key. This field is optional and is only present if the update changed data in any of the column(s) that are part of the REPLICA IDENTITY index. + Byte1('O') Identifies the following TupleData submessage as an old tuple. This field is optional and is only present if table in which the update happened has REPLICA IDENTITY set to FULL. + TupleData TupleData message part representing the contents of the old tuple or primary key. Only present if the previous 'O' or 'K' part is present. + Byte1('N') Identifies the following TupleData message as a new tuple. + TupleData TupleData message part representing the contents of a new tuple. + + The Update message may contain either a 'K' message part or an 'O' message part or neither of them, but never both of them. 
+ """ + + byte1: str + relation_id: int + next_byte_identifier: Optional[str] + optional_tuple_identifier: Optional[str] + old_tuple: Optional[TupleData] + new_tuple_byte: str + new_tuple: TupleData + + def decode_buffer(self) -> None: + self.optional_tuple_identifier = None + self.old_tuple = None + if self.byte1 != "U": + raise ValueError( + f"first byte in buffer does not match Update message (expected 'U', got '{self.byte1}'" + ) + self.relation_id = self.read_int32() + # TODO test update to PK, test update with REPLICA IDENTITY = FULL + self.next_byte_identifier = self.read_utf8() # one of K, O or N + if self.next_byte_identifier == "K" or self.next_byte_identifier == "O": + self.optional_tuple_identifier = self.next_byte_identifier + self.old_tuple = self.read_tuple_data() + self.new_tuple_byte = self.read_utf8() + else: + self.new_tuple_byte = self.next_byte_identifier + if self.new_tuple_byte != "N": + # TODO: test exception handling + raise ValueError( + f"did not find new_tuple_byte ('N') at position: {self.buffer.tell()}, found: '{self.new_tuple_byte}'" + ) + self.new_tuple = self.read_tuple_data() + + def __repr__(self) -> str: + return ( + f"UPDATE \n\tbyte1: '{self.byte1}', \n\trelation_id: {self.relation_id}" + f"\n\toptional_tuple_identifier: '{self.optional_tuple_identifier}', \n\toptional_old_tuple_data: {self.old_tuple}" + f"\n\tnew_tuple_byte: '{self.new_tuple_byte}', \n\tnew_tuple: {self.new_tuple}" + ) + + +class Delete(PgoutputMessage): + """ + Byte1('D') Identifies the message as a delete message. + Int32 ID of the relation corresponding to the ID in the relation message. + Byte1('K') Identifies the following TupleData submessage as a key. This field is present if the table in which the delete has happened uses an index as REPLICA IDENTITY. + Byte1('O') Identifies the following TupleData message as a old tuple. This field is present if the table in which the delete has happened has REPLICA IDENTITY set to FULL. + TupleData TupleData message part representing the contents of the old tuple or primary key, depending on the previous field. + + The Delete message may contain either a 'K' message part or an 'O' message part, but never both of them. + """ + + byte1: str + relation_id: int + message_type: str + old_tuple: TupleData + + def decode_buffer(self) -> None: + if self.byte1 != "D": + raise ValueError( + f"first byte in buffer does not match Delete message (expected 'D', got '{self.byte1}'" + ) + self.relation_id = self.read_int32() + self.message_type = self.read_utf8() + # TODO: test with replica identity full + if self.message_type not in ["K", "O"]: + raise ValueError( + f"message type byte is not 'K' or 'O', got: '{self.message_type}'" + ) + self.old_tuple = self.read_tuple_data() + + def __repr__(self) -> str: + return ( + f"DELETE \n\tbyte1: {self.byte1} \n\trelation_id: {self.relation_id} " + f"\n\tmessage_type: {self.message_type} \n\told_tuple: {self.old_tuple}" + ) + + +class Truncate(PgoutputMessage): + """ + Byte1('T') Identifies the message as a truncate message. + Int32 Number of relations + Int8 Option bits for TRUNCATE: 1 for CASCADE, 2 for RESTART IDENTITY + Int32 ID of the relation corresponding to the ID in the relation message. This field is repeated for each relation. 
+ """ + + byte1: str + number_of_relations: int + option_bits: int + relation_ids: List[int] + + def decode_buffer(self) -> None: + if self.byte1 != "T": + raise ValueError( + f"first byte in buffer does not match Truncate message (expected 'T', got '{self.byte1}'" + ) + self.number_of_relations = self.read_int32() + self.option_bits = self.read_int8() + self.relation_ids = [] + for relation in range(self.number_of_relations): + self.relation_ids.append(self.read_int32()) + + def __repr__(self) -> str: + return ( + f"TRUNCATE \n\tbyte1: {self.byte1} \n\tn_relations: {self.number_of_relations} " + f"option_bits: {self.option_bits}, relation_ids: {self.relation_ids}" + ) diff --git a/sources/pg_legacy_replication/exceptions.py b/sources/pg_legacy_replication/exceptions.py new file mode 100644 index 000000000..df52c4bab --- /dev/null +++ b/sources/pg_legacy_replication/exceptions.py @@ -0,0 +1,14 @@ +class NoPrimaryKeyException(Exception): + pass + + +class IncompatiblePostgresVersionException(Exception): + pass + + +class SqlDatabaseSourceImportError(Exception): + def __init__(self) -> None: + super().__init__( + "Could not import `sql_database` source. Run `dlt init sql_database `" + " to download the source code." + ) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py new file mode 100644 index 000000000..47e743542 --- /dev/null +++ b/sources/pg_legacy_replication/helpers.py @@ -0,0 +1,787 @@ +from typing import ( + Optional, + Dict, + Iterator, + Union, + List, + Sequence, + Any, +) +from dataclasses import dataclass, field + +import psycopg2 +from psycopg2.extensions import cursor, connection as ConnectionExt +from psycopg2.extras import ( + LogicalReplicationConnection, + ReplicationCursor, + ReplicationMessage, + StopReplication, +) + +import dlt + +from dlt.common import logger +from dlt.common.typing import TDataItem +from dlt.common.pendulum import pendulum +from dlt.common.schema.typing import ( + TTableSchema, + TTableSchemaColumns, + TColumnNames, + TWriteDisposition, +) +from dlt.common.schema.utils import merge_column +from dlt.common.data_writers.escape import escape_postgres_identifier +from dlt.extract.items import DataItemWithMeta +from dlt.extract.resource import DltResource +from dlt.sources.credentials import ConnectionStringCredentials + +from .schema_types import _to_dlt_column_schema, _to_dlt_val +from .exceptions import IncompatiblePostgresVersionException +from .decoders import ( + Begin, + Relation, + Insert, + Update, + Delete, + ColumnData, +) + + +@dlt.sources.config.with_config(sections=("sources", "pg_replication")) +def init_replication( + slot_name: str, + pub_name: str, + schema_name: str, + table_names: Optional[Union[str, Sequence[str]]] = None, + credentials: ConnectionStringCredentials = dlt.secrets.value, + publish: str = "insert, update, delete", + persist_snapshots: bool = False, + include_columns: Optional[Dict[str, Sequence[str]]] = None, + columns: Optional[Dict[str, TTableSchemaColumns]] = None, + reset: bool = False, +) -> Optional[Union[DltResource, List[DltResource]]]: + """Initializes replication for one, several, or all tables within a schema. 
+
+ Can be called repeatedly with the same `slot_name` and `pub_name`:
+ - creates a replication slot and publication with provided names if they do not exist yet
+ - skips creation of slot and publication if they already exist (unless `reset` is set to `True`)
+ - supports addition of new tables by extending `table_names`
+ - removing tables is not supported, i.e. excluding a table from `table_names`
+ will not remove it from the publication
+ - switching from a table selection to an entire schema is possible by omitting
+ the `table_names` argument
+ - changing `publish` has no effect (altering the published DML operations is not supported)
+ - table snapshots can only be persisted on the first call (because the snapshot
+ is exported when the slot is created)
+
+ Args:
+ slot_name (str): Name of the replication slot to create if it does not exist yet.
+ pub_name (str): Name of the publication to create if it does not exist yet.
+ schema_name (str): Name of the schema to replicate tables from.
+ table_names (Optional[Union[str, Sequence[str]]]): Name(s) of the table(s)
+ to include in the publication. If not provided, all tables in the schema
+ are included (also tables added to the schema after the publication was created).
+ credentials (ConnectionStringCredentials): Postgres database credentials.
+ publish (str): Comma-separated string of DML operations. Can be used to
+ control which changes are included in the publication. Allowed operations
+ are `insert`, `update`, and `delete`. `truncate` is currently not
+ supported—messages of that type are ignored.
+ E.g. `publish="insert"` will create a publication that only publishes insert operations.
+ persist_snapshots (bool): Whether the table states in the snapshot exported
+ during replication slot creation are persisted to tables. If `True`, a
+ snapshot table is created in Postgres for all included tables, and corresponding
+ resources (`DltResource` objects) for these tables are created and returned.
+ The resources can be used to perform an initial load of all data present
+ in the tables at the moment the replication slot was created.
+ include_columns (Optional[Dict[str, Sequence[str]]]): Maps table name(s) to
+ sequence of names of columns to include in the snapshot table(s).
+ Any column not in the sequence is excluded. If not provided, all columns
+ are included. For example:
+ ```
+ include_columns={
+ "table_x": ["col_a", "col_c"],
+ "table_y": ["col_x", "col_y", "col_z"],
+ }
+ ```
+ Argument is only used if `persist_snapshots` is `True`.
+ columns (Optional[Dict[str, TTableSchemaColumns]]): Maps
+ table name(s) to column hints to apply on the snapshot table resource(s).
+ For example:
+ ```
+ columns={
+ "table_x": {"col_a": {"data_type": "complex"}},
+ "table_y": {"col_y": {"precision": 32}},
+ }
+ ```
+ Argument is only used if `persist_snapshots` is `True`.
+ reset (bool): If set to True, the existing slot and publication are dropped
+ and recreated. Has no effect if a slot and publication with the provided
+ names do not yet exist.
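+
+ Example (a minimal illustrative call; the slot, publication, schema, and table names are placeholders):
+ ```
+ init_replication(
+ slot_name="my_slot",
+ pub_name="my_pub",
+ schema_name="my_schema",
+ table_names=["table_x", "table_y"],
+ persist_snapshots=True,
+ )
+ ```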
+ + Returns: + - None if `persist_snapshots` is `False` + - a `DltResource` object or a list of `DltResource` objects for the snapshot + table(s) if `persist_snapshots` is `True` and the replication slot did not yet exist + """ + if persist_snapshots: + _import_sql_table_resource() + if isinstance(table_names, str): + table_names = [table_names] + cur = _get_rep_conn(credentials).cursor() + if reset: + drop_replication_slot(slot_name, cur) + drop_publication(pub_name, cur) + create_publication(pub_name, cur, publish) + if table_names is None: + add_schema_to_publication(schema_name, pub_name, cur) + else: + add_tables_to_publication(table_names, schema_name, pub_name, cur) + slot = create_replication_slot(slot_name, cur) + if persist_snapshots: + if slot is None: + logger.info( + "Cannot persist snapshots because they do not exist. " + f'The replication slot "{slot_name}" already existed prior to calling this function.' + ) + else: + # need separate session to read the snapshot: https://stackoverflow.com/q/75852587 + cur_snap = _get_conn(credentials).cursor() + snapshot_table_names = [ + persist_snapshot_table( + snapshot_name=slot["snapshot_name"], + table_name=table_name, + schema_name=schema_name, + cur=cur_snap, + include_columns=( + None + if include_columns is None + else include_columns.get(table_name) + ), + ) + for table_name in table_names + ] + snapshot_table_resources = [ + snapshot_table_resource( + snapshot_table_name=snapshot_table_name, + schema_name=schema_name, + table_name=table_name, + write_disposition="append" if publish == "insert" else "merge", + columns=None if columns is None else columns.get(table_name), + credentials=credentials, + ) + for table_name, snapshot_table_name in zip( + table_names, snapshot_table_names + ) + ] + if len(snapshot_table_resources) == 1: + return snapshot_table_resources[0] + return snapshot_table_resources + return None + + +@dlt.sources.config.with_config(sections=("sources", "pg_replication")) +def get_pg_version( + cur: cursor = None, + credentials: ConnectionStringCredentials = dlt.secrets.value, +) -> int: + """Returns Postgres server version as int.""" + if cur is not None: + return cur.connection.server_version + return _get_conn(credentials).server_version + + +def create_publication( + name: str, + cur: cursor, + publish: str = "insert, update, delete", +) -> None: + """Creates a publication for logical replication if it doesn't exist yet. + + Does nothing if the publication already exists. + Raises error if the user does not have the CREATE privilege for the database. + """ + esc_name = escape_postgres_identifier(name) + try: + cur.execute(f"CREATE PUBLICATION {esc_name} WITH (publish = '{publish}');") + logger.info( + f"Successfully created publication {esc_name} with publish = '{publish}'." + ) + except psycopg2.errors.DuplicateObject: # the publication already exists + logger.info(f'Publication "{name}" already exists.') + + +def add_table_to_publication( + table_name: str, + schema_name: str, + pub_name: str, + cur: cursor, +) -> None: + """Adds a table to a publication for logical replication. + + Does nothing if the table is already a member of the publication. + Raises error if the user is not owner of the table. + """ + qual_name = _make_qualified_table_name(table_name, schema_name) + esc_pub_name = escape_postgres_identifier(pub_name) + try: + cur.execute(f"ALTER PUBLICATION {esc_pub_name} ADD TABLE {qual_name};") + logger.info( + f"Successfully added table {qual_name} to publication {esc_pub_name}." 
+ )
+ except psycopg2.errors.DuplicateObject:
+ logger.info(
+ f"Table {qual_name} is already a member of publication {esc_pub_name}."
+ )
+
+
+def add_tables_to_publication(
+ table_names: Union[str, Sequence[str]],
+ schema_name: str,
+ pub_name: str,
+ cur: cursor,
+) -> None:
+ """Adds one or multiple tables to a publication for logical replication.
+
+ Calls `add_table_to_publication` for each table in `table_names`.
+ """
+ if isinstance(table_names, str):
+ table_names = [table_names]
+ for table_name in table_names:
+ add_table_to_publication(table_name, schema_name, pub_name, cur)
+
+
+def add_schema_to_publication(
+ schema_name: str,
+ pub_name: str,
+ cur: cursor,
+) -> None:
+ """Adds a schema to a publication for logical replication if the schema is not a member yet.
+
+ Raises error if the user is not a superuser.
+ """
+ if (version := get_pg_version(cur)) < 150000:
+ raise IncompatiblePostgresVersionException(
+ f"Cannot add schema to publication because the Postgres server version {version} is too low."
+ " Adding schemas to a publication is only supported for Postgres version 15 or higher."
+ " Upgrade your Postgres server version or set the `table_names` argument to explicitly specify table names."
+ )
+ esc_schema_name = escape_postgres_identifier(schema_name)
+ esc_pub_name = escape_postgres_identifier(pub_name)
+ try:
+ cur.execute(
+ f"ALTER PUBLICATION {esc_pub_name} ADD TABLES IN SCHEMA {esc_schema_name};"
+ )
+ logger.info(
+ f"Successfully added schema {esc_schema_name} to publication {esc_pub_name}."
+ )
+ except psycopg2.errors.DuplicateObject:
+ logger.info(
+ f"Schema {esc_schema_name} is already a member of publication {esc_pub_name}."
+ )
+
+
+def create_replication_slot( # type: ignore[return]
+ name: str, cur: ReplicationCursor, output_plugin: str = "pgoutput"
+) -> Optional[Dict[str, str]]:
+ """Creates a replication slot if it doesn't exist yet."""
+ try:
+ cur.create_replication_slot(name, output_plugin=output_plugin)
+ logger.info(f'Successfully created replication slot "{name}".')
+ result = cur.fetchone()
+ return {
+ "slot_name": result[0],
+ "consistent_point": result[1],
+ "snapshot_name": result[2],
+ "output_plugin": result[3],
+ }
+ except psycopg2.errors.DuplicateObject: # the replication slot already exists
+ logger.info(
+ f'Replication slot "{name}" cannot be created because it already exists.'
+ )
+
+
+def drop_replication_slot(name: str, cur: ReplicationCursor) -> None:
+ """Drops a replication slot if it exists."""
+ try:
+ cur.drop_replication_slot(name)
+ logger.info(f'Successfully dropped replication slot "{name}".')
+ except psycopg2.errors.UndefinedObject: # the replication slot does not exist
+ logger.info(
+ f'Replication slot "{name}" cannot be dropped because it does not exist.'
+ )
+
+
+def drop_publication(name: str, cur: ReplicationCursor) -> None:
+ """Drops a publication if it exists."""
+ esc_name = escape_postgres_identifier(name)
+ try:
+ cur.execute(f"DROP PUBLICATION {esc_name};")
+ cur.connection.commit()
+ logger.info(f"Successfully dropped publication {esc_name}.")
+ except psycopg2.errors.UndefinedObject: # the publication does not exist
+ logger.info(
+ f"Publication {esc_name} cannot be dropped because it does not exist."
+ )
+
+
+def persist_snapshot_table(
+ snapshot_name: str,
+ table_name: str,
+ schema_name: str,
+ cur: cursor,
+ include_columns: Optional[Sequence[str]] = None,
+) -> str:
+ """Persists exported snapshot table state.
+
+ Reads snapshot table content and copies it into a new table.
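+
+ The copy is performed with SQL along these lines (a sketch; the schema, table, and snapshot names are placeholders):
+ ```
+ START TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+ SET TRANSACTION SNAPSHOT '<snapshot_name>';
+ CREATE TABLE "schema"."table_snapshot_<snapshot_name>" AS SELECT * FROM "schema"."table";
+ ```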
+ """ + col_str = "*" + if include_columns is not None: + col_str = ", ".join(map(escape_postgres_identifier, include_columns)) + snapshot_table_name = f"{table_name}_snapshot_{snapshot_name}" + snapshot_qual_name = _make_qualified_table_name(snapshot_table_name, schema_name) + qual_name = _make_qualified_table_name(table_name, schema_name) + cur.execute( + f""" + START TRANSACTION ISOLATION LEVEL REPEATABLE READ; + SET TRANSACTION SNAPSHOT '{snapshot_name}'; + CREATE TABLE {snapshot_qual_name} AS SELECT {col_str} FROM {qual_name}; + """ + ) + cur.connection.commit() + logger.info(f"Successfully persisted snapshot table state in {snapshot_qual_name}.") + return snapshot_table_name + + +def snapshot_table_resource( + snapshot_table_name: str, + schema_name: str, + table_name: str, + write_disposition: TWriteDisposition, + columns: TTableSchemaColumns = None, + credentials: ConnectionStringCredentials = dlt.secrets.value, +) -> DltResource: + """Returns a resource for a persisted snapshot table. + + Can be used to perform an initial load of the table, so all data that + existed in the table prior to initializing replication is also captured. + """ + resource: DltResource = sql_table( # type: ignore[name-defined] + credentials=credentials, + table=snapshot_table_name, + schema=schema_name, + detect_precision_hints=True, + ) + primary_key = _get_pk(table_name, schema_name, credentials) + resource.apply_hints( + table_name=table_name, + write_disposition=write_disposition, + columns=columns, + primary_key=primary_key, + ) + return resource + + +def get_max_lsn( + slot_name: str, + options: Dict[str, str], + credentials: ConnectionStringCredentials, +) -> Optional[int]: + """Returns maximum Log Sequence Number (LSN) in replication slot. + + Returns None if the replication slot is empty. + Does not consume the slot, i.e. messages are not flushed. + Raises error if the replication slot or publication does not exist. + """ + # comma-separated value string + options_str = ", ".join( + f"'{x}'" for xs in list(map(list, options.items())) for x in xs + ) + cur = _get_conn(credentials).cursor() + cur.execute( + "SELECT MAX(lsn) - '0/0' AS max_lsn " # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) + f"FROM pg_logical_slot_peek_binary_changes('{slot_name}', NULL, NULL, {options_str});" + ) + lsn: int = cur.fetchone()[0] + cur.connection.close() + return lsn + + +def get_pub_ops( + pub_name: str, + credentials: ConnectionStringCredentials, +) -> Dict[str, bool]: + """Returns dictionary of DML operations and their publish status.""" + cur = _get_conn(credentials).cursor() + cur.execute( + f""" + SELECT pubinsert, pubupdate, pubdelete, pubtruncate + FROM pg_publication WHERE pubname = '{pub_name}' + """ + ) + result = cur.fetchone() + cur.connection.close() + if result is None: + raise ValueError(f'Publication "{pub_name}" does not exist.') + return { + "insert": result[0], + "update": result[1], + "delete": result[2], + "truncate": result[3], + } + + +def lsn_int_to_hex(lsn: int) -> str: + """Convert integer LSN to postgres' hexadecimal representation.""" + # https://stackoverflow.com/questions/66797767/lsn-external-representation. + return f"{lsn >> 32 & 4294967295:X}/{lsn & 4294967295:08X}" + + +def advance_slot( + upto_lsn: int, + slot_name: str, + credentials: ConnectionStringCredentials, +) -> None: + """Advances position in the replication slot. + + Flushes all messages upto (and including) the message with LSN = `upto_lsn`. 
+ This function is used as alternative to psycopg2's `send_feedback` method, because + the behavior of that method seems odd when used outside of `consume_stream`. + """ + if upto_lsn != 0: + cur = _get_conn(credentials).cursor() + cur.execute( + f"SELECT * FROM pg_replication_slot_advance('{slot_name}', '{lsn_int_to_hex(upto_lsn)}');" + ) + cur.connection.close() + + +def _import_sql_table_resource() -> None: + """Imports external `sql_table` resource from `sql_database` source. + + Raises error if `sql_database` source is not available. + """ + global sql_table + try: + from ..sql_database import sql_table # type: ignore[import-untyped] + except Exception: + try: + from sql_database import sql_table + except ImportError as e: + from .exceptions import SqlDatabaseSourceImportError + + raise SqlDatabaseSourceImportError from e + + +def _get_conn( + credentials: ConnectionStringCredentials, + connection_factory: Optional[Any] = None, +) -> ConnectionExt: + """Returns a psycopg2 connection to interact with postgres.""" + return psycopg2.connect( # type: ignore[no-any-return] + database=credentials.database, + user=credentials.username, + password=credentials.password, + host=credentials.host, + port=credentials.port, + connection_factory=connection_factory, + **({} if credentials.query is None else credentials.query), + ) + + +def _get_rep_conn( + credentials: ConnectionStringCredentials, +) -> LogicalReplicationConnection: + """Returns a psycopg2 LogicalReplicationConnection to interact with postgres replication functionality. + + Raises error if the user does not have the REPLICATION attribute assigned. + """ + return _get_conn(credentials, LogicalReplicationConnection) # type: ignore[return-value] + + +def _make_qualified_table_name(table_name: str, schema_name: str) -> str: + """Escapes and combines a schema and table name.""" + return ( + escape_postgres_identifier(schema_name) + + "." + + escape_postgres_identifier(table_name) + ) + + +def _get_pk( + table_name: str, + schema_name: str, + credentials: ConnectionStringCredentials, +) -> Optional[TColumnNames]: + """Returns primary key column(s) for postgres table. + + Returns None if no primary key columns exist. + """ + qual_name = _make_qualified_table_name(table_name, schema_name) + cur = _get_conn(credentials).cursor() + # https://wiki.postgresql.org/wiki/Retrieve_primary_key_columns + cur.execute( + f""" + SELECT a.attname + FROM pg_index i + JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey) + WHERE i.indrelid = '{qual_name}'::regclass + AND i.indisprimary; + """ + ) + result = [tup[0] for tup in cur.fetchall()] + cur.connection.close() + if len(result) == 0: + return None + elif len(result) == 1: + return result[0] # type: ignore[no-any-return] + return result + + +@dataclass +class ItemGenerator: + credentials: ConnectionStringCredentials + slot_name: str + options: Dict[str, str] + upto_lsn: int + start_lsn: int = 0 + target_batch_size: int = 1000 + include_columns: Optional[Dict[str, Sequence[str]]] = (None,) # type: ignore[assignment] + columns: Optional[Dict[str, TTableSchemaColumns]] = (None,) # type: ignore[assignment] + last_commit_lsn: Optional[int] = field(default=None, init=False) + generated_all: bool = False + + def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: + """Yields replication messages from MessageConsumer. + + Starts replication of messages published by the publication from the replication slot. + Maintains LSN of last consumed Commit message in object state. 
+ Does not advance the slot. + """ + try: + cur = _get_rep_conn(self.credentials).cursor() + cur.start_replication( + slot_name=self.slot_name, + start_lsn=self.start_lsn, + decode=False, + options=self.options, + ) + consumer = MessageConsumer( + upto_lsn=self.upto_lsn, + pub_ops=get_pub_ops( + self.options["publication_names"], self.credentials + ), + target_batch_size=self.target_batch_size, + include_columns=self.include_columns, + columns=self.columns, + ) + cur.consume_stream(consumer) + except StopReplication: # completed batch or reached `upto_lsn` + pass + finally: + cur.connection.close() + self.last_commit_lsn = consumer.last_commit_lsn + for rel_id, data_items in consumer.data_items.items(): + table_name = consumer.last_table_schema[rel_id]["name"] + yield data_items[0] # meta item with column hints only, no data + yield dlt.mark.with_table_name(data_items[1:], table_name) + self.generated_all = consumer.consumed_all + + +class MessageConsumer: + """Consumes messages from a ReplicationCursor sequentially. + + Generates data item for each `insert`, `update`, and `delete` message. + Processes in batches to limit memory usage. + Maintains message data needed by subsequent messages in internal state. + """ + + def __init__( + self, + upto_lsn: int, + pub_ops: Dict[str, bool], + target_batch_size: int = 1000, + include_columns: Optional[Dict[str, Sequence[str]]] = None, + columns: Optional[Dict[str, TTableSchemaColumns]] = None, + ) -> None: + self.upto_lsn = upto_lsn + self.pub_ops = pub_ops + self.target_batch_size = target_batch_size + self.include_columns = include_columns + self.columns = columns + + self.consumed_all: bool = False + # data_items attribute maintains all data items + self.data_items: Dict[ + int, List[Union[TDataItem, DataItemWithMeta]] + ] = dict() # maps relation_id to list of data items + # other attributes only maintain last-seen values + self.last_table_schema: Dict[ + int, TTableSchema + ] = dict() # maps relation_id to table schema + self.last_commit_ts: pendulum.DateTime + self.last_commit_lsn = None + + def __call__(self, msg: ReplicationMessage) -> None: + """Processes message received from stream.""" + self.process_msg(msg) + + def process_msg(self, msg: ReplicationMessage) -> None: + """Processes encoded replication message. + + Identifies message type and decodes accordingly. + Message treatment is different for various message types. + Breaks out of stream with StopReplication exception when + - `upto_lsn` is reached + - `target_batch_size` is reached + - a table's schema has changed + """ + op = msg.payload[:1] + if op == b"I": + self.process_change(Insert(msg.payload), msg.data_start) + elif op == b"U": + self.process_change(Update(msg.payload), msg.data_start) + elif op == b"D": + self.process_change(Delete(msg.payload), msg.data_start) + elif op == b"B": + self.last_commit_ts = Begin(msg.payload).commit_ts # type: ignore[assignment] + elif op == b"C": + self.process_commit(msg) + elif op == b"R": + self.process_relation(Relation(msg.payload)) + elif op == b"T": + logger.warning( + "The truncate operation is currently not supported. " + "Truncate replication messages are ignored." + ) + + def process_commit(self, msg: ReplicationMessage) -> None: + """Updates object state when Commit message is observed. + + Raises StopReplication when `upto_lsn` or `target_batch_size` is reached. 
+ """ + self.last_commit_lsn = msg.data_start + if msg.data_start >= self.upto_lsn: + self.consumed_all = True + n_items = sum( + [len(items) for items in self.data_items.values()] + ) # combine items for all tables + if self.consumed_all or n_items >= self.target_batch_size: + raise StopReplication + + def process_relation(self, decoded_msg: Relation) -> None: + """Processes a replication message of type Relation. + + Stores table schema in object state. + Creates meta item to emit column hints while yielding data. + + Raises StopReplication when a table's schema changes. + """ + if ( + self.data_items.get(decoded_msg.relation_id) is not None + ): # table schema change + raise StopReplication + # get table schema information from source and store in object state + table_name = decoded_msg.relation_name + columns: TTableSchemaColumns = { + c.name: _to_dlt_column_schema(c) for c in decoded_msg.columns + } + self.last_table_schema[decoded_msg.relation_id] = { + "name": table_name, + "columns": columns, + } + + # apply user input + # 1) exclude columns + include_columns = ( + None + if self.include_columns is None + else self.include_columns.get(table_name) + ) + if include_columns is not None: + columns = {k: v for k, v in columns.items() if k in include_columns} + # 2) override source hints + column_hints: TTableSchemaColumns = ( + dict() if self.columns is None else self.columns.get(table_name, dict()) + ) + for column_name, column_val in column_hints.items(): + columns[column_name] = merge_column(columns[column_name], column_val) + + # add hints for replication columns + columns["lsn"] = {"data_type": "bigint", "nullable": True} + if self.pub_ops["update"] or self.pub_ops["delete"]: + columns["lsn"]["dedup_sort"] = "desc" + if self.pub_ops["delete"]: + columns["deleted_ts"] = { + "hard_delete": True, + "data_type": "timestamp", + "nullable": True, + } + + # determine write disposition + write_disposition: TWriteDisposition = "append" + if self.pub_ops["update"] or self.pub_ops["delete"]: + write_disposition = "merge" + + # include meta item to emit hints while yielding data + meta_item = dlt.mark.with_hints( + [], + dlt.mark.make_hints( + table_name=table_name, + write_disposition=write_disposition, + columns=columns, + ), + create_table_variant=True, + ) + self.data_items[decoded_msg.relation_id] = [meta_item] + + def process_change( + self, decoded_msg: Union[Insert, Update, Delete], msg_start_lsn: int + ) -> None: + """Processes replication message of type Insert, Update, or Delete. + + Adds data item for inserted/updated/deleted record to instance attribute. 
+ """ + if isinstance(decoded_msg, (Insert, Update)): + column_data = decoded_msg.new_tuple.column_data + elif isinstance(decoded_msg, Delete): + column_data = decoded_msg.old_tuple.column_data + table_name = self.last_table_schema[decoded_msg.relation_id]["name"] + data_item = self.gen_data_item( + data=column_data, + column_schema=self.last_table_schema[decoded_msg.relation_id]["columns"], + lsn=msg_start_lsn, + commit_ts=self.last_commit_ts, + for_delete=isinstance(decoded_msg, Delete), + include_columns=( + None + if self.include_columns is None + else self.include_columns.get(table_name) + ), + ) + self.data_items[decoded_msg.relation_id].append(data_item) + + @staticmethod + def gen_data_item( + data: List[ColumnData], + column_schema: TTableSchemaColumns, + lsn: int, + commit_ts: pendulum.DateTime, + for_delete: bool, + include_columns: Optional[Sequence[str]] = None, + ) -> TDataItem: + """Generates data item from replication message data and corresponding metadata.""" + data_item = { + schema["name"]: _to_dlt_val( + val=data.col_data, + data_type=schema["data_type"], + byte1=data.col_data_category, + for_delete=for_delete, + ) + for (schema, data) in zip(column_schema.values(), data) + if (True if include_columns is None else schema["name"] in include_columns) + } + data_item["lsn"] = lsn + if for_delete: + data_item["deleted_ts"] = commit_ts + return data_item diff --git a/sources/pg_legacy_replication/pg_logicaldec_pb2.py b/sources/pg_legacy_replication/pg_logicaldec_pb2.py new file mode 100644 index 000000000..08fa960a1 --- /dev/null +++ b/sources/pg_legacy_replication/pg_logicaldec_pb2.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: pg_logicaldec.proto +# Protobuf Python Version: 5.26.1 +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import builder as _builder + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x13pg_logicaldec.proto\x12\x0b\x64\x65\x63oderbufs"\x1d\n\x05Point\x12\t\n\x01x\x18\x01 \x02(\x01\x12\t\n\x01y\x18\x02 \x02(\x01"\xa7\x02\n\x0c\x44\x61tumMessage\x12\x13\n\x0b\x63olumn_name\x18\x01 \x01(\t\x12\x13\n\x0b\x63olumn_type\x18\x02 \x01(\x03\x12\x15\n\x0b\x64\x61tum_int32\x18\x03 \x01(\x05H\x00\x12\x15\n\x0b\x64\x61tum_int64\x18\x04 \x01(\x03H\x00\x12\x15\n\x0b\x64\x61tum_float\x18\x05 \x01(\x02H\x00\x12\x16\n\x0c\x64\x61tum_double\x18\x06 \x01(\x01H\x00\x12\x14\n\ndatum_bool\x18\x07 \x01(\x08H\x00\x12\x16\n\x0c\x64\x61tum_string\x18\x08 \x01(\tH\x00\x12\x15\n\x0b\x64\x61tum_bytes\x18\t \x01(\x0cH\x00\x12)\n\x0b\x64\x61tum_point\x18\n \x01(\x0b\x32\x12.decoderbufs.PointH\x00\x12\x17\n\rdatum_missing\x18\x0b \x01(\x08H\x00\x42\x07\n\x05\x64\x61tum"4\n\x08TypeInfo\x12\x10\n\x08modifier\x18\x01 \x02(\t\x12\x16\n\x0evalue_optional\x18\x02 \x02(\x08"\xee\x01\n\nRowMessage\x12\x16\n\x0etransaction_id\x18\x01 \x01(\r\x12\x13\n\x0b\x63ommit_time\x18\x02 \x01(\x04\x12\r\n\x05table\x18\x03 \x01(\t\x12\x1b\n\x02op\x18\x04 \x01(\x0e\x32\x0f.decoderbufs.Op\x12,\n\tnew_tuple\x18\x05 \x03(\x0b\x32\x19.decoderbufs.DatumMessage\x12,\n\told_tuple\x18\x06 \x03(\x0b\x32\x19.decoderbufs.DatumMessage\x12+\n\x0cnew_typeinfo\x18\x07 
\x03(\x0b\x32\x15.decoderbufs.TypeInfo*U\n\x02Op\x12\x14\n\x07UNKNOWN\x10\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01\x12\n\n\x06INSERT\x10\x00\x12\n\n\x06UPDATE\x10\x01\x12\n\n\x06\x44\x45LETE\x10\x02\x12\t\n\x05\x42\x45GIN\x10\x03\x12\n\n\x06\x43OMMIT\x10\x04\x42\x33\n&io.debezium.connector.postgresql.protoB\x07PgProtoH\x01' +) + +_globals = globals() +_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) +_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "pg_logicaldec_pb2", _globals) +if not _descriptor._USE_C_DESCRIPTORS: + _globals["DESCRIPTOR"]._loaded_options = None + _globals[ + "DESCRIPTOR" + ]._serialized_options = ( + b"\n&io.debezium.connector.postgresql.protoB\007PgProtoH\001" + ) + _globals["_OP"]._serialized_start = 660 + _globals["_OP"]._serialized_end = 745 + _globals["_POINT"]._serialized_start = 36 + _globals["_POINT"]._serialized_end = 65 + _globals["_DATUMMESSAGE"]._serialized_start = 68 + _globals["_DATUMMESSAGE"]._serialized_end = 363 + _globals["_TYPEINFO"]._serialized_start = 365 + _globals["_TYPEINFO"]._serialized_end = 417 + _globals["_ROWMESSAGE"]._serialized_start = 420 + _globals["_ROWMESSAGE"]._serialized_end = 658 +# @@protoc_insertion_point(module_scope) diff --git a/sources/pg_legacy_replication/requirements.txt b/sources/pg_legacy_replication/requirements.txt new file mode 100644 index 000000000..f2c2be351 --- /dev/null +++ b/sources/pg_legacy_replication/requirements.txt @@ -0,0 +1,3 @@ +dlt>=0.4.13 +psycopg2-binary>=2.9.9 +protobuf>=5 \ No newline at end of file diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py new file mode 100644 index 000000000..ea1e3c057 --- /dev/null +++ b/sources/pg_legacy_replication/schema_types.py @@ -0,0 +1,125 @@ +from functools import lru_cache +import json +from typing import Optional, Any, Dict + +from dlt.common import Decimal +from dlt.common.data_types.typing import TDataType +from dlt.common.data_types.type_helpers import coerce_value +from dlt.common.schema.typing import TColumnSchema, TColumnType + +from .decoders import ColumnType + + +_DUMMY_VALS: Dict[TDataType, Any] = { + "bigint": 0, + "binary": b" ", + "bool": True, + "complex": [0], + "date": "2000-01-01", + "decimal": Decimal(0), + "double": 0.0, + "text": "", + "time": "00:00:00", + "timestamp": "2000-01-01T00:00:00", + "wei": 0, +} +"""Dummy values used to replace NULLs in NOT NULL columns in key-only delete records.""" + + +_PG_TYPES: Dict[int, str] = { + 16: "boolean", + 17: "bytea", + 20: "bigint", + 21: "smallint", + 23: "integer", + 701: "double precision", + 1043: "character varying", + 1082: "date", + 1083: "time without time zone", + 1184: "timestamp with time zone", + 1700: "numeric", + 3802: "jsonb", +} +"""Maps postgres type OID to type string. 
Only includes types present in PostgresTypeMapper.""" + + +def _get_precision(type_id: int, atttypmod: int) -> Optional[int]: + """Get precision from postgres type attributes.""" + # https://stackoverflow.com/a/3351120 + if type_id == 21: # smallint + return 16 + elif type_id == 23: # integer + return 32 + elif type_id == 20: # bigint + return 64 + if atttypmod != -1: + if type_id == 1700: # numeric + return ((atttypmod - 4) >> 16) & 65535 + elif type_id in ( + 1083, + 1184, + ): # time without time zone, timestamp with time zone + return atttypmod + elif type_id == 1043: # character varying + return atttypmod - 4 + return None + + +def _get_scale(type_id: int, atttypmod: int) -> Optional[int]: + """Get scale from postgres type attributes.""" + # https://stackoverflow.com/a/3351120 + if atttypmod != -1: + if type_id in (21, 23, 20): # smallint, integer, bigint + return 0 + if type_id == 1700: # numeric + return (atttypmod - 4) & 65535 + return None + + +@lru_cache(maxsize=None) +def _type_mapper() -> Any: + from dlt.destinations import postgres + from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper + + return PostgresTypeMapper(postgres().capabilities()) + + +def _to_dlt_column_type(type_id: int, atttypmod: int) -> TColumnType: + """Converts postgres type OID to dlt column type. + + Type OIDs not in _PG_TYPES mapping default to "text" type. + """ + pg_type = _PG_TYPES.get(type_id) + precision = _get_precision(type_id, atttypmod) + scale = _get_scale(type_id, atttypmod) + return _type_mapper().from_db_type(pg_type, precision, scale) # type: ignore[no-any-return] + + +def _to_dlt_column_schema(col: ColumnType) -> TColumnSchema: + """Converts pypgoutput ColumnType to dlt column schema.""" + dlt_column_type = _to_dlt_column_type(col.type_id, col.atttypmod) + partial_column_schema = { + "name": col.name, + "primary_key": bool(col.part_of_pkey), + } + return {**dlt_column_type, **partial_column_schema} # type: ignore[typeddict-item] + + +def _to_dlt_val(val: str, data_type: TDataType, byte1: str, for_delete: bool) -> Any: + """Converts pgoutput's text-formatted value into dlt-compatible data value.""" + if byte1 == "n": + if for_delete: + # replace None with dummy value to prevent NOT NULL violations in staging table + return _DUMMY_VALS[data_type] + return None + elif byte1 == "t": + if data_type == "binary": + # https://www.postgresql.org/docs/current/datatype-binary.html#DATATYPE-BINARY-BYTEA-HEX-FORMAT + return bytes.fromhex(val.replace("\\x", "")) + elif data_type == "complex": + return json.loads(val) + return coerce_value(data_type, "text", val) + else: + raise ValueError( + f"Byte1 in replication message must be 'n' or 't', not '{byte1}'." + ) diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py new file mode 100644 index 000000000..f3c428cce --- /dev/null +++ b/sources/pg_legacy_replication_pipeline.py @@ -0,0 +1,292 @@ +import dlt + +from dlt.common.destination import Destination +from dlt.destinations.impl.postgres.configuration import PostgresCredentials + +from pg_legacy_replication import replication_resource +from pg_legacy_replication.helpers import init_replication + + +PG_CREDS = dlt.secrets.get("sources.pg_replication.credentials", PostgresCredentials) + + +def replicate_single_table() -> None: + """Sets up replication for a single Postgres table and loads changes into a destination. + + Demonstrates basic usage of `init_replication` helper and `replication_resource` resource. 
+ Uses `src_pl` to create and change the replicated Postgres table—this + is only for demonstration purposes, you won't need this when you run in production + as you'll probably have another process feeding your Postgres instance. + """ + # create source and destination pipelines + src_pl = get_postgres_pipeline() + dest_pl = dlt.pipeline( + pipeline_name="pg_replication_pipeline", + destination="duckdb", + dataset_name="replicate_single_table", + full_refresh=True, + ) + + # create table "my_source_table" in source to demonstrate replication + create_source_table( + src_pl, "CREATE TABLE {table_name} (id integer PRIMARY KEY, val bool);" + ) + + # initialize replication for the source table—this creates a replication slot and publication + slot_name = "example_slot" + pub_name = "example_pub" + init_replication( # requires the Postgres user to have the REPLICATION attribute assigned + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names="my_source_table", + reset=True, + ) + + # create a resource that generates items for each change in the source table + changes = replication_resource(slot_name, pub_name) + + # insert two records in source table and propagate changes to destination + change_source_table( + src_pl, "INSERT INTO {table_name} VALUES (1, true), (2, false);" + ) + dest_pl.run(changes) + show_destination_table(dest_pl) + + # update record in source table and propagate change to destination + change_source_table(src_pl, "UPDATE {table_name} SET val = true WHERE id = 2;") + dest_pl.run(changes) + show_destination_table(dest_pl) + + # delete record from source table and propagate change to destination + change_source_table(src_pl, "DELETE FROM {table_name} WHERE id = 2;") + dest_pl.run(changes) + show_destination_table(dest_pl) + + +def replicate_with_initial_load() -> None: + """Sets up replication with initial load. + + Demonstrates usage of `persist_snapshots` argument and snapshot resource + returned by `init_replication` helper. 
+ """ + # create source and destination pipelines + src_pl = get_postgres_pipeline() + dest_pl = dlt.pipeline( + pipeline_name="pg_replication_pipeline", + destination="duckdb", + dataset_name="replicate_with_initial_load", + full_refresh=True, + ) + + # create table "my_source_table" in source to demonstrate replication + create_source_table( + src_pl, "CREATE TABLE {table_name} (id integer PRIMARY KEY, val bool);" + ) + + # insert records before initializing replication + change_source_table( + src_pl, "INSERT INTO {table_name} VALUES (1, true), (2, false);" + ) + + # initialize replication for the source table + slot_name = "example_slot" + pub_name = "example_pub" + snapshot = init_replication( # requires the Postgres user to have the REPLICATION attribute assigned + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names="my_source_table", + persist_snapshots=True, # persist snapshot table(s) and let function return resource(s) for initial load + reset=True, + ) + + # perform initial load to capture all records present in source table prior to replication initialization + dest_pl.run(snapshot) + show_destination_table(dest_pl) + + # insert record in source table and propagate change to destination + change_source_table(src_pl, "INSERT INTO {table_name} VALUES (3, true);") + changes = replication_resource(slot_name, pub_name) + dest_pl.run(changes) + show_destination_table(dest_pl) + + +def replicate_entire_schema() -> None: + """Demonstrates setup and usage of schema replication. + + Schema replication requires a Postgres server version of 15 or higher. An + exception is raised if that's not the case. + """ + # create source and destination pipelines + src_pl = get_postgres_pipeline() + dest_pl = dlt.pipeline( + pipeline_name="pg_replication_pipeline", + destination="duckdb", + dataset_name="replicate_entire_schema", + full_refresh=True, + ) + + # create two source tables to demonstrate schema replication + create_source_table( + src_pl, + "CREATE TABLE {table_name} (id integer PRIMARY KEY, val bool);", + "tbl_x", + ) + create_source_table( + src_pl, + "CREATE TABLE {table_name} (id integer PRIMARY KEY, val varchar);", + "tbl_y", + ) + + # initialize schema replication by omitting the `table_names` argument + slot_name = "example_slot" + pub_name = "example_pub" + init_replication( # initializing schema replication requires the Postgres user to be a superuser + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + reset=True, + ) + + # create a resource that generates items for each change in the schema's tables + changes = replication_resource(slot_name, pub_name) + + # insert records in source tables and propagate changes to destination + change_source_table( + src_pl, "INSERT INTO {table_name} VALUES (1, true), (2, false);", "tbl_x" + ) + change_source_table(src_pl, "INSERT INTO {table_name} VALUES (1, 'foo');", "tbl_y") + dest_pl.run(changes) + show_destination_table(dest_pl, "tbl_x") + show_destination_table(dest_pl, "tbl_y") + + # tables added to the schema later are also included in the replication + create_source_table( + src_pl, "CREATE TABLE {table_name} (id integer PRIMARY KEY, val date);", "tbl_z" + ) + change_source_table( + src_pl, "INSERT INTO {table_name} VALUES (1, '2023-03-18');", "tbl_z" + ) + dest_pl.run(changes) + show_destination_table(dest_pl, "tbl_z") + + +def replicate_with_column_selection() -> None: + """Sets up replication with column selection. + + Demonstrates usage of `include_columns` argument. 
+ """ + # create source and destination pipelines + src_pl = get_postgres_pipeline() + dest_pl = dlt.pipeline( + pipeline_name="pg_replication_pipeline", + destination="duckdb", + dataset_name="replicate_with_column_selection", + full_refresh=True, + ) + + # create two source tables to demonstrate schema replication + create_source_table( + src_pl, + "CREATE TABLE {table_name} (c1 integer PRIMARY KEY, c2 bool, c3 varchar);", + "tbl_x", + ) + create_source_table( + src_pl, + "CREATE TABLE {table_name} (c1 integer PRIMARY KEY, c2 bool, c3 varchar);", + "tbl_y", + ) + + # initialize schema replication by omitting the `table_names` argument + slot_name = "example_slot" + pub_name = "example_pub" + init_replication( # requires the Postgres user to have the REPLICATION attribute assigned + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + reset=True, + ) + + # create a resource that generates items for each change in the schema's tables + changes = replication_resource( + slot_name=slot_name, + pub_name=pub_name, + include_columns={ + "tbl_x": ("c1", "c2") + }, # columns not specified here are excluded from generated data items + ) + + # insert records in source tables and propagate changes to destination + change_source_table( + src_pl, "INSERT INTO {table_name} VALUES (1, true, 'foo');", "tbl_x" + ) + change_source_table( + src_pl, "INSERT INTO {table_name} VALUES (1, false, 'bar');", "tbl_y" + ) + dest_pl.run(changes) + + # show columns in schema for both tables + # column c3 is not in the schema for tbl_x because we did not include it + # tbl_y does have column c3 because we didn't specify include columns for this table and by default all columns are included + print("tbl_x", ":", list(dest_pl.default_schema.get_table_columns("tbl_x").keys())) + print("tbl_y", ":", list(dest_pl.default_schema.get_table_columns("tbl_y").keys())) + + +# define some helper methods to make examples more readable + + +def get_postgres_pipeline() -> dlt.Pipeline: + """Returns a pipeline loading into `postgres` destination. + + Uses workaround to fix destination to `postgres`, so it does not get replaced + during `dlt init`. 
+ """ + # this trick prevents dlt init command from replacing "destination" argument to "pipeline" + p_call = dlt.pipeline + pipe = p_call( + pipeline_name="source_pipeline", + destination=Destination.from_reference("postgres", credentials=PG_CREDS), + dataset_name="source_dataset", + full_refresh=True, + ) + return pipe + + +def create_source_table( + src_pl: dlt.Pipeline, sql: str, table_name: str = "my_source_table" +) -> None: + with src_pl.sql_client() as c: + try: + c.create_dataset() + except dlt.destinations.exceptions.DatabaseTerminalException: + pass + qual_name = c.make_qualified_table_name(table_name) + c.execute_sql(sql.format(table_name=qual_name)) + + +def change_source_table( + src_pl: dlt.Pipeline, sql: str, table_name: str = "my_source_table" +) -> None: + with src_pl.sql_client() as c: + qual_name = c.make_qualified_table_name(table_name) + c.execute_sql(sql.format(table_name=qual_name)) + + +def show_destination_table( + dest_pl: dlt.Pipeline, + table_name: str = "my_source_table", + column_names: str = "id, val", +) -> None: + with dest_pl.sql_client() as c: + dest_qual_name = c.make_qualified_table_name(table_name) + dest_records = c.execute_sql(f"SELECT {column_names} FROM {dest_qual_name};") + print(table_name, ":", dest_records) + + +if __name__ == "__main__": + replicate_single_table() + # replicate_with_initial_load() + # replicate_entire_schema() + # replicate_with_column_selection() diff --git a/tests/pg_legacy_replication/__init__.py b/tests/pg_legacy_replication/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py new file mode 100644 index 000000000..a17efcad7 --- /dev/null +++ b/tests/pg_legacy_replication/cases.py @@ -0,0 +1,94 @@ +from typing import List + +from dlt.common import Decimal +from dlt.common.schema import TColumnSchema, TTableSchemaColumns + + +TABLE_ROW_ALL_DATA_TYPES = { + "col1": 989127831, + "col2": 898912.821982, + "col3": True, + "col4": "2022-05-23T13:26:45.176451+00:00", + "col5": "string data \n \r \x8e 🦆", + "col6": Decimal("2323.34"), + "col7": b"binary data \n \r \x8e", + # "col8": 2**56 + 92093890840, # TODO: uncommment and make it work + "col9": { + "complex": [1, 2, 3, "a"], + "link": ( + "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6" + " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" + ), + }, + "col10": "2023-02-27", + "col11": "13:26:45.176451", + "col1_null": None, + "col2_null": None, + "col3_null": None, + "col4_null": None, + "col5_null": None, + "col6_null": None, + "col7_null": None, + # "col8_null": None, + "col9_null": None, + "col10_null": None, + "col11_null": None, + "col1_precision": 22324, + "col4_precision": "2022-05-23T13:26:46.167231+00:00", + "col5_precision": "string data 2 \n \r \x8e 🦆", + "col6_precision": Decimal("2323.34"), + "col7_precision": b"binary data 2 \n \r \x8e", + "col11_precision": "13:26:45.176451", +} +TABLE_UPDATE: List[TColumnSchema] = [ + {"name": "col1", "data_type": "bigint", "nullable": False}, + {"name": "col2", "data_type": "double", "nullable": False}, + {"name": "col3", "data_type": "bool", "nullable": False}, + {"name": "col4", "data_type": "timestamp", "nullable": False}, + {"name": "col5", "data_type": "text", "nullable": False}, + {"name": "col6", "data_type": "decimal", "nullable": False}, + {"name": "col7", "data_type": "binary", "nullable": False}, + # {"name": "col8", "data_type": "wei", "nullable": False}, + {"name": "col9", "data_type": "complex", "nullable": 
False, "variant": True}, + {"name": "col10", "data_type": "date", "nullable": False}, + {"name": "col11", "data_type": "time", "nullable": False}, + {"name": "col1_null", "data_type": "bigint", "nullable": True}, + {"name": "col2_null", "data_type": "double", "nullable": True}, + {"name": "col3_null", "data_type": "bool", "nullable": True}, + {"name": "col4_null", "data_type": "timestamp", "nullable": True}, + {"name": "col5_null", "data_type": "text", "nullable": True}, + {"name": "col6_null", "data_type": "decimal", "nullable": True}, + {"name": "col7_null", "data_type": "binary", "nullable": True}, + # {"name": "col8_null", "data_type": "wei", "nullable": True}, + {"name": "col9_null", "data_type": "complex", "nullable": True, "variant": True}, + {"name": "col10_null", "data_type": "date", "nullable": True}, + {"name": "col11_null", "data_type": "time", "nullable": True}, + { + "name": "col1_precision", + "data_type": "bigint", + "precision": 16, + "nullable": False, + }, + { + "name": "col4_precision", + "data_type": "timestamp", + "precision": 3, + "nullable": False, + }, + {"name": "col5_precision", "data_type": "text", "precision": 25, "nullable": False}, + { + "name": "col6_precision", + "data_type": "decimal", + "precision": 6, + "scale": 2, + "nullable": False, + }, + { + "name": "col7_precision", + "data_type": "binary", + "precision": 19, + "nullable": False, + }, + {"name": "col11_precision", "data_type": "time", "precision": 3, "nullable": False}, +] +TABLE_UPDATE_COLUMNS_SCHEMA: TTableSchemaColumns = {t["name"]: t for t in TABLE_UPDATE} diff --git a/tests/pg_legacy_replication/conftest.py b/tests/pg_legacy_replication/conftest.py new file mode 100644 index 000000000..4bfe6f5bd --- /dev/null +++ b/tests/pg_legacy_replication/conftest.py @@ -0,0 +1,44 @@ +import pytest + +from typing import Iterator, Tuple + +import dlt +from dlt.common.utils import uniq_id + + +@pytest.fixture() +def src_config() -> Iterator[Tuple[dlt.Pipeline, str, str]]: + # random slot and pub to enable parallel runs + slot = "test_slot_" + uniq_id(4) + pub = "test_pub" + uniq_id(4) + # setup + src_pl = dlt.pipeline( + pipeline_name="src_pl", + destination=dlt.destinations.postgres( + credentials=dlt.secrets.get("sources.pg_replication.credentials") + ), + dev_mode=True, + ) + yield src_pl, slot, pub + # teardown + with src_pl.sql_client() as c: + # drop tables + try: + c.drop_dataset() + except Exception as e: + print(e) + with c.with_staging_dataset(): + try: + c.drop_dataset() + except Exception as e: + print(e) + # drop replication slot + try: + c.execute_sql(f"SELECT pg_drop_replication_slot('{slot}');") + except Exception as e: + print(e) + # drop publication + try: + c.execute_sql(f"DROP PUBLICATION IF EXISTS {pub};") + except Exception as e: + print(e) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py new file mode 100644 index 000000000..e528e6757 --- /dev/null +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -0,0 +1,868 @@ +import pytest + +from typing import Set, Tuple +from copy import deepcopy +from psycopg2.errors import InsufficientPrivilege + +import dlt +from dlt.destinations.job_client_impl import SqlJobClientBase + +from tests.utils import ( + ALL_DESTINATIONS, + assert_load_info, + load_table_counts, + get_table_metrics, +) +from sources.pg_legacy_replication import replication_resource +from sources.pg_legacy_replication.helpers import init_replication, get_pg_version +from 
sources.pg_legacy_replication.exceptions import ( + IncompatiblePostgresVersionException, +) + +from .cases import TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE_COLUMNS_SCHEMA +from .utils import add_pk, assert_loaded_data, is_super_user + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +def test_core_functionality( + src_config: Tuple[dlt.Pipeline, str, str], destination_name: str +) -> None: + @dlt.resource(write_disposition="merge", primary_key="id_x") + def tbl_x(data): + yield data + + @dlt.resource(write_disposition="merge", primary_key="id_y") + def tbl_y(data): + yield data + + src_pl, slot_name, pub_name = src_config + + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo"}), + tbl_y({"id_y": 1, "val_y": True}), + ] + ) + add_pk(src_pl.sql_client, "tbl_x", "id_x") + add_pk(src_pl.sql_client, "tbl_y", "id_y") + + snapshots = init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + persist_snapshots=True, + ) + + changes = replication_resource(slot_name, pub_name) + + src_pl.run( + [ + tbl_x([{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}]), + tbl_y({"id_y": 2, "val_y": False}), + ] + ) + + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, full_refresh=True + ) + + # initial load + info = dest_pl.run(snapshots) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} + exp_tbl_x = [{"id_x": 1, "val_x": "foo"}] + exp_tbl_y = [{"id_y": 1, "val_y": True}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} + exp_tbl_x = [ + {"id_x": 1, "val_x": "foo"}, + {"id_x": 2, "val_x": "bar"}, + {"id_x": 3, "val_x": "baz"}, + ] + exp_tbl_y = [{"id_y": 1, "val_y": True}, {"id_y": 2, "val_y": False}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # change single table + src_pl.run(tbl_y({"id_y": 3, "val_y": True})) + + # process changes + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} + exp_tbl_y = [ + {"id_y": 1, "val_y": True}, + {"id_y": 2, "val_y": False}, + {"id_y": 3, "val_y": True}, + ] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # update tables + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + c.execute_sql(f"UPDATE {qual_name} SET val_x = 'foo_updated' WHERE id_x = 1;") + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_y") + c.execute_sql(f"UPDATE {qual_name} SET val_y = false WHERE id_y = 1;") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} + exp_tbl_x = [ + {"id_x": 1, "val_x": "foo_updated"}, + {"id_x": 2, "val_x": "bar"}, + {"id_x": 3, "val_x": "baz"}, + ] + exp_tbl_y = [ + {"id_y": 1, "val_y": False}, + {"id_y": 2, "val_y": False}, + {"id_y": 3, "val_y": True}, + ] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, 
"tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # delete from table + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 1;") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 3} + exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] + exp_tbl_y = [ + {"id_y": 1, "val_y": False}, + {"id_y": 2, "val_y": False}, + {"id_y": 3, "val_y": True}, + ] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +def test_without_init_load( + src_config: Tuple[dlt.Pipeline, str, str], destination_name: str +) -> None: + @dlt.resource(write_disposition="merge", primary_key="id_x") + def tbl_x(data): + yield data + + @dlt.resource(write_disposition="merge", primary_key="id_y") + def tbl_y(data): + yield data + + src_pl, slot_name, pub_name = src_config + + # create postgres table + # since we're skipping initial load, these records should not be in the replicated table + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo"}), + tbl_y({"id_y": 1, "val_y": True}), + ] + ) + add_pk(src_pl.sql_client, "tbl_x", "id_x") + add_pk(src_pl.sql_client, "tbl_y", "id_y") + + # initialize replication and create resource for changes + init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + ) + changes = replication_resource(slot_name, pub_name) + + # change postgres table after replication has been initialized + # these records should be in the replicated table + src_pl.run( + [ + tbl_x([{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}]), + tbl_y({"id_y": 2, "val_y": False}), + ] + ) + + # load changes to destination and assert expectations + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, full_refresh=True + ) + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 1} + exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] + exp_tbl_y = [{"id_y": 2, "val_y": False}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # delete from table + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 2;") + + # process change and assert expectations + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} + exp_tbl_x = [{"id_x": 3, "val_x": "baz"}] + exp_tbl_y = [{"id_y": 2, "val_y": False}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + +def test_insert_only(src_config: Tuple[dlt.Pipeline, str, str]) -> None: + def items(data): + yield data + + src_pl, slot_name, pub_name = src_config + + # create postgres table with single record + src_pl.run(items({"id": 1, "foo": "bar"})) + + # initialize replication and create resource for changes + init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + 
table_names="items", + publish="insert", + ) + changes = replication_resource(slot_name, pub_name) + + # insert a record in postgres table + src_pl.run(items({"id": 2, "foo": "bar"})) + + # extract items from resource + dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) + extract_info = dest_pl.extract(changes) + assert get_table_metrics(extract_info, "items")["items_count"] == 1 + + # do an update and a delete—these operations should not lead to items in the resource + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("items") + c.execute_sql(f"UPDATE {qual_name} SET foo = 'baz' WHERE id = 2;") + c.execute_sql(f"DELETE FROM {qual_name} WHERE id = 2;") + extract_info = dest_pl.extract(changes) + assert ( + get_table_metrics(extract_info, "items") is None + ) # there should be no metrics for the "items" table + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("give_hints", [True, False]) +@pytest.mark.parametrize("init_load", [True, False]) +def test_mapped_data_types( + src_config: Tuple[dlt.Pipeline, str, str], + destination_name: str, + give_hints: bool, + init_load: bool, +) -> None: + """Assert common data types (the ones mapped in PostgresTypeMapper) are properly handled.""" + + data = deepcopy(TABLE_ROW_ALL_DATA_TYPES) + column_schema = deepcopy(TABLE_UPDATE_COLUMNS_SCHEMA) + + # resource to load data into postgres source table + @dlt.resource(primary_key="col1", write_disposition="merge", columns=column_schema) + def items(data): + yield data + + src_pl, slot_name, pub_name = src_config + + # create postgres table with single record containing all data types + src_pl.run(items(data)) + add_pk(src_pl.sql_client, "items", "col1") + + # initialize replication and create resources + snapshot = init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names="items", + persist_snapshots=init_load, + columns={"items": column_schema} if give_hints else None, + ) + + changes = replication_resource( + slot_name=slot_name, + pub_name=pub_name, + columns={"items": column_schema} if give_hints else None, + ) + + # initial load + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, full_refresh=True + ) + if init_load: + info = dest_pl.run(snapshot) + assert_load_info(info) + assert load_table_counts(dest_pl, "items")["items"] == 1 + + # insert two records in postgres table + r1 = deepcopy(data) + r2 = deepcopy(data) + r1["col1"] = 1 + r2["col1"] = 2 + src_pl.run(items([r1, r2])) + + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 + + if give_hints: + # compare observed with expected column types + observed = dest_pl.default_schema.get_table("items")["columns"] + for name, expected in column_schema.items(): + assert observed[name]["data_type"] == expected["data_type"] + # postgres bytea does not have precision + if ( + expected.get("precision") is not None + and expected["data_type"] != "binary" + ): + assert observed[name]["precision"] == expected["precision"] + + # update two records in postgres table + # this does two deletes and two inserts because dlt implements "merge" as "delete-and-insert" + # as such, postgres will create four replication messages: two of type Delete and two of type Insert + r1["col2"] = 1.5 + r2["col3"] = False + src_pl.run(items([r1, r2])) + + # process changes and assert expectations + info = dest_pl.run(changes) 
+ assert_load_info(info) + assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 + exp = [ + {"col1": 1, "col2": 1.5, "col3": True}, + {"col1": 2, "col2": 898912.821982, "col3": False}, + { + "col1": 989127831, + "col2": 898912.821982, + "col3": True, + }, # only present with init load + ] + if not init_load: + del exp[-1] + assert_loaded_data(dest_pl, "items", ["col1", "col2", "col3"], exp, "col1") + + # now do an actual update, so postgres will create a replication message of type Update + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("items") + c.execute_sql(f"UPDATE {qual_name} SET col2 = 2.5 WHERE col1 = 2;") + + # process change and assert expectation + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 + exp = [{"col1": 2, "col2": 2.5, "col3": False}] + assert_loaded_data( + dest_pl, "items", ["col1", "col2", "col3"], exp, "col1", "col1 = 2" + ) + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +def test_unmapped_data_types( + src_config: Tuple[dlt.Pipeline, str, str], destination_name: str +) -> None: + """Assert postgres data types that aren't explicitly mapped default to "text" type.""" + src_pl, slot_name, pub_name = src_config + + # create postgres table with some unmapped types + with src_pl.sql_client() as c: + c.create_dataset() + c.execute_sql( + "CREATE TABLE data_types (bit_col bit(1), box_col box, uuid_col uuid);" + ) + + # initialize replication and create resource + init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names="data_types", + publish="insert", + ) + changes = replication_resource(slot_name, pub_name) + + # insert record in source table to create replication item + with src_pl.sql_client() as c: + c.execute_sql( + "INSERT INTO data_types VALUES (B'1', box '((1,1), (0,0))', gen_random_uuid());" + ) + + # run destination pipeline and assert resulting data types + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, full_refresh=True + ) + dest_pl.extract(changes) + dest_pl.normalize() + columns = dest_pl.default_schema.get_table_columns("data_types") + assert columns["bit_col"]["data_type"] == "text" + assert columns["box_col"]["data_type"] == "text" + assert columns["uuid_col"]["data_type"] == "text" + + +@pytest.mark.parametrize("publish", ["insert", "insert, update, delete"]) +def test_write_disposition( + src_config: Tuple[dlt.Pipeline, str, str], publish: str +) -> None: + @dlt.resource + def items(data): + yield data + + src_pl, slot_name, pub_name = src_config + + # create postgres table + src_pl.run(items({"id": 1, "val": True})) + + # create resources + snapshot = init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names="items", + publish=publish, + persist_snapshots=True, + ) + + # assert write disposition on snapshot resource + expected_write_disposition = "append" if publish == "insert" else "merge" + assert snapshot.write_disposition == expected_write_disposition + + # assert write disposition on tables dispatched by changes resource + changes = replication_resource(slot_name, pub_name) + src_pl.run(items({"id": 2, "val": True})) + dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) + dest_pl.extract(changes) + assert ( + dest_pl.default_schema.get_table("items")["write_disposition"] + == expected_write_disposition + ) + + 
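+# Note: `test_write_disposition` above asserts the mapping
+# `"append" if publish == "insert" else "merge"`. An insert-only publication
+# can simply be appended at the destination, while a publication that also
+# emits updates and deletes needs merge semantics so those changes can be
+# applied to existing rows.
+
+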
+@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("init_load", [True, False]) +def test_include_columns( + src_config: Tuple[dlt.Pipeline, str, str], destination_name: str, init_load: bool +) -> None: + def get_cols(pipeline: dlt.Pipeline, table_name: str) -> set: + with pipeline.destination_client(pipeline.default_schema_name) as client: + client: SqlJobClientBase + return { + k + for k in client.get_storage_table(table_name)[1].keys() + if not k.startswith("_dlt_") + } + + @dlt.resource + def tbl_x(data): + yield data + + @dlt.resource + def tbl_y(data): + yield data + + @dlt.resource + def tbl_z(data): + yield data + + src_pl, slot_name, pub_name = src_config + + # create three postgres tables + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo", "another_col_x": 1}), + tbl_y({"id_y": 1, "val_y": "foo", "another_col_y": 1}), + tbl_z({"id_z": 1, "val_z": "foo", "another_col_z": 1}), + ] + ) + + # initialize replication and create resources + include_columns = { + "tbl_x": ["id_x", "val_x"], + "tbl_y": ["id_y", "val_y"], + # tbl_z is not specified, hence all columns should be included + } + snapshots = init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y", "tbl_z"), + publish="insert", + persist_snapshots=init_load, + include_columns=include_columns, + ) + changes = replication_resource( + slot_name=slot_name, pub_name=pub_name, include_columns=include_columns + ) + + # update three postgres tables + src_pl.run( + [ + tbl_x({"id_x": 2, "val_x": "foo", "another_col_x": 1}), + tbl_y({"id_y": 2, "val_y": "foo", "another_col_y": 1}), + tbl_z({"id_z": 2, "val_z": "foo", "another_col_z": 1}), + ] + ) + + # load to destination and assert column expectations + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, full_refresh=True + ) + if init_load: + dest_pl.run(snapshots) + assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x"} + assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y"} + assert get_cols(dest_pl, "tbl_z") == {"id_z", "val_z", "another_col_z"} + dest_pl.run(changes) + assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x", "lsn"} + assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y", "lsn"} + assert get_cols(dest_pl, "tbl_z") == {"id_z", "val_z", "another_col_z", "lsn"} + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("init_load", [True, False]) +def test_column_hints( + src_config: Tuple[dlt.Pipeline, str, str], destination_name: str, init_load: bool +) -> None: + @dlt.resource + def tbl_x(data): + yield data + + @dlt.resource + def tbl_y(data): + yield data + + @dlt.resource + def tbl_z(data): + yield data + + src_pl, slot_name, pub_name = src_config + + # create three postgres tables + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo", "another_col_x": 1}), + tbl_y({"id_y": 1, "val_y": "foo", "another_col_y": 1}), + tbl_z({"id_z": 1, "val_z": "foo", "another_col_z": 1}), + ] + ) + + # initialize replication and create resources + column_hints = { + "tbl_x": {"another_col_x": {"data_type": "double"}}, + "tbl_y": {"another_col_y": {"precision": 32}}, + # tbl_z is not specified, hence all columns should be included + } + snapshots = init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y", "tbl_z"), + publish="insert", + persist_snapshots=init_load, + columns=column_hints, + ) + changes = replication_resource( + 
slot_name=slot_name, pub_name=pub_name, columns=column_hints + ) + + # update three postgres tables + src_pl.run( + [ + tbl_x({"id_x": 2, "val_x": "foo", "another_col_x": 1}), + tbl_y({"id_y": 2, "val_y": "foo", "another_col_y": 1}), + tbl_z({"id_z": 2, "val_z": "foo", "another_col_z": 1}), + ] + ) + + # load to destination and assert column expectations + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, full_refresh=True + ) + if init_load: + dest_pl.run(snapshots) + assert ( + dest_pl.default_schema.get_table_columns("tbl_x")["another_col_x"][ + "data_type" + ] + == "double" + ) + assert ( + dest_pl.default_schema.get_table_columns("tbl_y")["another_col_y"][ + "precision" + ] + == 32 + ) + assert ( + dest_pl.default_schema.get_table_columns("tbl_z")["another_col_z"][ + "data_type" + ] + == "bigint" + ) + dest_pl.run(changes) + assert ( + dest_pl.default_schema.get_table_columns("tbl_x")["another_col_x"]["data_type"] + == "double" + ) + assert ( + dest_pl.default_schema.get_table_columns("tbl_y")["another_col_y"]["precision"] + == 32 + ) + assert ( + dest_pl.default_schema.get_table_columns("tbl_z")["another_col_z"]["data_type"] + == "bigint" + ) + + # the tests below should pass, but they don't because of a bug that causes + # column hints to be added to other tables when dispatching to multiple tables + assert "another_col_x" not in dest_pl.default_schema.get_table_columns("tbl_y") + assert "another_col_x" not in dest_pl.default_schema.get_table_columns("tbl_z") + assert "another_col_y" not in dest_pl.default_schema.get_table_columns( + "tbl_x", include_incomplete=True + ) + assert "another_col_y" not in dest_pl.default_schema.get_table_columns( + "tbl_z", include_incomplete=True + ) + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +def test_table_schema_change( + src_config: Tuple[dlt.Pipeline, str, str], destination_name: str +) -> None: + src_pl, slot_name, pub_name = src_config + + # create postgres table + src_pl.run([{"c1": 1, "c2": 1}], table_name="items") + + # initialize replication + init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names="items", + publish="insert", + ) + + # create resource and pipeline + changes = replication_resource(slot_name, pub_name) + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, full_refresh=True + ) + + # add a column in one commit, this will create one Relation message + src_pl.run([{"c1": 2, "c2": 1}, {"c1": 3, "c2": 1, "c3": 1}], table_name="items") + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "items") == {"items": 2} + exp = [{"c1": 2, "c2": 1, "c3": None}, {"c1": 3, "c2": 1, "c3": 1}] + assert_loaded_data(dest_pl, "items", ["c1", "c2", "c3"], exp, "c1") + + # add a column in two commits, this will create two Relation messages + src_pl.run([{"c1": 4, "c2": 1, "c3": 1}], table_name="items") + src_pl.run([{"c1": 5, "c2": 1, "c3": 1, "c4": 1}], table_name="items") + dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "items") == {"items": 4} + exp = [ + {"c1": 4, "c2": 1, "c3": 1, "c4": None}, + {"c1": 5, "c2": 1, "c3": 1, "c4": 1}, + ] + assert_loaded_data( + dest_pl, "items", ["c1", "c2", "c3", "c4"], exp, "c1", "c1 IN (4, 5)" + ) + + +def test_init_replication(src_config: Tuple[dlt.Pipeline, str, str]) -> None: + def get_table_names_in_pub() -> Set[str]: + with src_pl.sql_client() as c: + result = c.execute_sql( + f"SELECT tablename 
FROM pg_publication_tables WHERE pubname = '{pub_name}';" + ) + return {tup[0] for tup in result} + + @dlt.resource + def tbl_x(data): + yield data + + @dlt.resource + def tbl_y(data): + yield data + + @dlt.resource + def tbl_z(data): + yield data + + src_pl, slot_name, pub_name = src_config + + # create three postgres tables + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo"}), + tbl_y({"id_y": 1, "val_y": "foo"}), + tbl_z({"id_z": 1, "val_z": "foo"}), + ] + ) + + # initialize replication with a single table + snapshot = init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names="tbl_x", + persist_snapshots=True, + ) + assert snapshot is not None + assert get_table_names_in_pub() == {"tbl_x"} + + # adding another table is supported, but snapshot tables won't be persisted + snapshots = init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + persist_snapshots=True, + ) + assert snapshots is None + assert get_table_names_in_pub() == {"tbl_x", "tbl_y"} + + # removing a table is not supported + init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names="tbl_x", # "tbl_y" is no longer provided + ) + # "tbl_y" is still in the publication + assert get_table_names_in_pub() == {"tbl_x", "tbl_y"} + + # switching to whole schema replication is supported by omitting `table_names`, + # but only for Postgres server versions 15 or higher and with superuser privileges + is_su = is_super_user(src_pl.sql_client) + if get_pg_version() >= 150000 and is_su: + init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + ) + # includes dlt system tables + assert get_table_names_in_pub() >= {"tbl_x", "tbl_y", "tbl_z"} + else: + exp_err = ( + InsufficientPrivilege if not is_su else IncompatiblePostgresVersionException + ) + with pytest.raises(exp_err): + init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + ) + + +def test_replicate_schema(src_config: Tuple[dlt.Pipeline, str, str]) -> None: + if get_pg_version() < 150000: + pytest.skip("incompatible Postgres server version") + if not is_super_user(src_config[0].sql_client): + pytest.skip("Postgres user needs to be superuser") + + @dlt.resource + def tbl_x(data): + yield data + + @dlt.resource + def tbl_y(data): + yield data + + @dlt.resource + def tbl_z(data): + yield data + + src_pl, slot_name, pub_name = src_config + + # create two postgres tables + src_pl.run( + [ + tbl_x({"id_x": 1, "val_x": "foo"}), + tbl_y({"id_y": 1, "val_y": "foo"}), + ] + ) + + # initialize replication and create resource + init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, # we only specify `schema_name`, not `table_names` + publish="insert", + ) + changes = replication_resource(slot_name, pub_name) + + # change source tables and load to destination + src_pl.run( + [ + tbl_x({"id_x": 2, "val_x": "foo"}), + tbl_y({"id_y": 2, "val_y": "foo"}), + ] + ) + dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) + dest_pl.extract(changes) + assert set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y"} + + # introduce new table in source and assert it gets included in the replication + src_pl.run( + [ + tbl_x({"id_x": 3, "val_x": "foo"}), + tbl_y({"id_y": 3, "val_y": "foo"}), + tbl_z({"id_z": 1, "val_z": "foo"}), + ] + ) + dest_pl.extract(changes) + assert 
set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y", "tbl_z"} + + +def test_batching(src_config: Tuple[dlt.Pipeline, str, str]) -> None: + # this test asserts the number of data items yielded by the replication resource + # is not affected by `target_batch_size` and the number of replication messages per transaction + src_pl, slot_name, pub_name = src_config + + # create postgres table with single record + data = {"id": 1000, "val": True} + src_pl.run([data], table_name="items") + + # initialize replication and create resource for changes + init_replication( + slot_name=slot_name, + pub_name=pub_name, + schema_name=src_pl.dataset_name, + table_names="items", + ) + changes = replication_resource(slot_name, pub_name, target_batch_size=50) + + # create destination pipeline and resource + dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) + + # insert 100 records into source table in one transaction + batch = [{**r, **{"id": key}} for r in [data] for key in range(1, 101)] + src_pl.run(batch, table_name="items") + extract_info = dest_pl.extract(changes) + assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 + + # insert 100 records into source table in 5 transactions + batch = [{**r, **{"id": key}} for r in [data] for key in range(101, 121)] + src_pl.run(batch, table_name="items") + batch = [{**r, **{"id": key}} for r in [data] for key in range(121, 141)] + src_pl.run(batch, table_name="items") + batch = [{**r, **{"id": key}} for r in [data] for key in range(141, 161)] + src_pl.run(batch, table_name="items") + batch = [{**r, **{"id": key}} for r in [data] for key in range(161, 181)] + src_pl.run(batch, table_name="items") + batch = [{**r, **{"id": key}} for r in [data] for key in range(181, 201)] + src_pl.run(batch, table_name="items") + extract_info = dest_pl.extract(changes) + assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 diff --git a/tests/pg_legacy_replication/utils.py b/tests/pg_legacy_replication/utils.py new file mode 100644 index 000000000..fe7695b91 --- /dev/null +++ b/tests/pg_legacy_replication/utils.py @@ -0,0 +1,52 @@ +from typing import Sequence, List, Dict, Any, Optional + +import dlt +from dlt import Pipeline +from dlt.common.data_writers.escape import escape_postgres_identifier +from dlt.common.configuration.specs import ConnectionStringCredentials + +from tests.utils import select_data + + +def add_pk(sql_client, table_name: str, column_name: str) -> None: + """Adds primary key to postgres table. + + In the context of replication, the primary key serves as REPLICA IDENTITY. + A REPLICA IDENTITY is required when publishing UPDATEs and/or DELETEs. 
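+
+    For example, `add_pk(sql_client, "items", "col1")` roughly issues (with the
+    table name qualified by the pipeline's dataset schema):
+
+        ALTER TABLE items ADD PRIMARY KEY (col1);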
+ """ + with sql_client() as c: + qual_name = c.make_qualified_table_name(table_name) + c.execute_sql(f"ALTER TABLE {qual_name} ADD PRIMARY KEY ({column_name});") + + +def assert_loaded_data( + pipeline: Pipeline, + table_name: str, + column_names: Sequence[str], + expectation: List[Dict[str, Any]], + sort_column_name: str, + where_clause: Optional[str] = None, +) -> None: + """Asserts loaded data meets expectation.""" + qual_name = pipeline.sql_client().make_qualified_table_name(table_name) + escape_id = pipeline.destination_client().capabilities.escape_identifier + column_str = ", ".join(map(escape_id, column_names)) + qry = f"SELECT {column_str} FROM {qual_name}" + if where_clause is not None: + qry += " WHERE " + where_clause + observation = [ + {column_name: row[idx] for idx, column_name in enumerate(column_names)} + for row in select_data(pipeline, qry) + ] + assert sorted(observation, key=lambda d: d[sort_column_name]) == expectation + + +def is_super_user(sql_client) -> bool: + """Returns True if Postgres user is superuser, False otherwise.""" + username = dlt.secrets.get( + "sources.pg_replication.credentials", ConnectionStringCredentials + ).username + with sql_client() as c: + return c.execute_sql( + f"SELECT rolsuper FROM pg_roles WHERE rolname = '{username}';" + )[0][0] From 73704af26963379065664b2af584adaadcc5833c Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 18 Sep 2024 16:16:20 +0200 Subject: [PATCH 05/88] wip: saving work --- pyproject.toml | 1 + sources/pg_legacy_replication/__init__.py | 4 ++-- sources/pg_legacy_replication/helpers.py | 18 +++++++++++------- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f3bd6f85d..a68314bc1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ twisted = "22.10.0" pytest-forked = "^1.6.0" pendulum = "^3.0.0" types-protobuf = "^5.27.0.20240907" +devtools = "^0.12.2" [tool.poetry.group.sql_database.dependencies] sqlalchemy = ">=1.4" diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 74482e226..87720451a 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -79,7 +79,7 @@ def replication_resource( advance_slot(start_lsn, slot_name, credentials) # continue until last message in replication slot - options = {"publication_names": pub_name, "proto_version": "1"} + options: Dict[str, str] = {} upto_lsn = get_max_lsn(slot_name, options, credentials) if upto_lsn is None: return @@ -100,4 +100,4 @@ def replication_resource( if gen.generated_all: dlt.current.resource_state()["last_commit_lsn"] = gen.last_commit_lsn break - start_lsn = gen.last_commit_lsn + start_lsn = gen.last_commit_lsn \ No newline at end of file diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 47e743542..b8d19f13b 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -47,7 +47,7 @@ ) -@dlt.sources.config.with_config(sections=("sources", "pg_replication")) +@dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) def init_replication( slot_name: str, pub_name: str, @@ -136,7 +136,7 @@ def init_replication( add_schema_to_publication(schema_name, pub_name, cur) else: add_tables_to_publication(table_names, schema_name, pub_name, cur) - slot = create_replication_slot(slot_name, cur) + slot = create_replication_slot(slot_name, cur, "decoderbufs") if persist_snapshots: if slot is 
None: logger.info( @@ -179,7 +179,7 @@ def init_replication( return None -@dlt.sources.config.with_config(sections=("sources", "pg_replication")) +@dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) def get_pg_version( cur: cursor = None, credentials: ConnectionStringCredentials = dlt.secrets.value, @@ -566,11 +566,15 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: decode=False, options=self.options, ) + pub_opts = { + "insert": True, + "update": True, + "delete": True, + "truncate": False, + } consumer = MessageConsumer( upto_lsn=self.upto_lsn, - pub_ops=get_pub_ops( - self.options["publication_names"], self.credentials - ), + pub_ops=pub_opts, target_batch_size=self.target_batch_size, include_columns=self.include_columns, columns=self.columns, @@ -784,4 +788,4 @@ def gen_data_item( data_item["lsn"] = lsn if for_delete: data_item["deleted_ts"] = commit_ts - return data_item + return data_item \ No newline at end of file From 7d1b8e7b3bf884758807959e03052bc8c2b50d66 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 1 Oct 2024 23:30:45 +0200 Subject: [PATCH 06/88] wip: saving work --- pyproject.toml | 1 + sources/pg_legacy_replication/__init__.py | 4 +- sources/pg_legacy_replication/helpers.py | 106 +++++++++++++----- sources/pg_legacy_replication/schema_types.py | 9 +- sources/pg_replication/helpers.py | 16 +++ .../pg_legacy_replication/test_extractors.py | 63 +++++++++++ 6 files changed, 166 insertions(+), 33 deletions(-) create mode 100644 tests/pg_legacy_replication/test_extractors.py diff --git a/pyproject.toml b/pyproject.toml index a68314bc1..02feaca1f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,7 @@ pytest-forked = "^1.6.0" pendulum = "^3.0.0" types-protobuf = "^5.27.0.20240907" devtools = "^0.12.2" +pytest-cov = "^5.0.0" [tool.poetry.group.sql_database.dependencies] sqlalchemy = ">=1.4" diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 87720451a..e1e7760ed 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -80,7 +80,7 @@ def replication_resource( # continue until last message in replication slot options: Dict[str, str] = {} - upto_lsn = get_max_lsn(slot_name, options, credentials) + upto_lsn = get_max_lsn(slot_name, credentials) if upto_lsn is None: return @@ -100,4 +100,4 @@ def replication_resource( if gen.generated_all: dlt.current.resource_state()["last_commit_lsn"] = gen.last_commit_lsn break - start_lsn = gen.last_commit_lsn \ No newline at end of file + start_lsn = gen.last_commit_lsn diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index b8d19f13b..c8b159da1 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -35,7 +35,7 @@ from dlt.extract.resource import DltResource from dlt.sources.credentials import ConnectionStringCredentials -from .schema_types import _to_dlt_column_schema, _to_dlt_val +from .schema_types import _to_dlt_column_schema, _to_dlt_val, _PG_TYPES, _type_mapper from .exceptions import IncompatiblePostgresVersionException from .decoders import ( Begin, @@ -44,7 +44,10 @@ Update, Delete, ColumnData, + convert_pg_ts, ) +from .pg_logicaldec_pb2 import RowMessage, Op # type: ignore [attr-defined] +from google.protobuf.json_format import MessageToDict @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) @@ -384,7 +387,6 @@ def snapshot_table_resource( def 
get_max_lsn( slot_name: str, - options: Dict[str, str], credentials: ConnectionStringCredentials, ) -> Optional[int]: """Returns maximum Log Sequence Number (LSN) in replication slot. @@ -393,14 +395,10 @@ def get_max_lsn( Does not consume the slot, i.e. messages are not flushed. Raises error if the replication slot or publication does not exist. """ - # comma-separated value string - options_str = ", ".join( - f"'{x}'" for xs in list(map(list, options.items())) for x in xs - ) cur = _get_conn(credentials).cursor() cur.execute( "SELECT MAX(lsn) - '0/0' AS max_lsn " # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) - f"FROM pg_logical_slot_peek_binary_changes('{slot_name}', NULL, NULL, {options_str});" + f"FROM pg_logical_slot_peek_binary_changes('{slot_name}', NULL, NULL);" ) lsn: int = cur.fetchone()[0] cur.connection.close() @@ -640,24 +638,36 @@ def process_msg(self, msg: ReplicationMessage) -> None: - `target_batch_size` is reached - a table's schema has changed """ - op = msg.payload[:1] - if op == b"I": - self.process_change(Insert(msg.payload), msg.data_start) - elif op == b"U": - self.process_change(Update(msg.payload), msg.data_start) - elif op == b"D": - self.process_change(Delete(msg.payload), msg.data_start) - elif op == b"B": - self.last_commit_ts = Begin(msg.payload).commit_ts # type: ignore[assignment] - elif op == b"C": - self.process_commit(msg) - elif op == b"R": - self.process_relation(Relation(msg.payload)) - elif op == b"T": - logger.warning( - "The truncate operation is currently not supported. " - "Truncate replication messages are ignored." - ) + row_msg = RowMessage() + row_msg.ParseFromString(msg.payload) + from devtools import debug + + debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] + op = row_msg.op + if op == Op.BEGIN: + self.last_commit_ts = convert_pg_ts(row_msg.commit_time) # type: ignore[assignment] + # if op == Op.UPDATE: + # self.process_change(row_msg) + # op = msg.payload[:1] + # if op == b"I": + # self.process_change(Insert(msg.payload), msg.data_start) + # elif op == b"U": + # self.process_change(Update(msg.payload), msg.data_start) + # elif op == b"D": + # self.process_change(Delete(msg.payload), msg.data_start) + # elif op == b"B": + # self.last_commit_ts = Begin(msg.payload).commit_ts # type: ignore[assignment] + # elif op == b"C": + # self.process_commit(msg) + # elif op == b"R": + # self.process_relation(Relation(msg.payload)) + # elif op == b"T": + # logger.warning( + # "The truncate operation is currently not supported. " + # "Truncate replication messages are ignored." + # ) + else: + raise AssertionError(f"Unsupported operation : {row_msg}") def process_commit(self, msg: ReplicationMessage) -> None: """Updates object state when Commit message is observed. 
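+# A possible shape for the remaining branches of `process_msg` above, mirroring
+# the pgoutput handling it commented out and assuming the decoderbufs `Op` enum
+# also exposes INSERT, UPDATE, DELETE and COMMIT:
+#
+#     elif op in (Op.INSERT, Op.UPDATE, Op.DELETE):
+#         self.process_change(row_msg)
+#     elif op == Op.COMMIT:
+#         self.process_commit(msg)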
@@ -788,4 +798,48 @@ def gen_data_item( data_item["lsn"] = lsn if for_delete: data_item["deleted_ts"] = commit_ts - return data_item \ No newline at end of file + return data_item + + +from dlt.common.schema.typing import TColumnSchema, TColumnType +from typing import Any, Dict +from devtools import debug + + +def extract_table_schema(row_msg: RowMessage) -> Dict[str, Any]: + debug(row_msg) + schema_name, table_name = row_msg.table.split(".") + # Remove leading and trailing quotes + table_name = table_name[1:-1] + import re + + regex = r"^(?P[a-zA-Z_][a-zA-Z0-9_]{0,62})_snapshot_(?P[a-zA-Z0-9_-]+)$" + match = re.match(regex, table_name) + if match: + table_name = match.group("table_name") + snapshot_name = match.group("snapshot_name") + print(f"Table name: {table_name}, Snapshot name: {snapshot_name}") + + precision_map = { + "datum_int32": 32, + "datum_int64": 64, + "datum_float": 32, + "datum_double": 64, + } + + new_columns = {} + for col, typeinfo in zip(row_msg.new_tuple, row_msg.new_typeinfo): + base_data_type: TColumnType = _type_mapper().from_db_type(typeinfo.modifier) + column_data: TColumnSchema = { + "name": col.column_name, + "nullable": typeinfo.value_optional, + **base_data_type, + } + + precision = precision_map.get(col.WhichOneof("datum")) + if precision is not None: + column_data["precision"] = precision + + new_columns[col.column_name] = column_data + + return {"name": table_name, "columns": new_columns} diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index ea1e3c057..6fcc27f93 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -6,6 +6,8 @@ from dlt.common.data_types.typing import TDataType from dlt.common.data_types.type_helpers import coerce_value from dlt.common.schema.typing import TColumnSchema, TColumnType +from dlt.destinations import postgres +from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper from .decoders import ColumnType @@ -77,10 +79,7 @@ def _get_scale(type_id: int, atttypmod: int) -> Optional[int]: @lru_cache(maxsize=None) -def _type_mapper() -> Any: - from dlt.destinations import postgres - from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper - +def _type_mapper() -> PostgresTypeMapper: return PostgresTypeMapper(postgres().capabilities()) @@ -92,7 +91,7 @@ def _to_dlt_column_type(type_id: int, atttypmod: int) -> TColumnType: pg_type = _PG_TYPES.get(type_id) precision = _get_precision(type_id, atttypmod) scale = _get_scale(type_id, atttypmod) - return _type_mapper().from_db_type(pg_type, precision, scale) # type: ignore[no-any-return] + return _type_mapper().from_db_type(pg_type, precision, scale) def _to_dlt_column_schema(col: ColumnType) -> TColumnSchema: diff --git a/sources/pg_replication/helpers.py b/sources/pg_replication/helpers.py index 3b29f79c3..99acc3911 100644 --- a/sources/pg_replication/helpers.py +++ b/sources/pg_replication/helpers.py @@ -573,6 +573,9 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: self.generated_all = consumer.consumed_all +from devtools import debug + + class MessageConsumer: """Consumes messages from a ReplicationCursor sequentially. 
@@ -676,6 +679,8 @@ def process_relation(self, decoded_msg: Relation) -> None: "columns": columns, } + debug(self.last_table_schema[decoded_msg.relation_id]) + # apply user input # 1) exclude columns include_columns = ( @@ -718,6 +723,15 @@ def process_relation(self, decoded_msg: Relation) -> None: ), create_table_variant=True, ) + debug(decoded_msg) + debug( + { + "type": "_meta_item", + "table_name": table_name, + "write_disposition": write_disposition, + "columns": columns, + } + ) self.data_items[decoded_msg.relation_id] = [meta_item] def process_change( @@ -745,6 +759,8 @@ def process_change( ), ) self.data_items[decoded_msg.relation_id].append(data_item) + debug(decoded_msg) + debug(data_item) @staticmethod def gen_data_item( diff --git a/tests/pg_legacy_replication/test_extractors.py b/tests/pg_legacy_replication/test_extractors.py new file mode 100644 index 000000000..001ac8e90 --- /dev/null +++ b/tests/pg_legacy_replication/test_extractors.py @@ -0,0 +1,63 @@ +from sources.pg_legacy_replication.helpers import extract_table_schema +from sources.pg_legacy_replication.pg_logicaldec_pb2 import ( + RowMessage, + Op, + TypeInfo, + DatumMessage, +) + + +def test_extract_table_schema(): + row_msg = RowMessage() + row_msg.table = ( + 'src_pl_dataset_202410010746423478."tbl_x_snapshot_00000003-00000149-1"' + ) + row_msg.op = Op.INSERT + row_msg.new_tuple.extend( + [ + DatumMessage(column_name="id_x", column_type=20, datum_int64=1), + DatumMessage(column_name="val_x", column_type=1043, datum_string="foo"), + DatumMessage( + column_name="_dlt_load_id", + column_type=1043, + datum_string="1727812002.3873408", + ), + DatumMessage( + column_name="_dlt_id", column_type=1043, datum_string="EVvtapNpxpWbqA" + ), + ] + ) + row_msg.new_typeinfo.extend( + [ + TypeInfo(modifier="bigint", value_optional=True), + TypeInfo(modifier="character varying", value_optional=True), + TypeInfo(modifier="character varying", value_optional=True), + TypeInfo(modifier="character varying", value_optional=True), + ] + ) + assert extract_table_schema(row_msg) == { + "name": "tbl_x", + "columns": { + "id_x": { + "data_type": "bigint", + "precision": 64, + "name": "id_x", + "nullable": True, + }, + "val_x": { + "data_type": "text", + "name": "val_x", + "nullable": True, + }, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": True, + }, + "_dlt_id": { + "data_type": "text", + "name": "_dlt_id", + "nullable": True, + }, + }, + } From ecbf98dfd255a5226a64a00ac882a3eab0c31990 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 2 Oct 2024 13:19:34 +0200 Subject: [PATCH 07/88] wip: saving work --- sources/pg_legacy_replication/helpers.py | 52 +++---------------- sources/pg_legacy_replication/schema_types.py | 50 ++++++++++++++++-- ...est_extractors.py => test_schema_types.py} | 8 +-- 3 files changed, 57 insertions(+), 53 deletions(-) rename tests/pg_legacy_replication/{test_extractors.py => test_schema_types.py} (89%) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index c8b159da1..215ecd4e6 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -35,7 +35,7 @@ from dlt.extract.resource import DltResource from dlt.sources.credentials import ConnectionStringCredentials -from .schema_types import _to_dlt_column_schema, _to_dlt_val, _PG_TYPES, _type_mapper +from .schema_types import _to_dlt_column_schema, _to_dlt_val from .exceptions import IncompatiblePostgresVersionException from .decoders import ( 
Begin, @@ -46,7 +46,8 @@ ColumnData, convert_pg_ts, ) -from .pg_logicaldec_pb2 import RowMessage, Op # type: ignore [attr-defined] + +from .pg_logicaldec_pb2 import Op, RowMessage # type: ignore [attr-defined] from google.protobuf.json_format import MessageToDict @@ -590,6 +591,9 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: self.generated_all = consumer.consumed_all +from devtools import debug + + class MessageConsumer: """Consumes messages from a ReplicationCursor sequentially. @@ -799,47 +803,3 @@ def gen_data_item( if for_delete: data_item["deleted_ts"] = commit_ts return data_item - - -from dlt.common.schema.typing import TColumnSchema, TColumnType -from typing import Any, Dict -from devtools import debug - - -def extract_table_schema(row_msg: RowMessage) -> Dict[str, Any]: - debug(row_msg) - schema_name, table_name = row_msg.table.split(".") - # Remove leading and trailing quotes - table_name = table_name[1:-1] - import re - - regex = r"^(?P[a-zA-Z_][a-zA-Z0-9_]{0,62})_snapshot_(?P[a-zA-Z0-9_-]+)$" - match = re.match(regex, table_name) - if match: - table_name = match.group("table_name") - snapshot_name = match.group("snapshot_name") - print(f"Table name: {table_name}, Snapshot name: {snapshot_name}") - - precision_map = { - "datum_int32": 32, - "datum_int64": 64, - "datum_float": 32, - "datum_double": 64, - } - - new_columns = {} - for col, typeinfo in zip(row_msg.new_tuple, row_msg.new_typeinfo): - base_data_type: TColumnType = _type_mapper().from_db_type(typeinfo.modifier) - column_data: TColumnSchema = { - "name": col.column_name, - "nullable": typeinfo.value_optional, - **base_data_type, - } - - precision = precision_map.get(col.WhichOneof("datum")) - if precision is not None: - column_data["precision"] = precision - - new_columns[col.column_name] = column_data - - return {"name": table_name, "columns": new_columns} diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 6fcc27f93..c03ed7afb 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -5,12 +5,17 @@ from dlt.common import Decimal from dlt.common.data_types.typing import TDataType from dlt.common.data_types.type_helpers import coerce_value -from dlt.common.schema.typing import TColumnSchema, TColumnType +from dlt.common.schema.typing import ( + TColumnSchema, + TColumnType, + TTableSchemaColumns, + TTableSchema, +) from dlt.destinations import postgres from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper from .decoders import ColumnType - +from .pg_logicaldec_pb2 import RowMessage # type: ignore[attr-defined] _DUMMY_VALS: Dict[TDataType, Any] = { "bigint": 0, @@ -27,7 +32,6 @@ } """Dummy values used to replace NULLs in NOT NULL columns in key-only delete records.""" - _PG_TYPES: Dict[int, str] = { 16: "boolean", 17: "bytea", @@ -44,6 +48,14 @@ } """Maps postgres type OID to type string. Only includes types present in PostgresTypeMapper.""" +_DATUM_PRECISIONS: Dict[str, int] = { + "datum_int32": 32, + "datum_int64": 64, + "datum_float": 32, + "datum_double": 64, +} +"""TODO: Add comment here""" + def _get_precision(type_id: int, atttypmod: int) -> Optional[int]: """Get precision from postgres type attributes.""" @@ -122,3 +134,35 @@ def _to_dlt_val(val: str, data_type: TDataType, byte1: str, for_delete: bool) -> raise ValueError( f"Byte1 in replication message must be 'n' or 't', not '{byte1}'." 
) + + +def _extract_table_schema(row_msg: RowMessage) -> TTableSchema: + schema_name, table_name = row_msg.table.split(".") + # Remove leading and trailing quotes + table_name = table_name[1:-1] + import re + + regex = r"^(?P[a-zA-Z_][a-zA-Z0-9_]{0,62})_snapshot_(?P[a-zA-Z0-9_-]+)$" + match = re.match(regex, table_name) + if match: + table_name = match.group("table_name") + snapshot_name = match.group("snapshot_name") + print(f"Table name: {table_name}, Snapshot name: {snapshot_name}") + + columns: TTableSchemaColumns = {} + for c, c_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): + assert _PG_TYPES[c.column_type] == c_info.modifier + col_type: TColumnType = _type_mapper().from_db_type(c_info.modifier) + col_schema: TColumnSchema = { + "name": c.column_name, + "nullable": c_info.value_optional, + **col_type, + } + + precision = _DATUM_PRECISIONS.get(c.WhichOneof("datum")) + if precision is not None: + col_schema["precision"] = precision + + columns[c.column_name] = col_schema + + return {"name": table_name, "columns": columns} diff --git a/tests/pg_legacy_replication/test_extractors.py b/tests/pg_legacy_replication/test_schema_types.py similarity index 89% rename from tests/pg_legacy_replication/test_extractors.py rename to tests/pg_legacy_replication/test_schema_types.py index 001ac8e90..1421a60bb 100644 --- a/tests/pg_legacy_replication/test_extractors.py +++ b/tests/pg_legacy_replication/test_schema_types.py @@ -1,4 +1,4 @@ -from sources.pg_legacy_replication.helpers import extract_table_schema +from sources.pg_legacy_replication.schema_types import _extract_table_schema from sources.pg_legacy_replication.pg_logicaldec_pb2 import ( RowMessage, Op, @@ -29,20 +29,20 @@ def test_extract_table_schema(): ) row_msg.new_typeinfo.extend( [ - TypeInfo(modifier="bigint", value_optional=True), + TypeInfo(modifier="bigint"), TypeInfo(modifier="character varying", value_optional=True), TypeInfo(modifier="character varying", value_optional=True), TypeInfo(modifier="character varying", value_optional=True), ] ) - assert extract_table_schema(row_msg) == { + assert _extract_table_schema(row_msg) == { "name": "tbl_x", "columns": { "id_x": { "data_type": "bigint", "precision": 64, "name": "id_x", - "nullable": True, + "nullable": False, }, "val_x": { "data_type": "text", From 3ed14da350e2e0d005fd6e13eea4ebd02a66c015 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 3 Oct 2024 18:24:12 +0200 Subject: [PATCH 08/88] wip: removed all references to publications --- sources/pg_legacy_replication/__init__.py | 4 +- sources/pg_legacy_replication/helpers.py | 147 ++---------------- sources/pg_legacy_replication_pipeline.py | 27 ++-- .../test_pg_replication.py | 14 +- 4 files changed, 34 insertions(+), 158 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index e1e7760ed..5082e1554 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -13,12 +13,11 @@ @dlt.resource( - name=lambda args: args["slot_name"] + "_" + args["pub_name"], + name=lambda args: args["slot_name"], standalone=True, ) def replication_resource( slot_name: str, - pub_name: str, credentials: ConnectionStringCredentials = dlt.secrets.value, include_columns: Optional[Dict[str, Sequence[str]]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, @@ -36,7 +35,6 @@ def replication_resource( Args: slot_name (str): Name of the replication slot to consume replication messages from. 
- pub_name (str): Name of the publication that publishes DML operations for the table(s). credentials (ConnectionStringCredentials): Postgres database credentials. include_columns (Optional[Dict[str, Sequence[str]]]): Maps table name(s) to sequence of names of columns to include in the generated data items. diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 215ecd4e6..af8638f19 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -54,16 +54,14 @@ @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) def init_replication( slot_name: str, - pub_name: str, - schema_name: str, + schema: str, table_names: Optional[Union[str, Sequence[str]]] = None, credentials: ConnectionStringCredentials = dlt.secrets.value, - publish: str = "insert, update, delete", - persist_snapshots: bool = False, + take_snapshots: bool = True, include_columns: Optional[Dict[str, Sequence[str]]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, reset: bool = False, -) -> Optional[Union[DltResource, List[DltResource]]]: +) -> Optional[List[DltResource]]: """Initializes replication for one, several, or all tables within a schema. Can be called repeatedly with the same `slot_name` and `pub_name`: @@ -81,7 +79,7 @@ def init_replication( Args: slot_name (str): Name of the replication slot to create if it does not exist yet. pub_name (str): Name of the publication to create if it does not exist yet. - schema_name (str): Name of the schema to replicate tables from. + schema (str): Name of the schema to replicate tables from. table_names (Optional[Union[str, Sequence[str]]]): Name(s) of the table(s) to include in the publication. If not provided, all tables in the schema are included (also tables added to the schema after the publication was created). @@ -91,7 +89,7 @@ def init_replication( are `insert`, `update`, and `delete`. `truncate` is currently not supported—messages of that type are ignored. E.g. `publish="insert"` will create a publication that only publishes insert operations. - persist_snapshots (bool): Whether the table states in the snapshot exported + take_snapshots (bool): Whether the table states in the snapshot exported during replication slot creation are persisted to tables. If true, a snapshot table is created in Postgres for all included tables, and corresponding resources (`DltResource` objects) for these tables are created and returned. @@ -107,7 +105,7 @@ def init_replication( "table_y": ["col_x", "col_y", "col_z"], } ``` - Argument is only used if `persist_snapshots` is `True`. + Argument is only used if `take_snapshots` is `True`. columns (Optional[Dict[str, TTableSchemaColumns]]): Maps table name(s) to column hints to apply on the snapshot table resource(s). For example: @@ -117,33 +115,27 @@ def init_replication( "table_y": {"col_y": {"precision": 32}}, } ``` - Argument is only used if `persist_snapshots` is `True`. + Argument is only used if `take_snapshots` is `True`. reset (bool): If set to True, the existing slot and publication are dropped and recreated. Has no effect if a slot and publication with the provided names do not yet exist. 
Returns: - - None if `persist_snapshots` is `False` + - None if `take_snapshots` is `False` - a `DltResource` object or a list of `DltResource` objects for the snapshot - table(s) if `persist_snapshots` is `True` and the replication slot did not yet exist + table(s) if `take_snapshots` is `True` and the replication slot did not yet exist """ - if persist_snapshots: + if take_snapshots: _import_sql_table_resource() if isinstance(table_names, str): table_names = [table_names] cur = _get_rep_conn(credentials).cursor() if reset: drop_replication_slot(slot_name, cur) - drop_publication(pub_name, cur) - create_publication(pub_name, cur, publish) - if table_names is None: - add_schema_to_publication(schema_name, pub_name, cur) - else: - add_tables_to_publication(table_names, schema_name, pub_name, cur) - slot = create_replication_slot(slot_name, cur, "decoderbufs") - if persist_snapshots: + slot = create_replication_slot(slot_name, cur) + if take_snapshots: if slot is None: - logger.info( + raise NotImplementedError( "Cannot persist snapshots because they do not exist. " f'The replication slot "{slot_name}" already existed prior to calling this function.' ) @@ -154,7 +146,7 @@ def init_replication( persist_snapshot_table( snapshot_name=slot["snapshot_name"], table_name=table_name, - schema_name=schema_name, + schema_name=schema, cur=cur_snap, include_columns=( None @@ -167,9 +159,9 @@ def init_replication( snapshot_table_resources = [ snapshot_table_resource( snapshot_table_name=snapshot_table_name, - schema_name=schema_name, + schema_name=schema, table_name=table_name, - write_disposition="append" if publish == "insert" else "merge", + write_disposition="merge", # FIXME Change later columns=None if columns is None else columns.get(table_name), credentials=credentials, ) @@ -177,8 +169,6 @@ def init_replication( table_names, snapshot_table_names ) ] - if len(snapshot_table_resources) == 1: - return snapshot_table_resources[0] return snapshot_table_resources return None @@ -194,98 +184,8 @@ def get_pg_version( return _get_conn(credentials).server_version -def create_publication( - name: str, - cur: cursor, - publish: str = "insert, update, delete", -) -> None: - """Creates a publication for logical replication if it doesn't exist yet. - - Does nothing if the publication already exists. - Raises error if the user does not have the CREATE privilege for the database. - """ - esc_name = escape_postgres_identifier(name) - try: - cur.execute(f"CREATE PUBLICATION {esc_name} WITH (publish = '{publish}');") - logger.info( - f"Successfully created publication {esc_name} with publish = '{publish}'." - ) - except psycopg2.errors.DuplicateObject: # the publication already exists - logger.info(f'Publication "{name}" already exists.') - - -def add_table_to_publication( - table_name: str, - schema_name: str, - pub_name: str, - cur: cursor, -) -> None: - """Adds a table to a publication for logical replication. - - Does nothing if the table is already a member of the publication. - Raises error if the user is not owner of the table. - """ - qual_name = _make_qualified_table_name(table_name, schema_name) - esc_pub_name = escape_postgres_identifier(pub_name) - try: - cur.execute(f"ALTER PUBLICATION {esc_pub_name} ADD TABLE {qual_name};") - logger.info( - f"Successfully added table {qual_name} to publication {esc_pub_name}." - ) - except psycopg2.errors.DuplicateObject: - logger.info( - f"Table {qual_name} is already a member of publication {esc_pub_name}." 
- ) - - -def add_tables_to_publication( - table_names: Union[str, Sequence[str]], - schema_name: str, - pub_name: str, - cur: cursor, -) -> None: - """Adds one or multiple tables to a publication for logical replication. - - Calls `add_table_to_publication` for each table in `table_names`. - """ - if isinstance(table_names, str): - table_names = table_names - for table_name in table_names: - add_table_to_publication(table_name, schema_name, pub_name, cur) - - -def add_schema_to_publication( - schema_name: str, - pub_name: str, - cur: cursor, -) -> None: - """Adds a schema to a publication for logical replication if the schema is not a member yet. - - Raises error if the user is not a superuser. - """ - if (version := get_pg_version(cur)) < 150000: - raise IncompatiblePostgresVersionException( - f"Cannot add schema to publication because the Postgres server version {version} is too low." - " Adding schemas to a publication is only supported for Postgres version 15 or higher." - " Upgrade your Postgres server version or set the `table_names` argument to explicitly specify table names." - ) - esc_schema_name = escape_postgres_identifier(schema_name) - esc_pub_name = escape_postgres_identifier(pub_name) - try: - cur.execute( - f"ALTER PUBLICATION {esc_pub_name} ADD TABLES IN SCHEMA {esc_schema_name};" - ) - logger.info( - f"Successfully added schema {esc_schema_name} to publication {esc_pub_name}." - ) - except psycopg2.errors.DuplicateObject: - logger.info( - f"Schema {esc_schema_name} is already a member of publication {esc_pub_name}." - ) - - def create_replication_slot( # type: ignore[return] - name: str, cur: ReplicationCursor, output_plugin: str = "pgoutput" + name: str, cur: ReplicationCursor, output_plugin: str = "decoderbufs" ) -> Optional[Dict[str, str]]: """Creates a replication slot if it doesn't exist yet.""" try: @@ -315,19 +215,6 @@ def drop_replication_slot(name: str, cur: ReplicationCursor) -> None: ) -def drop_publication(name: str, cur: ReplicationCursor) -> None: - """Drops a publication if it exists.""" - esc_name = escape_postgres_identifier(name) - try: - cur.execute(f"DROP PUBLICATION {esc_name};") - cur.connection.commit() - logger.info(f"Successfully dropped publication {esc_name}.") - except psycopg2.errors.UndefinedObject: # the publication does not exist - logger.info( - f"Publication {esc_name} cannot be dropped because it does not exist." 
- ) - - def persist_snapshot_table( snapshot_name: str, table_name: str, diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index f3c428cce..03611f8fb 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -34,17 +34,15 @@ def replicate_single_table() -> None: # initialize replication for the source table—this creates a replication slot and publication slot_name = "example_slot" - pub_name = "example_pub" init_replication( # requires the Postgres user to have the REPLICATION attribute assigned slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names="my_source_table", reset=True, ) # create a resource that generates items for each change in the source table - changes = replication_resource(slot_name, pub_name) + changes = replication_resource(slot_name) # insert two records in source table and propagate changes to destination change_source_table( @@ -67,7 +65,7 @@ def replicate_single_table() -> None: def replicate_with_initial_load() -> None: """Sets up replication with initial load. - Demonstrates usage of `persist_snapshots` argument and snapshot resource + Demonstrates usage of `take_snapshots` argument and snapshot resource returned by `init_replication` helper. """ # create source and destination pipelines @@ -91,13 +89,11 @@ def replicate_with_initial_load() -> None: # initialize replication for the source table slot_name = "example_slot" - pub_name = "example_pub" snapshot = init_replication( # requires the Postgres user to have the REPLICATION attribute assigned slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names="my_source_table", - persist_snapshots=True, # persist snapshot table(s) and let function return resource(s) for initial load + take_snapshots=True, # persist snapshot table(s) and let function return resource(s) for initial load reset=True, ) @@ -107,7 +103,7 @@ def replicate_with_initial_load() -> None: # insert record in source table and propagate change to destination change_source_table(src_pl, "INSERT INTO {table_name} VALUES (3, true);") - changes = replication_resource(slot_name, pub_name) + changes = replication_resource(slot_name) dest_pl.run(changes) show_destination_table(dest_pl) @@ -141,16 +137,14 @@ def replicate_entire_schema() -> None: # initialize schema replication by omitting the `table_names` argument slot_name = "example_slot" - pub_name = "example_pub" init_replication( # initializing schema replication requires the Postgres user to be a superuser slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, reset=True, ) # create a resource that generates items for each change in the schema's tables - changes = replication_resource(slot_name, pub_name) + changes = replication_resource(slot_name) # insert records in source tables and propagate changes to destination change_source_table( @@ -200,11 +194,9 @@ def replicate_with_column_selection() -> None: # initialize schema replication by omitting the `table_names` argument slot_name = "example_slot" - pub_name = "example_pub" init_replication( # requires the Postgres user to have the REPLICATION attribute assigned slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), reset=True, ) @@ -212,7 +204,6 @@ def replicate_with_column_selection() -> None: # 
create a resource that generates items for each change in the schema's tables changes = replication_resource( slot_name=slot_name, - pub_name=pub_name, include_columns={ "tbl_x": ("c1", "c2") }, # columns not specified here are excluded from generated data items diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index e528e6757..8018c869c 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -51,7 +51,7 @@ def tbl_y(data): pub_name=pub_name, schema_name=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - persist_snapshots=True, + take_snapshots=True, ) changes = replication_resource(slot_name, pub_name) @@ -286,7 +286,7 @@ def items(data): pub_name=pub_name, schema_name=src_pl.dataset_name, table_names="items", - persist_snapshots=init_load, + take_snapshots=init_load, columns={"items": column_schema} if give_hints else None, ) @@ -429,7 +429,7 @@ def items(data): schema_name=src_pl.dataset_name, table_names="items", publish=publish, - persist_snapshots=True, + take_snapshots=True, ) # assert write disposition on snapshot resource @@ -496,7 +496,7 @@ def tbl_z(data): schema_name=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), publish="insert", - persist_snapshots=init_load, + take_snapshots=init_load, include_columns=include_columns, ) changes = replication_resource( @@ -567,7 +567,7 @@ def tbl_z(data): schema_name=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), publish="insert", - persist_snapshots=init_load, + take_snapshots=init_load, columns=column_hints, ) changes = replication_resource( @@ -717,7 +717,7 @@ def tbl_z(data): pub_name=pub_name, schema_name=src_pl.dataset_name, table_names="tbl_x", - persist_snapshots=True, + take_snapshots=True, ) assert snapshot is not None assert get_table_names_in_pub() == {"tbl_x"} @@ -728,7 +728,7 @@ def tbl_z(data): pub_name=pub_name, schema_name=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - persist_snapshots=True, + take_snapshots=True, ) assert snapshots is None assert get_table_names_in_pub() == {"tbl_x", "tbl_y"} From 9fe03012490ce2ed9c9d7eff1ce5c94a69cc552e Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 8 Oct 2024 16:50:23 +0200 Subject: [PATCH 09/88] fix: applied suggested changes mentioned here https://github.com/dlt-hub/dlt/issues/1920 --- tests/postgres/check-replication.sh | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100755 tests/postgres/check-replication.sh diff --git a/tests/postgres/check-replication.sh b/tests/postgres/check-replication.sh deleted file mode 100755 index 2e51147d1..000000000 --- a/tests/postgres/check-replication.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -set -e - -docker exec dlt_postgres_db \ - psql -x -U loader -d dlt_data \ - -c "select *, - pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) as replicationSlotLag, - pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn)) as confirmedLag - from pg_replication_slots;" \ No newline at end of file From 197ba821fe1c5cbb6be074a6f2568474a8b23f7c Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 8 Oct 2024 18:21:51 +0200 Subject: [PATCH 10/88] wip: saving work --- sources/pg_legacy_replication/helpers.py | 112 ++++++----- sources/pg_legacy_replication_pipeline.py | 6 +- .../test_pg_replication.py | 185 +++++++++--------- 3 files changed, 153 insertions(+), 150 deletions(-) diff --git 
a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index af8638f19..786d558da 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -54,17 +54,16 @@ @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) def init_replication( slot_name: str, - schema: str, - table_names: Optional[Union[str, Sequence[str]]] = None, + schema: Optional[str] = dlt.config.value, + table_names: Optional[List[str]] = dlt.config.value, credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = True, include_columns: Optional[Dict[str, Sequence[str]]] = None, - columns: Optional[Dict[str, TTableSchemaColumns]] = None, reset: bool = False, ) -> Optional[List[DltResource]]: """Initializes replication for one, several, or all tables within a schema. - Can be called repeatedly with the same `slot_name` and `pub_name`: + Can be called repeatedly with the same `slot_name`: - creates a replication slot and publication with provided names if they do not exist yet - skips creation of slot and publication if they already exist (unless`reset` is set to `False`) - supports addition of new tables by extending `table_names` @@ -78,17 +77,11 @@ def init_replication( Args: slot_name (str): Name of the replication slot to create if it does not exist yet. - pub_name (str): Name of the publication to create if it does not exist yet. schema (str): Name of the schema to replicate tables from. table_names (Optional[Union[str, Sequence[str]]]): Name(s) of the table(s) to include in the publication. If not provided, all tables in the schema are included (also tables added to the schema after the publication was created). credentials (ConnectionStringCredentials): Postgres database credentials. - publish (str): Comma-separated string of DML operations. Can be used to - control which changes are included in the publication. Allowed operations - are `insert`, `update`, and `delete`. `truncate` is currently not - supported—messages of that type are ignored. - E.g. `publish="insert"` will create a publication that only publishes insert operations. take_snapshots (bool): Whether the table states in the snapshot exported during replication slot creation are persisted to tables. If true, a snapshot table is created in Postgres for all included tables, and corresponding @@ -106,14 +99,6 @@ def init_replication( } ``` Argument is only used if `take_snapshots` is `True`. - columns (Optional[Dict[str, TTableSchemaColumns]]): Maps - table name(s) to column hints to apply on the snapshot table resource(s). - For example: - ``` - columns={ - "table_x": {"col_a": {"data_type": "complex"}}, - "table_y": {"col_y": {"precision": 32}}, - } ``` Argument is only used if `take_snapshots` is `True`. reset (bool): If set to True, the existing slot and publication are dropped @@ -127,49 +112,68 @@ def init_replication( """ if take_snapshots: _import_sql_table_resource() - if isinstance(table_names, str): - table_names = [table_names] cur = _get_rep_conn(credentials).cursor() if reset: drop_replication_slot(slot_name, cur) slot = create_replication_slot(slot_name, cur) if take_snapshots: - if slot is None: - raise NotImplementedError( - "Cannot persist snapshots because they do not exist. " - f'The replication slot "{slot_name}" already existed prior to calling this function.' 
- ) - else: - # need separate session to read the snapshot: https://stackoverflow.com/q/75852587 - cur_snap = _get_conn(credentials).cursor() - snapshot_table_names = [ - persist_snapshot_table( - snapshot_name=slot["snapshot_name"], - table_name=table_name, - schema_name=schema, - cur=cur_snap, - include_columns=( - None - if include_columns is None - else include_columns.get(table_name) - ), - ) - for table_name in table_names - ] - snapshot_table_resources = [ - snapshot_table_resource( - snapshot_table_name=snapshot_table_name, - schema_name=schema, - table_name=table_name, - write_disposition="merge", # FIXME Change later - columns=None if columns is None else columns.get(table_name), - credentials=credentials, + from sqlalchemy import text, Connection + + def init_connection(conn: Connection) -> Connection: + if slot is None: + # Using the same isolation level that pg_backup uses + conn.execute( + text( + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE, READ ONLY, DEFERRABLE;" + ) ) - for table_name, snapshot_table_name in zip( - table_names, snapshot_table_names + else: + conn.execute(text("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ;")) + conn.execute( + text(f"SET TRANSACTION SNAPSHOT '{slot['snapshot_name']}';") ) - ] - return snapshot_table_resources + return conn + + snapshot_resources = [ + sql_table( # type: ignore[name-defined] + credentials=credentials, + table=table_name, + schema=schema, + included_columns=include_columns.get(table_name), + conn_init_callback=init_connection, + ) + for table_name in table_names + ] + # # need separate session to read the snapshot: https://stackoverflow.com/q/75852587 + # cur_snap = _get_conn(credentials).cursor() + # snapshot_table_names = [ + # persist_snapshot_table( + # snapshot_name=slot["snapshot_name"], + # table_name=table_name, + # schema_name=schema, + # cur=cur_snap, + # include_columns=( + # None + # if include_columns is None + # else include_columns.get(table_name) + # ), + # ) + # for table_name in table_names + # ] + # snapshot_table_resources = [ + # snapshot_table_resource( + # snapshot_table_name=snapshot_table_name, + # schema_name=schema, + # table_name=table_name, + # write_disposition="merge", # FIXME Change later + # columns=None if columns is None else columns.get(table_name), + # credentials=credentials, + # ) + # for table_name, snapshot_table_name in zip( + # table_names, snapshot_table_names + # ) + # ] + return snapshot_resources return None diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index 03611f8fb..95f02fafc 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -37,7 +37,7 @@ def replicate_single_table() -> None: init_replication( # requires the Postgres user to have the REPLICATION attribute assigned slot_name=slot_name, schema=src_pl.dataset_name, - table_names="my_source_table", + table_names=["my_source_table"], reset=True, ) @@ -92,7 +92,7 @@ def replicate_with_initial_load() -> None: snapshot = init_replication( # requires the Postgres user to have the REPLICATION attribute assigned slot_name=slot_name, schema=src_pl.dataset_name, - table_names="my_source_table", + table_names=["my_source_table"], take_snapshots=True, # persist snapshot table(s) and let function return resource(s) for initial load reset=True, ) @@ -197,7 +197,7 @@ def replicate_with_column_selection() -> None: init_replication( # requires the Postgres user to have the REPLICATION attribute assigned slot_name=slot_name, 
schema=src_pl.dataset_name, - table_names=("tbl_x", "tbl_y"), + table_names=["tbl_x", "tbl_y"], reset=True, ) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 8018c869c..1842c51be 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -48,103 +48,102 @@ def tbl_y(data): snapshots = init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), take_snapshots=True, ) - changes = replication_resource(slot_name, pub_name) - - src_pl.run( - [ - tbl_x([{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}]), - tbl_y({"id_y": 2, "val_y": False}), - ] - ) - - dest_pl = dlt.pipeline( - pipeline_name="dest_pl", destination=destination_name, full_refresh=True - ) - - # initial load - info = dest_pl.run(snapshots) - assert_load_info(info) - assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} - exp_tbl_x = [{"id_x": 1, "val_x": "foo"}] - exp_tbl_y = [{"id_y": 1, "val_y": True}] - assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") - - # process changes - info = dest_pl.run(changes) - assert_load_info(info) - assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} - exp_tbl_x = [ - {"id_x": 1, "val_x": "foo"}, - {"id_x": 2, "val_x": "bar"}, - {"id_x": 3, "val_x": "baz"}, - ] - exp_tbl_y = [{"id_y": 1, "val_y": True}, {"id_y": 2, "val_y": False}] - assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") - - # change single table - src_pl.run(tbl_y({"id_y": 3, "val_y": True})) - - # process changes - info = dest_pl.run(changes) - assert_load_info(info) - assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} - exp_tbl_y = [ - {"id_y": 1, "val_y": True}, - {"id_y": 2, "val_y": False}, - {"id_y": 3, "val_y": True}, - ] - assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") - - # update tables - with src_pl.sql_client() as c: - qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") - c.execute_sql(f"UPDATE {qual_name} SET val_x = 'foo_updated' WHERE id_x = 1;") - qual_name = src_pl.sql_client().make_qualified_table_name("tbl_y") - c.execute_sql(f"UPDATE {qual_name} SET val_y = false WHERE id_y = 1;") - - # process changes - info = dest_pl.run(changes) - assert_load_info(info) - assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} - exp_tbl_x = [ - {"id_x": 1, "val_x": "foo_updated"}, - {"id_x": 2, "val_x": "bar"}, - {"id_x": 3, "val_x": "baz"}, - ] - exp_tbl_y = [ - {"id_y": 1, "val_y": False}, - {"id_y": 2, "val_y": False}, - {"id_y": 3, "val_y": True}, - ] - assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") - - # delete from table - with src_pl.sql_client() as c: - qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") - c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 1;") - - # process changes - info = dest_pl.run(changes) - assert_load_info(info) - assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 3} - exp_tbl_x = 
[{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] - exp_tbl_y = [ - {"id_y": 1, "val_y": False}, - {"id_y": 2, "val_y": False}, - {"id_y": 3, "val_y": True}, - ] - assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + # changes = replication_resource(slot_name, pub_name) + # + # src_pl.run( + # [ + # tbl_x([{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}]), + # tbl_y({"id_y": 2, "val_y": False}), + # ] + # ) + # + # dest_pl = dlt.pipeline( + # pipeline_name="dest_pl", destination=destination_name, full_refresh=True + # ) + # + # # initial load + # info = dest_pl.run(snapshots) + # assert_load_info(info) + # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} + # exp_tbl_x = [{"id_x": 1, "val_x": "foo"}] + # exp_tbl_y = [{"id_y": 1, "val_y": True}] + # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + # + # # process changes + # info = dest_pl.run(changes) + # assert_load_info(info) + # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} + # exp_tbl_x = [ + # {"id_x": 1, "val_x": "foo"}, + # {"id_x": 2, "val_x": "bar"}, + # {"id_x": 3, "val_x": "baz"}, + # ] + # exp_tbl_y = [{"id_y": 1, "val_y": True}, {"id_y": 2, "val_y": False}] + # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + # + # # change single table + # src_pl.run(tbl_y({"id_y": 3, "val_y": True})) + # + # # process changes + # info = dest_pl.run(changes) + # assert_load_info(info) + # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} + # exp_tbl_y = [ + # {"id_y": 1, "val_y": True}, + # {"id_y": 2, "val_y": False}, + # {"id_y": 3, "val_y": True}, + # ] + # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + # + # # update tables + # with src_pl.sql_client() as c: + # qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + # c.execute_sql(f"UPDATE {qual_name} SET val_x = 'foo_updated' WHERE id_x = 1;") + # qual_name = src_pl.sql_client().make_qualified_table_name("tbl_y") + # c.execute_sql(f"UPDATE {qual_name} SET val_y = false WHERE id_y = 1;") + # + # # process changes + # info = dest_pl.run(changes) + # assert_load_info(info) + # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} + # exp_tbl_x = [ + # {"id_x": 1, "val_x": "foo_updated"}, + # {"id_x": 2, "val_x": "bar"}, + # {"id_x": 3, "val_x": "baz"}, + # ] + # exp_tbl_y = [ + # {"id_y": 1, "val_y": False}, + # {"id_y": 2, "val_y": False}, + # {"id_y": 3, "val_y": True}, + # ] + # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + # + # # delete from table + # with src_pl.sql_client() as c: + # qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + # c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 1;") + # + # # process changes + # info = dest_pl.run(changes) + # assert_load_info(info) + # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 3} + # exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] + # exp_tbl_y = [ + # {"id_y": 1, "val_y": False}, + # 
{"id_y": 2, "val_y": False}, + # {"id_y": 3, "val_y": True}, + # ] + # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) From c897ee07e43579d3ec0c2c5f2792b8763cd6d24f Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 9 Oct 2024 18:27:49 +0200 Subject: [PATCH 11/88] wip: finally got snapshot to work --- sources/pg_legacy_replication/helpers.py | 362 +++++++++--------- .../test_pg_replication.py | 50 +-- tests/postgres/postgresql.conf | 6 +- 3 files changed, 203 insertions(+), 215 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 786d558da..ed57b01e1 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -48,6 +48,8 @@ ) from .pg_logicaldec_pb2 import Op, RowMessage # type: ignore [attr-defined] +from .schema_types import _to_dlt_column_schema, _to_dlt_val +from .exceptions import SqlDatabaseSourceImportError from google.protobuf.json_format import MessageToDict @@ -110,71 +112,57 @@ def init_replication( - a `DltResource` object or a list of `DltResource` objects for the snapshot table(s) if `take_snapshots` is `True` and the replication slot did not yet exist """ - if take_snapshots: - _import_sql_table_resource() - cur = _get_rep_conn(credentials).cursor() + rep_conn = _get_rep_conn(credentials) + rep_cur = rep_conn.cursor() if reset: - drop_replication_slot(slot_name, cur) - slot = create_replication_slot(slot_name, cur) - if take_snapshots: - from sqlalchemy import text, Connection - - def init_connection(conn: Connection) -> Connection: - if slot is None: - # Using the same isolation level that pg_backup uses - conn.execute( - text( - "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE, READ ONLY, DEFERRABLE;" - ) - ) - else: - conn.execute(text("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ;")) - conn.execute( - text(f"SET TRANSACTION SNAPSHOT '{slot['snapshot_name']}';") - ) - return conn - - snapshot_resources = [ - sql_table( # type: ignore[name-defined] - credentials=credentials, - table=table_name, - schema=schema, - included_columns=include_columns.get(table_name), - conn_init_callback=init_connection, + drop_replication_slot(slot_name, rep_cur) + slot = create_replication_slot(slot_name, rep_cur) + + if not take_snapshots: + rep_conn.close() + return None + + _import_sql_table_resource() + # If this point is reached it means that sqlalchemy is available + from sqlalchemy import Connection as ConnectionSqla, Engine, event + + if include_columns is None: + include_columns = {} + + engine: Engine = engine_from_credentials( # type: ignore[name-defined] + credentials, + may_dispose_after_use=False, + pool_size=1, # Only one connection in the pool + max_overflow=0, # No additional connections beyond the pool size + pool_timeout=30, # Time to wait for a connection to be available + pool_recycle=-1, # Disable automatic connection recycling + pool_pre_ping=True, # Test the connection before using it (optional) + ) + engine.execution_options(stream_results=True, max_row_buffer=2 * 50000) + setattr(engine, "rep_conn", rep_conn) # noqa + + @event.listens_for(engine, "begin") + def on_begin(conn: ConnectionSqla) -> None: + cur = conn.connection.cursor() + if slot is None: + # Using the same isolation level that pg_backup uses + cur.execute( + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE, READ ONLY, 
DEFERRABLE" ) - for table_name in table_names - ] - # # need separate session to read the snapshot: https://stackoverflow.com/q/75852587 - # cur_snap = _get_conn(credentials).cursor() - # snapshot_table_names = [ - # persist_snapshot_table( - # snapshot_name=slot["snapshot_name"], - # table_name=table_name, - # schema_name=schema, - # cur=cur_snap, - # include_columns=( - # None - # if include_columns is None - # else include_columns.get(table_name) - # ), - # ) - # for table_name in table_names - # ] - # snapshot_table_resources = [ - # snapshot_table_resource( - # snapshot_table_name=snapshot_table_name, - # schema_name=schema, - # table_name=table_name, - # write_disposition="merge", # FIXME Change later - # columns=None if columns is None else columns.get(table_name), - # credentials=credentials, - # ) - # for table_name, snapshot_table_name in zip( - # table_names, snapshot_table_names - # ) - # ] - return snapshot_resources - return None + else: + cur.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ") + cur.execute(f"SET TRANSACTION SNAPSHOT '{slot['snapshot_name']}'") + + snapshot_resources = [ + sql_table( # type: ignore[name-defined] + credentials=engine, + table=table_name, + schema=schema, + included_columns=include_columns.get(table_name), + ) + for table_name in table_names + ] + return snapshot_resources @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) @@ -219,62 +207,62 @@ def drop_replication_slot(name: str, cur: ReplicationCursor) -> None: ) -def persist_snapshot_table( - snapshot_name: str, - table_name: str, - schema_name: str, - cur: cursor, - include_columns: Optional[Sequence[str]] = None, -) -> str: - """Persists exported snapshot table state. - - Reads snapshot table content and copies it into new table. - """ - col_str = "*" - if include_columns is not None: - col_str = ", ".join(map(escape_postgres_identifier, include_columns)) - snapshot_table_name = f"{table_name}_snapshot_{snapshot_name}" - snapshot_qual_name = _make_qualified_table_name(snapshot_table_name, schema_name) - qual_name = _make_qualified_table_name(table_name, schema_name) - cur.execute( - f""" - START TRANSACTION ISOLATION LEVEL REPEATABLE READ; - SET TRANSACTION SNAPSHOT '{snapshot_name}'; - CREATE TABLE {snapshot_qual_name} AS SELECT {col_str} FROM {qual_name}; - """ - ) - cur.connection.commit() - logger.info(f"Successfully persisted snapshot table state in {snapshot_qual_name}.") - return snapshot_table_name - - -def snapshot_table_resource( - snapshot_table_name: str, - schema_name: str, - table_name: str, - write_disposition: TWriteDisposition, - columns: TTableSchemaColumns = None, - credentials: ConnectionStringCredentials = dlt.secrets.value, -) -> DltResource: - """Returns a resource for a persisted snapshot table. - - Can be used to perform an initial load of the table, so all data that - existed in the table prior to initializing replication is also captured. 
- """ - resource: DltResource = sql_table( # type: ignore[name-defined] - credentials=credentials, - table=snapshot_table_name, - schema=schema_name, - detect_precision_hints=True, - ) - primary_key = _get_pk(table_name, schema_name, credentials) - resource.apply_hints( - table_name=table_name, - write_disposition=write_disposition, - columns=columns, - primary_key=primary_key, - ) - return resource +# def persist_snapshot_table( +# snapshot_name: str, +# table_name: str, +# schema_name: str, +# cur: cursor, +# include_columns: Optional[Sequence[str]] = None, +# ) -> str: +# """Persists exported snapshot table state. +# +# Reads snapshot table content and copies it into new table. +# """ +# col_str = "*" +# if include_columns is not None: +# col_str = ", ".join(map(escape_postgres_identifier, include_columns)) +# snapshot_table_name = f"{table_name}_snapshot_{snapshot_name}" +# snapshot_qual_name = _make_qualified_table_name(snapshot_table_name, schema_name) +# qual_name = _make_qualified_table_name(table_name, schema_name) +# cur.execute( +# f""" +# START TRANSACTION ISOLATION LEVEL REPEATABLE READ; +# SET TRANSACTION SNAPSHOT '{snapshot_name}'; +# CREATE TABLE {snapshot_qual_name} AS SELECT {col_str} FROM {qual_name}; +# """ +# ) +# cur.connection.commit() +# logger.info(f"Successfully persisted snapshot table state in {snapshot_qual_name}.") +# return snapshot_table_name +# +# +# def snapshot_table_resource( +# snapshot_table_name: str, +# schema_name: str, +# table_name: str, +# write_disposition: TWriteDisposition, +# columns: TTableSchemaColumns = None, +# credentials: ConnectionStringCredentials = dlt.secrets.value, +# ) -> DltResource: +# """Returns a resource for a persisted snapshot table. +# +# Can be used to perform an initial load of the table, so all data that +# existed in the table prior to initializing replication is also captured. 
+# """ +# resource: DltResource = sql_table( # type: ignore[name-defined] +# credentials=credentials, +# table=snapshot_table_name, +# schema=schema_name, +# detect_precision_hints=True, +# ) +# primary_key = _get_pk(table_name, schema_name, credentials) +# resource.apply_hints( +# table_name=table_name, +# write_disposition=write_disposition, +# columns=columns, +# primary_key=primary_key, +# ) +# return resource def get_max_lsn( @@ -297,28 +285,28 @@ def get_max_lsn( return lsn -def get_pub_ops( - pub_name: str, - credentials: ConnectionStringCredentials, -) -> Dict[str, bool]: - """Returns dictionary of DML operations and their publish status.""" - cur = _get_conn(credentials).cursor() - cur.execute( - f""" - SELECT pubinsert, pubupdate, pubdelete, pubtruncate - FROM pg_publication WHERE pubname = '{pub_name}' - """ - ) - result = cur.fetchone() - cur.connection.close() - if result is None: - raise ValueError(f'Publication "{pub_name}" does not exist.') - return { - "insert": result[0], - "update": result[1], - "delete": result[2], - "truncate": result[3], - } +# def get_pub_ops( +# pub_name: str, +# credentials: ConnectionStringCredentials, +# ) -> Dict[str, bool]: +# """Returns dictionary of DML operations and their publish status.""" +# cur = _get_conn(credentials).cursor() +# cur.execute( +# f""" +# SELECT pubinsert, pubupdate, pubdelete, pubtruncate +# FROM pg_publication WHERE pubname = '{pub_name}' +# """ +# ) +# result = cur.fetchone() +# cur.connection.close() +# if result is None: +# raise ValueError(f'Publication "{pub_name}" does not exist.') +# return { +# "insert": result[0], +# "update": result[1], +# "delete": result[2], +# "truncate": result[3], +# } def lsn_int_to_hex(lsn: int) -> str: @@ -351,16 +339,17 @@ def _import_sql_table_resource() -> None: Raises error if `sql_database` source is not available. """ - global sql_table + global sql_table, engine_from_credentials try: - from ..sql_database import sql_table # type: ignore[import-untyped] - except Exception: + from ..sql_database import sql_table, engine_from_credentials # type: ignore[import-untyped] + except ImportError: try: - from sql_database import sql_table - except ImportError as e: - from .exceptions import SqlDatabaseSourceImportError - - raise SqlDatabaseSourceImportError from e + from dlt.sources.sql_database import sql_table, engine_from_credentials # type: ignore[import-not-found] + except ImportError: + try: + from sql_database import sql_table, engine_from_credentials + except ImportError as e: + raise SqlDatabaseSourceImportError from e def _get_conn( @@ -389,43 +378,43 @@ def _get_rep_conn( return _get_conn(credentials, LogicalReplicationConnection) # type: ignore[return-value] -def _make_qualified_table_name(table_name: str, schema_name: str) -> str: - """Escapes and combines a schema and table name.""" - return ( - escape_postgres_identifier(schema_name) - + "." - + escape_postgres_identifier(table_name) - ) - - -def _get_pk( - table_name: str, - schema_name: str, - credentials: ConnectionStringCredentials, -) -> Optional[TColumnNames]: - """Returns primary key column(s) for postgres table. - - Returns None if no primary key columns exist. 
- """ - qual_name = _make_qualified_table_name(table_name, schema_name) - cur = _get_conn(credentials).cursor() - # https://wiki.postgresql.org/wiki/Retrieve_primary_key_columns - cur.execute( - f""" - SELECT a.attname - FROM pg_index i - JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey) - WHERE i.indrelid = '{qual_name}'::regclass - AND i.indisprimary; - """ - ) - result = [tup[0] for tup in cur.fetchall()] - cur.connection.close() - if len(result) == 0: - return None - elif len(result) == 1: - return result[0] # type: ignore[no-any-return] - return result +# def _make_qualified_table_name(table_name: str, schema_name: str) -> str: +# """Escapes and combines a schema and table name.""" +# return ( +# escape_postgres_identifier(schema_name) +# + "." +# + escape_postgres_identifier(table_name) +# ) +# +# +# def _get_pk( +# table_name: str, +# schema_name: str, +# credentials: ConnectionStringCredentials, +# ) -> Optional[TColumnNames]: +# """Returns primary key column(s) for postgres table. +# +# Returns None if no primary key columns exist. +# """ +# qual_name = _make_qualified_table_name(table_name, schema_name) +# cur = _get_conn(credentials).cursor() +# # https://wiki.postgresql.org/wiki/Retrieve_primary_key_columns +# cur.execute( +# f""" +# SELECT a.attname +# FROM pg_index i +# JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey) +# WHERE i.indrelid = '{qual_name}'::regclass +# AND i.indisprimary; +# """ +# ) +# result = [tup[0] for tup in cur.fetchall()] +# cur.connection.close() +# if len(result) == 0: +# return None +# elif len(result) == 1: +# return result[0] # type: ignore[no-any-return] +# return result @dataclass @@ -535,7 +524,6 @@ def process_msg(self, msg: ReplicationMessage) -> None: """ row_msg = RowMessage() row_msg.ParseFromString(msg.payload) - from devtools import debug debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] op = row_msg.op diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 1842c51be..119b8d59a 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -53,31 +53,31 @@ def tbl_y(data): take_snapshots=True, ) - # changes = replication_resource(slot_name, pub_name) - # - # src_pl.run( - # [ - # tbl_x([{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}]), - # tbl_y({"id_y": 2, "val_y": False}), - # ] - # ) - # - # dest_pl = dlt.pipeline( - # pipeline_name="dest_pl", destination=destination_name, full_refresh=True - # ) - # - # # initial load - # info = dest_pl.run(snapshots) - # assert_load_info(info) - # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} - # exp_tbl_x = [{"id_x": 1, "val_x": "foo"}] - # exp_tbl_y = [{"id_y": 1, "val_y": True}] - # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") - # - # # process changes - # info = dest_pl.run(changes) - # assert_load_info(info) + changes = replication_resource(slot_name) + + src_pl.run( + [ + tbl_x([{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}]), + tbl_y({"id_y": 2, "val_y": False}), + ] + ) + + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, full_refresh=True + ) + + # initial load + info = dest_pl.run(snapshots) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == 
{"tbl_x": 1, "tbl_y": 1} + exp_tbl_x = [{"id_x": 1, "val_x": "foo"}] + exp_tbl_y = [{"id_y": 1, "val_y": True}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info) # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} # exp_tbl_x = [ # {"id_x": 1, "val_x": "foo"}, diff --git a/tests/postgres/postgresql.conf b/tests/postgres/postgresql.conf index a85b40a37..93a3dab5a 100644 --- a/tests/postgres/postgresql.conf +++ b/tests/postgres/postgresql.conf @@ -534,8 +534,8 @@ wal_level = logical # minimal, replica, or logical # actions running at least this number # of milliseconds. #log_checkpoints = off -#log_connections = off -#log_disconnections = off +log_connections = on +log_disconnections = on #log_duration = off #log_error_verbosity = default # terse, default, or verbose messages #log_hostname = off @@ -572,7 +572,7 @@ wal_level = logical # minimal, replica, or logical #log_parameter_max_length_on_error = 0 # when logging an error, limit logged # bind-parameter values to N bytes; # -1 means print in full, 0 disables -#log_statement = 'none' # none, ddl, mod, all +log_statement = 'all' # none, ddl, mod, all #log_replication_commands = off #log_temp_files = -1 # log temporary files equal or larger # than the specified size in kilobytes; From d303c04a691ff98516a66efc72932f22a9356260 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 9 Oct 2024 19:49:55 +0200 Subject: [PATCH 12/88] chore: simply cleaning up --- sources/pg_legacy_replication/helpers.py | 39 ++++++++++--------- .../pg_legacy_replication/requirements.txt | 3 +- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index ed57b01e1..14b93fe05 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -47,6 +47,8 @@ convert_pg_ts, ) +from sqlalchemy import Connection as ConnectionSqla, Engine, event + from .pg_logicaldec_pb2 import Op, RowMessage # type: ignore [attr-defined] from .schema_types import _to_dlt_column_schema, _to_dlt_val from .exceptions import SqlDatabaseSourceImportError @@ -118,28 +120,14 @@ def init_replication( drop_replication_slot(slot_name, rep_cur) slot = create_replication_slot(slot_name, rep_cur) + # Close connection if no snapshots are needed if not take_snapshots: rep_conn.close() return None + # Ensure `sqlalchemy` and `sql_table` are available _import_sql_table_resource() - # If this point is reached it means that sqlalchemy is available - from sqlalchemy import Connection as ConnectionSqla, Engine, event - - if include_columns is None: - include_columns = {} - - engine: Engine = engine_from_credentials( # type: ignore[name-defined] - credentials, - may_dispose_after_use=False, - pool_size=1, # Only one connection in the pool - max_overflow=0, # No additional connections beyond the pool size - pool_timeout=30, # Time to wait for a connection to be available - pool_recycle=-1, # Disable automatic connection recycling - pool_pre_ping=True, # Test the connection before using it (optional) - ) - engine.execution_options(stream_results=True, max_row_buffer=2 * 50000) - setattr(engine, "rep_conn", rep_conn) # noqa + engine = _configure_engine(credentials, rep_conn) @event.listens_for(engine, "begin") def on_begin(conn: ConnectionSqla) -> None: @@ -153,7 +141,8 @@ 
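The `on_begin` listener in the hunk below is what ties the snapshot exported at slot creation to the `sql_table` reads: every transaction the engine opens gets pinned to that snapshot before its first query runs. A minimal standalone sketch of the same pattern with plain psycopg2 (the connection string, schema, table and snapshot id are hypothetical; a real snapshot name is returned at slot creation and stays usable only while the replication connection that exported it is kept open and idle, which is why the code keeps `rep_conn` attached to the engine):

    import psycopg2

    conn = psycopg2.connect("dbname=dlt_data user=loader host=localhost")  # assumed DSN
    try:
        with conn.cursor() as cur:
            # Both SET statements must run before the first query of the transaction.
            cur.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ")
            cur.execute("SET TRANSACTION SNAPSHOT '00000003-00000149-1'")  # hypothetical snapshot id
            cur.execute("SELECT count(*) FROM my_schema.tbl_x")  # hypothetical table
            print(cur.fetchone()[0])  # row count as of slot creation, not current state
    finally:
        conn.rollback()
        conn.close()

The SQLAlchemy listener that follows does exactly this inside the "begin" event, so the snapshot applies to whichever pooled connection `sql_table` ends up using.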
def on_begin(conn: ConnectionSqla) -> None: cur.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ") cur.execute(f"SET TRANSACTION SNAPSHOT '{slot['snapshot_name']}'") - snapshot_resources = [ + include_columns = include_columns or {} + return [ sql_table( # type: ignore[name-defined] credentials=engine, table=table_name, @@ -162,7 +151,19 @@ def on_begin(conn: ConnectionSqla) -> None: ) for table_name in table_names ] - return snapshot_resources + + +def _configure_engine( + credentials: ConnectionStringCredentials, rep_conn: LogicalReplicationConnection +) -> Engine: + """ + Configures the SQLAlchemy engine. + Also attaches the replication connection in order to prevent it being garbage collected and closed. + """ + engine: Engine = engine_from_credentials(credentials, may_dispose_after_use=False) # type: ignore[name-defined] + engine.execution_options(stream_results=True, max_row_buffer=2 * 50000) + setattr(engine, "rep_conn", rep_conn) # noqa + return engine @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) diff --git a/sources/pg_legacy_replication/requirements.txt b/sources/pg_legacy_replication/requirements.txt index f2c2be351..432a62270 100644 --- a/sources/pg_legacy_replication/requirements.txt +++ b/sources/pg_legacy_replication/requirements.txt @@ -1,3 +1,4 @@ dlt>=0.4.13 psycopg2-binary>=2.9.9 -protobuf>=5 \ No newline at end of file +protobuf>=5 +sqlalchemy>=1.4 \ No newline at end of file From 6566fe4cc2b4ca759ded5bc73b89b327e42e1923 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 9 Oct 2024 22:23:24 +0200 Subject: [PATCH 13/88] chore: need to find a better way to clean up the underlying engine --- sources/pg_legacy_replication/helpers.py | 134 ++---------------- .../test_pg_replication.py | 7 +- 2 files changed, 19 insertions(+), 122 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 14b93fe05..1d0433fb2 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -163,9 +163,22 @@ def _configure_engine( engine: Engine = engine_from_credentials(credentials, may_dispose_after_use=False) # type: ignore[name-defined] engine.execution_options(stream_results=True, max_row_buffer=2 * 50000) setattr(engine, "rep_conn", rep_conn) # noqa + + @event.listens_for(engine, "engine_disposed") + def on_engine_disposed(engine: Engine) -> None: + delattr(engine, "rep_conn") + return engine +def cleanup_snapshot_resources(snapshots: List[DltResource]) -> None: + """FIXME Awful hack to release the underlying SQL engine when snapshotting tables""" + if not snapshots: + return + engine: Engine = snapshots[0]._explicit_args["credentials"] + engine.dispose() + + @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) def get_pg_version( cur: cursor = None, @@ -208,64 +221,6 @@ def drop_replication_slot(name: str, cur: ReplicationCursor) -> None: ) -# def persist_snapshot_table( -# snapshot_name: str, -# table_name: str, -# schema_name: str, -# cur: cursor, -# include_columns: Optional[Sequence[str]] = None, -# ) -> str: -# """Persists exported snapshot table state. -# -# Reads snapshot table content and copies it into new table. 
-# """ -# col_str = "*" -# if include_columns is not None: -# col_str = ", ".join(map(escape_postgres_identifier, include_columns)) -# snapshot_table_name = f"{table_name}_snapshot_{snapshot_name}" -# snapshot_qual_name = _make_qualified_table_name(snapshot_table_name, schema_name) -# qual_name = _make_qualified_table_name(table_name, schema_name) -# cur.execute( -# f""" -# START TRANSACTION ISOLATION LEVEL REPEATABLE READ; -# SET TRANSACTION SNAPSHOT '{snapshot_name}'; -# CREATE TABLE {snapshot_qual_name} AS SELECT {col_str} FROM {qual_name}; -# """ -# ) -# cur.connection.commit() -# logger.info(f"Successfully persisted snapshot table state in {snapshot_qual_name}.") -# return snapshot_table_name -# -# -# def snapshot_table_resource( -# snapshot_table_name: str, -# schema_name: str, -# table_name: str, -# write_disposition: TWriteDisposition, -# columns: TTableSchemaColumns = None, -# credentials: ConnectionStringCredentials = dlt.secrets.value, -# ) -> DltResource: -# """Returns a resource for a persisted snapshot table. -# -# Can be used to perform an initial load of the table, so all data that -# existed in the table prior to initializing replication is also captured. -# """ -# resource: DltResource = sql_table( # type: ignore[name-defined] -# credentials=credentials, -# table=snapshot_table_name, -# schema=schema_name, -# detect_precision_hints=True, -# ) -# primary_key = _get_pk(table_name, schema_name, credentials) -# resource.apply_hints( -# table_name=table_name, -# write_disposition=write_disposition, -# columns=columns, -# primary_key=primary_key, -# ) -# return resource - - def get_max_lsn( slot_name: str, credentials: ConnectionStringCredentials, @@ -286,30 +241,6 @@ def get_max_lsn( return lsn -# def get_pub_ops( -# pub_name: str, -# credentials: ConnectionStringCredentials, -# ) -> Dict[str, bool]: -# """Returns dictionary of DML operations and their publish status.""" -# cur = _get_conn(credentials).cursor() -# cur.execute( -# f""" -# SELECT pubinsert, pubupdate, pubdelete, pubtruncate -# FROM pg_publication WHERE pubname = '{pub_name}' -# """ -# ) -# result = cur.fetchone() -# cur.connection.close() -# if result is None: -# raise ValueError(f'Publication "{pub_name}" does not exist.') -# return { -# "insert": result[0], -# "update": result[1], -# "delete": result[2], -# "truncate": result[3], -# } - - def lsn_int_to_hex(lsn: int) -> str: """Convert integer LSN to postgres' hexadecimal representation.""" # https://stackoverflow.com/questions/66797767/lsn-external-representation. @@ -379,45 +310,6 @@ def _get_rep_conn( return _get_conn(credentials, LogicalReplicationConnection) # type: ignore[return-value] -# def _make_qualified_table_name(table_name: str, schema_name: str) -> str: -# """Escapes and combines a schema and table name.""" -# return ( -# escape_postgres_identifier(schema_name) -# + "." -# + escape_postgres_identifier(table_name) -# ) -# -# -# def _get_pk( -# table_name: str, -# schema_name: str, -# credentials: ConnectionStringCredentials, -# ) -> Optional[TColumnNames]: -# """Returns primary key column(s) for postgres table. -# -# Returns None if no primary key columns exist. 
-# """ -# qual_name = _make_qualified_table_name(table_name, schema_name) -# cur = _get_conn(credentials).cursor() -# # https://wiki.postgresql.org/wiki/Retrieve_primary_key_columns -# cur.execute( -# f""" -# SELECT a.attname -# FROM pg_index i -# JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey) -# WHERE i.indrelid = '{qual_name}'::regclass -# AND i.indisprimary; -# """ -# ) -# result = [tup[0] for tup in cur.fetchall()] -# cur.connection.close() -# if len(result) == 0: -# return None -# elif len(result) == 1: -# return result[0] # type: ignore[no-any-return] -# return result - - @dataclass class ItemGenerator: credentials: ConnectionStringCredentials diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 119b8d59a..b52a87bfd 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -14,7 +14,11 @@ get_table_metrics, ) from sources.pg_legacy_replication import replication_resource -from sources.pg_legacy_replication.helpers import init_replication, get_pg_version +from sources.pg_legacy_replication.helpers import ( + init_replication, + get_pg_version, + cleanup_snapshot_resources, +) from sources.pg_legacy_replication.exceptions import ( IncompatiblePostgresVersionException, ) @@ -74,6 +78,7 @@ def tbl_y(data): exp_tbl_y = [{"id_y": 1, "val_y": True}] assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + cleanup_snapshot_resources(snapshots) # process changes info = dest_pl.run(changes) From 70d40a09d2703e3c2168408d99138508239122f0 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 11 Oct 2024 11:36:08 +0200 Subject: [PATCH 14/88] wip: handling begin/commit --- sources/pg_legacy_replication/decoders.py | 131 ++++++++++-------- sources/pg_legacy_replication/helpers.py | 10 +- sources/pg_legacy_replication/schema_types.py | 10 -- .../test_schema_types.py | 22 +-- 4 files changed, 92 insertions(+), 81 deletions(-) diff --git a/sources/pg_legacy_replication/decoders.py b/sources/pg_legacy_replication/decoders.py index c2707b46a..531ed23c8 100644 --- a/sources/pg_legacy_replication/decoders.py +++ b/sources/pg_legacy_replication/decoders.py @@ -7,6 +7,7 @@ from dataclasses import dataclass from datetime import datetime, timedelta, timezone from typing import List, Optional, Union +from .pg_logicaldec_pb2 import Op, RowMessage # type: ignore [attr-defined] # integer byte lengths INT8 = 1 @@ -132,64 +133,86 @@ def read_tuple_data(self) -> TupleData: return TupleData(n_columns=n_columns, column_data=column_data) -class Begin(PgoutputMessage): - """ - https://pgpedia.info/x/xlogrecptr.html - https://www.postgresql.org/docs/14/datatype-pg-lsn.html - - byte1 Byte1('B') Identifies the message as a begin message. - lsn Int64 The final LSN of the transaction. - commit_tx_ts Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). - tx_xid Int32 Xid of the transaction. 
- """ - - byte1: str - lsn: int +@dataclass +class Begin: + transaction_id: int commit_ts: datetime - tx_xid: int - - def decode_buffer(self) -> None: - if self.byte1 != "B": - raise ValueError("first byte in buffer does not match Begin message") - self.lsn = self.read_int64() - self.commit_ts = self.read_timestamp() - self.tx_xid = self.read_int64() - - def __repr__(self) -> str: - return ( - f"BEGIN \n\tbyte1: '{self.byte1}', \n\tLSN: {self.lsn}, " - f"\n\tcommit_ts {self.commit_ts}, \n\ttx_xid: {self.tx_xid}" - ) - -class Commit(PgoutputMessage): - """ - byte1: Byte1('C') Identifies the message as a commit message. - flags: Int8 Flags; currently unused (must be 0). - lsn_commit: Int64 The LSN of the commit. - lsn: Int64 The end LSN of the transaction. - Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). - """ - - byte1: str - flags: int - lsn_commit: int - lsn: int + def __init__(self, row_msg: RowMessage): + assert row_msg.op == Op.BEGIN + self.transaction_id = row_msg.transaction_id + self.commit_ts = convert_pg_ts(row_msg.commit_time) + + +# class Begin(PgoutputMessage): +# """ +# https://pgpedia.info/x/xlogrecptr.html +# https://www.postgresql.org/docs/14/datatype-pg-lsn.html +# +# byte1 Byte1('B') Identifies the message as a begin message. +# lsn Int64 The final LSN of the transaction. +# commit_tx_ts Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). +# tx_xid Int32 Xid of the transaction. +# """ +# +# byte1: str +# lsn: int +# commit_ts: datetime +# tx_xid: int +# +# def decode_buffer(self) -> None: +# if self.byte1 != "B": +# raise ValueError("first byte in buffer does not match Begin message") +# self.lsn = self.read_int64() +# self.commit_ts = self.read_timestamp() +# self.tx_xid = self.read_int64() +# +# def __repr__(self) -> str: +# return ( +# f"BEGIN \n\tbyte1: '{self.byte1}', \n\tLSN: {self.lsn}, " +# f"\n\tcommit_ts {self.commit_ts}, \n\ttx_xid: {self.tx_xid}" +# ) + + +@dataclass +class Commit: + transaction_id: int commit_ts: datetime - def decode_buffer(self) -> None: - if self.byte1 != "C": - raise ValueError("first byte in buffer does not match Commit message") - self.flags = self.read_int8() - self.lsn_commit = self.read_int64() - self.lsn = self.read_int64() - self.commit_ts = self.read_timestamp() - - def __repr__(self) -> str: - return ( - f"COMMIT \n\tbyte1: {self.byte1}, \n\tflags {self.flags}, \n\tlsn_commit: {self.lsn_commit}" - f"\n\tLSN: {self.lsn}, \n\tcommit_ts {self.commit_ts}" - ) + def __init__(self, row_msg: RowMessage): + assert row_msg.op == Op.COMMIT + self.transaction_id = row_msg.transaction_id + self.commit_ts = convert_pg_ts(row_msg.commit_time) + + +# class Commit(PgoutputMessage): +# """ +# byte1: Byte1('C') Identifies the message as a commit message. +# flags: Int8 Flags; currently unused (must be 0). +# lsn_commit: Int64 The LSN of the commit. +# lsn: Int64 The end LSN of the transaction. +# Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). 
+# """ +# +# byte1: str +# flags: int +# lsn_commit: int +# lsn: int +# commit_ts: datetime +# +# def decode_buffer(self) -> None: +# if self.byte1 != "C": +# raise ValueError("first byte in buffer does not match Commit message") +# self.flags = self.read_int8() +# self.lsn_commit = self.read_int64() +# self.lsn = self.read_int64() +# self.commit_ts = self.read_timestamp() +# +# def __repr__(self) -> str: +# return ( +# f"COMMIT \n\tbyte1: {self.byte1}, \n\tflags {self.flags}, \n\tlsn_commit: {self.lsn_commit}" +# f"\n\tLSN: {self.lsn}, \n\tcommit_ts {self.commit_ts}" +# ) class Origin: diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 1d0433fb2..fd085c776 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -415,13 +415,15 @@ def process_msg(self, msg: ReplicationMessage) -> None: - `target_batch_size` is reached - a table's schema has changed """ + debug(msg) row_msg = RowMessage() row_msg.ParseFromString(msg.payload) - debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] op = row_msg.op if op == Op.BEGIN: - self.last_commit_ts = convert_pg_ts(row_msg.commit_time) # type: ignore[assignment] + self.last_commit_ts = Begin(row_msg).commit_ts # type: ignore[assignment] + elif op == Op.COMMIT: + self.process_commit(msg) # if op == Op.UPDATE: # self.process_change(row_msg) # op = msg.payload[:1] @@ -431,10 +433,6 @@ def process_msg(self, msg: ReplicationMessage) -> None: # self.process_change(Update(msg.payload), msg.data_start) # elif op == b"D": # self.process_change(Delete(msg.payload), msg.data_start) - # elif op == b"B": - # self.last_commit_ts = Begin(msg.payload).commit_ts # type: ignore[assignment] - # elif op == b"C": - # self.process_commit(msg) # elif op == b"R": # self.process_relation(Relation(msg.payload)) # elif op == b"T": diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index c03ed7afb..e302859c2 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -138,16 +138,6 @@ def _to_dlt_val(val: str, data_type: TDataType, byte1: str, for_delete: bool) -> def _extract_table_schema(row_msg: RowMessage) -> TTableSchema: schema_name, table_name = row_msg.table.split(".") - # Remove leading and trailing quotes - table_name = table_name[1:-1] - import re - - regex = r"^(?P[a-zA-Z_][a-zA-Z0-9_]{0,62})_snapshot_(?P[a-zA-Z0-9_-]+)$" - match = re.match(regex, table_name) - if match: - table_name = match.group("table_name") - snapshot_name = match.group("snapshot_name") - print(f"Table name: {table_name}, Snapshot name: {snapshot_name}") columns: TTableSchemaColumns = {} for c, c_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): diff --git a/tests/pg_legacy_replication/test_schema_types.py b/tests/pg_legacy_replication/test_schema_types.py index 1421a60bb..d4cc68b5d 100644 --- a/tests/pg_legacy_replication/test_schema_types.py +++ b/tests/pg_legacy_replication/test_schema_types.py @@ -9,21 +9,21 @@ def test_extract_table_schema(): row_msg = RowMessage() - row_msg.table = ( - 'src_pl_dataset_202410010746423478."tbl_x_snapshot_00000003-00000149-1"' - ) + row_msg.transaction_id = 818 + row_msg.commit_time = 1728637822344316 + row_msg.table = "src_pl_dataset_202410110910185374_staging.tbl_x" row_msg.op = Op.INSERT row_msg.new_tuple.extend( [ - DatumMessage(column_name="id_x", column_type=20, datum_int64=1), - 
DatumMessage(column_name="val_x", column_type=1043, datum_string="foo"), + DatumMessage(column_name="id_x", column_type=20, datum_int64=2), + DatumMessage(column_name="val_x", column_type=1043, datum_string="bar"), DatumMessage( column_name="_dlt_load_id", column_type=1043, - datum_string="1727812002.3873408", + datum_string="1728637821.2016037", ), DatumMessage( - column_name="_dlt_id", column_type=1043, datum_string="EVvtapNpxpWbqA" + column_name="_dlt_id", column_type=1043, datum_string="q52p9Y2Ac5ZXaA" ), ] ) @@ -31,8 +31,8 @@ def test_extract_table_schema(): [ TypeInfo(modifier="bigint"), TypeInfo(modifier="character varying", value_optional=True), - TypeInfo(modifier="character varying", value_optional=True), - TypeInfo(modifier="character varying", value_optional=True), + TypeInfo(modifier="character varying"), + TypeInfo(modifier="character varying"), ] ) assert _extract_table_schema(row_msg) == { @@ -52,12 +52,12 @@ def test_extract_table_schema(): "_dlt_load_id": { "data_type": "text", "name": "_dlt_load_id", - "nullable": True, + "nullable": False, }, "_dlt_id": { "data_type": "text", "name": "_dlt_id", - "nullable": True, + "nullable": False, }, }, } From f70343169925619be60a333dd88bd5e839a51dd5 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 11 Oct 2024 18:08:45 +0200 Subject: [PATCH 15/88] wip: saving work --- pyproject.toml | 1 + sources/pg_legacy_replication/decoders.py | 131 ++++++-------- sources/pg_legacy_replication/helpers.py | 113 +++++++++--- .../pg_legacy_replication/pg_logicaldec.proto | 50 ++++++ .../pg_logicaldec_pb2.pyi | 166 ++++++++++++++++++ sources/pg_legacy_replication/schema_types.py | 53 ++---- tests/pg_legacy_replication/test_helpers.py | 81 +++++++++ 7 files changed, 450 insertions(+), 145 deletions(-) create mode 100644 sources/pg_legacy_replication/pg_logicaldec.proto create mode 100644 sources/pg_legacy_replication/pg_logicaldec_pb2.pyi create mode 100644 tests/pg_legacy_replication/test_helpers.py diff --git a/pyproject.toml b/pyproject.toml index 02feaca1f..60f6a965c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ pendulum = "^3.0.0" types-protobuf = "^5.27.0.20240907" devtools = "^0.12.2" pytest-cov = "^5.0.0" +mypy-protobuf = "^3.6.0" [tool.poetry.group.sql_database.dependencies] sqlalchemy = ">=1.4" diff --git a/sources/pg_legacy_replication/decoders.py b/sources/pg_legacy_replication/decoders.py index 531ed23c8..c2707b46a 100644 --- a/sources/pg_legacy_replication/decoders.py +++ b/sources/pg_legacy_replication/decoders.py @@ -7,7 +7,6 @@ from dataclasses import dataclass from datetime import datetime, timedelta, timezone from typing import List, Optional, Union -from .pg_logicaldec_pb2 import Op, RowMessage # type: ignore [attr-defined] # integer byte lengths INT8 = 1 @@ -133,86 +132,64 @@ def read_tuple_data(self) -> TupleData: return TupleData(n_columns=n_columns, column_data=column_data) -@dataclass -class Begin: - transaction_id: int +class Begin(PgoutputMessage): + """ + https://pgpedia.info/x/xlogrecptr.html + https://www.postgresql.org/docs/14/datatype-pg-lsn.html + + byte1 Byte1('B') Identifies the message as a begin message. + lsn Int64 The final LSN of the transaction. + commit_tx_ts Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). + tx_xid Int32 Xid of the transaction. 
+ """ + + byte1: str + lsn: int commit_ts: datetime + tx_xid: int + + def decode_buffer(self) -> None: + if self.byte1 != "B": + raise ValueError("first byte in buffer does not match Begin message") + self.lsn = self.read_int64() + self.commit_ts = self.read_timestamp() + self.tx_xid = self.read_int64() + + def __repr__(self) -> str: + return ( + f"BEGIN \n\tbyte1: '{self.byte1}', \n\tLSN: {self.lsn}, " + f"\n\tcommit_ts {self.commit_ts}, \n\ttx_xid: {self.tx_xid}" + ) - def __init__(self, row_msg: RowMessage): - assert row_msg.op == Op.BEGIN - self.transaction_id = row_msg.transaction_id - self.commit_ts = convert_pg_ts(row_msg.commit_time) - - -# class Begin(PgoutputMessage): -# """ -# https://pgpedia.info/x/xlogrecptr.html -# https://www.postgresql.org/docs/14/datatype-pg-lsn.html -# -# byte1 Byte1('B') Identifies the message as a begin message. -# lsn Int64 The final LSN of the transaction. -# commit_tx_ts Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). -# tx_xid Int32 Xid of the transaction. -# """ -# -# byte1: str -# lsn: int -# commit_ts: datetime -# tx_xid: int -# -# def decode_buffer(self) -> None: -# if self.byte1 != "B": -# raise ValueError("first byte in buffer does not match Begin message") -# self.lsn = self.read_int64() -# self.commit_ts = self.read_timestamp() -# self.tx_xid = self.read_int64() -# -# def __repr__(self) -> str: -# return ( -# f"BEGIN \n\tbyte1: '{self.byte1}', \n\tLSN: {self.lsn}, " -# f"\n\tcommit_ts {self.commit_ts}, \n\ttx_xid: {self.tx_xid}" -# ) - - -@dataclass -class Commit: - transaction_id: int + +class Commit(PgoutputMessage): + """ + byte1: Byte1('C') Identifies the message as a commit message. + flags: Int8 Flags; currently unused (must be 0). + lsn_commit: Int64 The LSN of the commit. + lsn: Int64 The end LSN of the transaction. + Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). + """ + + byte1: str + flags: int + lsn_commit: int + lsn: int commit_ts: datetime - def __init__(self, row_msg: RowMessage): - assert row_msg.op == Op.COMMIT - self.transaction_id = row_msg.transaction_id - self.commit_ts = convert_pg_ts(row_msg.commit_time) - - -# class Commit(PgoutputMessage): -# """ -# byte1: Byte1('C') Identifies the message as a commit message. -# flags: Int8 Flags; currently unused (must be 0). -# lsn_commit: Int64 The LSN of the commit. -# lsn: Int64 The end LSN of the transaction. -# Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). 
-# """ -# -# byte1: str -# flags: int -# lsn_commit: int -# lsn: int -# commit_ts: datetime -# -# def decode_buffer(self) -> None: -# if self.byte1 != "C": -# raise ValueError("first byte in buffer does not match Commit message") -# self.flags = self.read_int8() -# self.lsn_commit = self.read_int64() -# self.lsn = self.read_int64() -# self.commit_ts = self.read_timestamp() -# -# def __repr__(self) -> str: -# return ( -# f"COMMIT \n\tbyte1: {self.byte1}, \n\tflags {self.flags}, \n\tlsn_commit: {self.lsn_commit}" -# f"\n\tLSN: {self.lsn}, \n\tcommit_ts {self.commit_ts}" -# ) + def decode_buffer(self) -> None: + if self.byte1 != "C": + raise ValueError("first byte in buffer does not match Commit message") + self.flags = self.read_int8() + self.lsn_commit = self.read_int64() + self.lsn = self.read_int64() + self.commit_ts = self.read_timestamp() + + def __repr__(self) -> str: + return ( + f"COMMIT \n\tbyte1: {self.byte1}, \n\tflags {self.flags}, \n\tlsn_commit: {self.lsn_commit}" + f"\n\tLSN: {self.lsn}, \n\tcommit_ts {self.commit_ts}" + ) class Origin: diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index fd085c776..6089e8d56 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -49,7 +49,7 @@ from sqlalchemy import Connection as ConnectionSqla, Engine, event -from .pg_logicaldec_pb2 import Op, RowMessage # type: ignore [attr-defined] +from .pg_logicaldec_pb2 import Op, RowMessage from .schema_types import _to_dlt_column_schema, _to_dlt_val from .exceptions import SqlDatabaseSourceImportError from google.protobuf.json_format import MessageToDict @@ -421,9 +421,27 @@ def process_msg(self, msg: ReplicationMessage) -> None: debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] op = row_msg.op if op == Op.BEGIN: - self.last_commit_ts = Begin(row_msg).commit_ts # type: ignore[assignment] + self.last_commit_ts = convert_pg_ts(row_msg.commit_time) # type: ignore[assignment] elif op == Op.COMMIT: self.process_commit(msg) + # elif op == Op.INSERT: + # column_data = decoded_msg.new_tuple.column_data + # table_name = row_msg.table + # data_item = self.gen_data_item( + # data=column_data, + # column_schema=self.last_table_schema[decoded_msg.relation_id][ + # "columns" + # ], + # lsn=msg.data_start, + # commit_ts=convert_pg_ts(row_msg.commit_time), + # for_delete=False, + # include_columns=( + # None + # if self.include_columns is None + # else self.include_columns.get(table_name) + # ), + # ) + # self.data_items[decoded_msg.relation_id].append(data_item) # if op == Op.UPDATE: # self.process_change(row_msg) # op = msg.payload[:1] @@ -523,31 +541,31 @@ def process_relation(self, decoded_msg: Relation) -> None: ) self.data_items[decoded_msg.relation_id] = [meta_item] - def process_change( - self, decoded_msg: Union[Insert, Update, Delete], msg_start_lsn: int - ) -> None: - """Processes replication message of type Insert, Update, or Delete. - - Adds data item for inserted/updated/deleted record to instance attribute. 
- """ - if isinstance(decoded_msg, (Insert, Update)): - column_data = decoded_msg.new_tuple.column_data - elif isinstance(decoded_msg, Delete): - column_data = decoded_msg.old_tuple.column_data - table_name = self.last_table_schema[decoded_msg.relation_id]["name"] - data_item = self.gen_data_item( - data=column_data, - column_schema=self.last_table_schema[decoded_msg.relation_id]["columns"], - lsn=msg_start_lsn, - commit_ts=self.last_commit_ts, - for_delete=isinstance(decoded_msg, Delete), - include_columns=( - None - if self.include_columns is None - else self.include_columns.get(table_name) - ), - ) - self.data_items[decoded_msg.relation_id].append(data_item) + # def process_change( + # self, decoded_msg: Union[Insert, Update, Delete], msg_start_lsn: int + # ) -> None: + # """Processes replication message of type Insert, Update, or Delete. + # + # Adds data item for inserted/updated/deleted record to instance attribute. + # """ + # if isinstance(decoded_msg, (Insert, Update)): + # column_data = decoded_msg.new_tuple.column_data + # elif isinstance(decoded_msg, Delete): + # column_data = decoded_msg.old_tuple.column_data + # table_name = self.last_table_schema[decoded_msg.relation_id]["name"] + # data_item = self.gen_data_item( + # data=column_data, + # column_schema=self.last_table_schema[decoded_msg.relation_id]["columns"], + # lsn=msg_start_lsn, + # commit_ts=self.last_commit_ts, + # for_delete=isinstance(decoded_msg, Delete), + # include_columns=( + # None + # if self.include_columns is None + # else self.include_columns.get(table_name) + # ), + # ) + # self.data_items[decoded_msg.relation_id].append(data_item) @staticmethod def gen_data_item( @@ -573,3 +591,44 @@ def gen_data_item( if for_delete: data_item["deleted_ts"] = commit_ts return data_item + + +# FIXME Refactor later +from .schema_types import _PG_TYPES, _type_mapper +from dlt.common.schema.typing import TColumnType, TColumnSchema + +_DATUM_PRECISIONS: Dict[str, int] = { + "datum_int32": 32, + "datum_int64": 64, + "datum_float": 32, + "datum_double": 64, +} +"""TODO: Add comment here""" + + +def extract_table_schema(row_msg: RowMessage) -> TTableSchema: + schema_name, table_name = row_msg.table.split(".") + + columns: TTableSchemaColumns = {} + for c, c_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): + assert _PG_TYPES[c.column_type] == c_info.modifier + col_type: TColumnType = _type_mapper().from_db_type(c_info.modifier) + col_schema: TColumnSchema = { + "name": c.column_name, + "nullable": c_info.value_optional, + **col_type, + } + + precision = _DATUM_PRECISIONS.get(c.WhichOneof("datum")) + if precision is not None: + col_schema["precision"] = precision + + columns[c.column_name] = col_schema + + return {"name": table_name, "columns": columns} + + +def gen_data_item( + row_msg: RowMessage, lsn: int, include_columns: Optional[Sequence[str]] = None +) -> TDataItem: + pass diff --git a/sources/pg_legacy_replication/pg_logicaldec.proto b/sources/pg_legacy_replication/pg_logicaldec.proto new file mode 100644 index 000000000..43371f5a8 --- /dev/null +++ b/sources/pg_legacy_replication/pg_logicaldec.proto @@ -0,0 +1,50 @@ +package decoderbufs; + +option java_package="io.debezium.connector.postgresql.proto"; +option java_outer_classname = "PgProto"; +option optimize_for = SPEED; + +enum Op { + UNKNOWN = -1; + INSERT = 0; + UPDATE = 1; + DELETE = 2; + BEGIN = 3; + COMMIT = 4; +} + +message Point { + required double x = 1; + required double y = 2; +} + +message DatumMessage { + optional string column_name = 1; + optional 
int64 column_type = 2; + oneof datum { + int32 datum_int32 = 3; + int64 datum_int64 = 4; + float datum_float = 5; + double datum_double = 6; + bool datum_bool = 7; + string datum_string = 8; + bytes datum_bytes = 9; + Point datum_point = 10; + bool datum_missing = 11; + } +} + +message TypeInfo { + required string modifier = 1; + required bool value_optional = 2; +} + +message RowMessage { + optional uint32 transaction_id = 1; + optional uint64 commit_time = 2; + optional string table = 3; + optional Op op = 4; + repeated DatumMessage new_tuple = 5; + repeated DatumMessage old_tuple = 6; + repeated TypeInfo new_typeinfo = 7; +} diff --git a/sources/pg_legacy_replication/pg_logicaldec_pb2.pyi b/sources/pg_legacy_replication/pg_logicaldec_pb2.pyi new file mode 100644 index 000000000..abd25bf22 --- /dev/null +++ b/sources/pg_legacy_replication/pg_logicaldec_pb2.pyi @@ -0,0 +1,166 @@ +""" +@generated by mypy-protobuf. Do not edit manually! +isort:skip_file +""" + +import builtins +import collections.abc +import google.protobuf.descriptor +import google.protobuf.internal.containers +import google.protobuf.internal.enum_type_wrapper +import google.protobuf.message +import sys +import typing + +if sys.version_info >= (3, 10): + import typing as typing_extensions +else: + import typing_extensions + +DESCRIPTOR: google.protobuf.descriptor.FileDescriptor + +class _Op: + ValueType = typing.NewType("ValueType", builtins.int) + V: typing_extensions.TypeAlias = ValueType + +class _OpEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._EnumTypeWrapper[_Op.ValueType], builtins.type): + DESCRIPTOR: google.protobuf.descriptor.EnumDescriptor + UNKNOWN: _Op.ValueType # -1 + INSERT: _Op.ValueType # 0 + UPDATE: _Op.ValueType # 1 + DELETE: _Op.ValueType # 2 + BEGIN: _Op.ValueType # 3 + COMMIT: _Op.ValueType # 4 + +class Op(_Op, metaclass=_OpEnumTypeWrapper): ... + +UNKNOWN: Op.ValueType # -1 +INSERT: Op.ValueType # 0 +UPDATE: Op.ValueType # 1 +DELETE: Op.ValueType # 2 +BEGIN: Op.ValueType # 3 +COMMIT: Op.ValueType # 4 +global___Op = Op + +@typing.final +class Point(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + X_FIELD_NUMBER: builtins.int + Y_FIELD_NUMBER: builtins.int + x: builtins.float + y: builtins.float + def __init__( + self, + *, + x: builtins.float | None = ..., + y: builtins.float | None = ..., + ) -> None: ... + def HasField(self, field_name: typing.Literal["x", b"x", "y", b"y"]) -> builtins.bool: ... + def ClearField(self, field_name: typing.Literal["x", b"x", "y", b"y"]) -> None: ... + +global___Point = Point + +@typing.final +class DatumMessage(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + COLUMN_NAME_FIELD_NUMBER: builtins.int + COLUMN_TYPE_FIELD_NUMBER: builtins.int + DATUM_INT32_FIELD_NUMBER: builtins.int + DATUM_INT64_FIELD_NUMBER: builtins.int + DATUM_FLOAT_FIELD_NUMBER: builtins.int + DATUM_DOUBLE_FIELD_NUMBER: builtins.int + DATUM_BOOL_FIELD_NUMBER: builtins.int + DATUM_STRING_FIELD_NUMBER: builtins.int + DATUM_BYTES_FIELD_NUMBER: builtins.int + DATUM_POINT_FIELD_NUMBER: builtins.int + DATUM_MISSING_FIELD_NUMBER: builtins.int + column_name: builtins.str + column_type: builtins.int + datum_int32: builtins.int + datum_int64: builtins.int + datum_float: builtins.float + datum_double: builtins.float + datum_bool: builtins.bool + datum_string: builtins.str + datum_bytes: builtins.bytes + datum_missing: builtins.bool + @property + def datum_point(self) -> global___Point: ... 
+ def __init__( + self, + *, + column_name: builtins.str | None = ..., + column_type: builtins.int | None = ..., + datum_int32: builtins.int | None = ..., + datum_int64: builtins.int | None = ..., + datum_float: builtins.float | None = ..., + datum_double: builtins.float | None = ..., + datum_bool: builtins.bool | None = ..., + datum_string: builtins.str | None = ..., + datum_bytes: builtins.bytes | None = ..., + datum_point: global___Point | None = ..., + datum_missing: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing.Literal["column_name", b"column_name", "column_type", b"column_type", "datum", b"datum", "datum_bool", b"datum_bool", "datum_bytes", b"datum_bytes", "datum_double", b"datum_double", "datum_float", b"datum_float", "datum_int32", b"datum_int32", "datum_int64", b"datum_int64", "datum_missing", b"datum_missing", "datum_point", b"datum_point", "datum_string", b"datum_string"]) -> builtins.bool: ... + def ClearField(self, field_name: typing.Literal["column_name", b"column_name", "column_type", b"column_type", "datum", b"datum", "datum_bool", b"datum_bool", "datum_bytes", b"datum_bytes", "datum_double", b"datum_double", "datum_float", b"datum_float", "datum_int32", b"datum_int32", "datum_int64", b"datum_int64", "datum_missing", b"datum_missing", "datum_point", b"datum_point", "datum_string", b"datum_string"]) -> None: ... + def WhichOneof(self, oneof_group: typing.Literal["datum", b"datum"]) -> typing.Literal["datum_int32", "datum_int64", "datum_float", "datum_double", "datum_bool", "datum_string", "datum_bytes", "datum_point", "datum_missing"] | None: ... + +global___DatumMessage = DatumMessage + +@typing.final +class TypeInfo(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + MODIFIER_FIELD_NUMBER: builtins.int + VALUE_OPTIONAL_FIELD_NUMBER: builtins.int + modifier: builtins.str + value_optional: builtins.bool + def __init__( + self, + *, + modifier: builtins.str | None = ..., + value_optional: builtins.bool | None = ..., + ) -> None: ... + def HasField(self, field_name: typing.Literal["modifier", b"modifier", "value_optional", b"value_optional"]) -> builtins.bool: ... + def ClearField(self, field_name: typing.Literal["modifier", b"modifier", "value_optional", b"value_optional"]) -> None: ... + +global___TypeInfo = TypeInfo + +@typing.final +class RowMessage(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + TRANSACTION_ID_FIELD_NUMBER: builtins.int + COMMIT_TIME_FIELD_NUMBER: builtins.int + TABLE_FIELD_NUMBER: builtins.int + OP_FIELD_NUMBER: builtins.int + NEW_TUPLE_FIELD_NUMBER: builtins.int + OLD_TUPLE_FIELD_NUMBER: builtins.int + NEW_TYPEINFO_FIELD_NUMBER: builtins.int + transaction_id: builtins.int + commit_time: builtins.int + table: builtins.str + op: global___Op.ValueType + @property + def new_tuple(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___DatumMessage]: ... + @property + def old_tuple(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___DatumMessage]: ... + @property + def new_typeinfo(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___TypeInfo]: ... 
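As a minimal sketch of how the generated stubs above are consumed on the replication side (the import path follows the tests; `payload` stands in for the raw bytes of a replication message, i.e. ReplicationMessage.payload in helpers.py), the active member of the `datum` oneof can be read with `WhichOneof` and `getattr`:

# A sketch of reading a decoderbufs payload with the classes stubbed here.
# Assumes the generated module is importable as in the tests; `payload` is the
# raw bytes of a replication message (ReplicationMessage.payload in helpers.py).
from typing import Any, Dict

from sources.pg_legacy_replication.pg_logicaldec_pb2 import Op, RowMessage


def decode_row(payload: bytes) -> Dict[str, Any]:
    row_msg = RowMessage()
    row_msg.ParseFromString(payload)

    # DELETEs only carry the old tuple; INSERT/UPDATE carry the new one.
    tuple_data = row_msg.old_tuple if row_msg.op == Op.DELETE else row_msg.new_tuple

    row: Dict[str, Any] = {}
    for datum in tuple_data:
        # At most one member of the `datum` oneof is set per column.
        field = datum.WhichOneof("datum")
        # Columns with no datum set (or datum_missing) need the dummy-value
        # handling used in gen_delete_item; None is only a placeholder here.
        row[datum.column_name] = getattr(datum, field) if field else None
    return row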
+ def __init__( + self, + *, + transaction_id: builtins.int | None = ..., + commit_time: builtins.int | None = ..., + table: builtins.str | None = ..., + op: global___Op.ValueType | None = ..., + new_tuple: collections.abc.Iterable[global___DatumMessage] | None = ..., + old_tuple: collections.abc.Iterable[global___DatumMessage] | None = ..., + new_typeinfo: collections.abc.Iterable[global___TypeInfo] | None = ..., + ) -> None: ... + def HasField(self, field_name: typing.Literal["commit_time", b"commit_time", "op", b"op", "table", b"table", "transaction_id", b"transaction_id"]) -> builtins.bool: ... + def ClearField(self, field_name: typing.Literal["commit_time", b"commit_time", "new_tuple", b"new_tuple", "new_typeinfo", b"new_typeinfo", "old_tuple", b"old_tuple", "op", b"op", "table", b"table", "transaction_id", b"transaction_id"]) -> None: ... + +global___RowMessage = RowMessage diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index e302859c2..81692e0bb 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -5,17 +5,10 @@ from dlt.common import Decimal from dlt.common.data_types.typing import TDataType from dlt.common.data_types.type_helpers import coerce_value -from dlt.common.schema.typing import ( - TColumnSchema, - TColumnType, - TTableSchemaColumns, - TTableSchema, -) -from dlt.destinations import postgres -from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper +from dlt.common.schema.typing import TColumnSchema, TColumnType from .decoders import ColumnType -from .pg_logicaldec_pb2 import RowMessage # type: ignore[attr-defined] + _DUMMY_VALS: Dict[TDataType, Any] = { "bigint": 0, @@ -32,6 +25,7 @@ } """Dummy values used to replace NULLs in NOT NULL columns in key-only delete records.""" + _PG_TYPES: Dict[int, str] = { 16: "boolean", 17: "bytea", @@ -48,14 +42,6 @@ } """Maps postgres type OID to type string. Only includes types present in PostgresTypeMapper.""" -_DATUM_PRECISIONS: Dict[str, int] = { - "datum_int32": 32, - "datum_int64": 64, - "datum_float": 32, - "datum_double": 64, -} -"""TODO: Add comment here""" - def _get_precision(type_id: int, atttypmod: int) -> Optional[int]: """Get precision from postgres type attributes.""" @@ -91,7 +77,14 @@ def _get_scale(type_id: int, atttypmod: int) -> Optional[int]: @lru_cache(maxsize=None) -def _type_mapper() -> PostgresTypeMapper: +def _type_mapper() -> Any: + from dlt.destinations import postgres + + try: + from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper + except ImportError: + from dlt.destinations.impl.postgres.factory import PostgresTypeMapper # type: ignore + return PostgresTypeMapper(postgres().capabilities()) @@ -103,7 +96,7 @@ def _to_dlt_column_type(type_id: int, atttypmod: int) -> TColumnType: pg_type = _PG_TYPES.get(type_id) precision = _get_precision(type_id, atttypmod) scale = _get_scale(type_id, atttypmod) - return _type_mapper().from_db_type(pg_type, precision, scale) + return _type_mapper().from_db_type(pg_type, precision, scale) # type: ignore[no-any-return] def _to_dlt_column_schema(col: ColumnType) -> TColumnSchema: @@ -134,25 +127,3 @@ def _to_dlt_val(val: str, data_type: TDataType, byte1: str, for_delete: bool) -> raise ValueError( f"Byte1 in replication message must be 'n' or 't', not '{byte1}'." 
) - - -def _extract_table_schema(row_msg: RowMessage) -> TTableSchema: - schema_name, table_name = row_msg.table.split(".") - - columns: TTableSchemaColumns = {} - for c, c_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): - assert _PG_TYPES[c.column_type] == c_info.modifier - col_type: TColumnType = _type_mapper().from_db_type(c_info.modifier) - col_schema: TColumnSchema = { - "name": c.column_name, - "nullable": c_info.value_optional, - **col_type, - } - - precision = _DATUM_PRECISIONS.get(c.WhichOneof("datum")) - if precision is not None: - col_schema["precision"] = precision - - columns[c.column_name] = col_schema - - return {"name": table_name, "columns": columns} diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py new file mode 100644 index 000000000..e3c85ca21 --- /dev/null +++ b/tests/pg_legacy_replication/test_helpers.py @@ -0,0 +1,81 @@ +from sources.pg_legacy_replication.helpers import extract_table_schema, gen_data_item +from sources.pg_legacy_replication.pg_logicaldec_pb2 import RowMessage +from google.protobuf.json_format import ParseDict as parse_dict + + +def test_extract_table_schema(): + row_msg = RowMessage() + data = { + "transactionId": 969, + "commitTime": "1728662646949062", + "table": "src_pl_dataset_202410110404048747_staging.tbl_y", + "op": "INSERT", + "newTuple": [ + { + "columnName": "id_y", + "columnType": "20", + "datumInt64": "2", + }, + { + "columnName": "val_y", + "columnType": "16", + "datumBool": False, + }, + { + "columnName": "_dlt_load_id", + "columnType": "1043", + "datumString": "1728662646.2657657", + }, + { + "columnName": "_dlt_id", + "columnType": "1043", + "datumString": "gGjifTMTAUs5ag", + }, + ], + "newTypeinfo": [ + { + "modifier": "bigint", + "valueOptional": False, + }, + { + "modifier": "boolean", + "valueOptional": True, + }, + { + "modifier": "character varying", + "valueOptional": False, + }, + { + "modifier": "character varying", + "valueOptional": False, + }, + ], + "oldTuple": [], + } + parse_dict(data, row_msg) + assert extract_table_schema(row_msg) == { + "name": "tbl_y", + "columns": { + "id_y": { + "data_type": "bigint", + "precision": 64, + "name": "id_y", + "nullable": False, + }, + "val_y": { + "data_type": "bool", + "name": "val_y", + "nullable": True, + }, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "_dlt_id": { + "data_type": "text", + "name": "_dlt_id", + "nullable": False, + }, + }, + } From f0016333c9b63c5e57186034b0a40e0e0e657dcd Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 11 Oct 2024 19:23:51 +0200 Subject: [PATCH 16/88] wip: saving work --- sources/pg_legacy_replication/helpers.py | 29 +++++++++- tests/pg_legacy_replication/test_helpers.py | 62 ++++++++++++++++++++- 2 files changed, 88 insertions(+), 3 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 6089e8d56..4a6af9957 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -629,6 +629,31 @@ def extract_table_schema(row_msg: RowMessage) -> TTableSchema: def gen_data_item( - row_msg: RowMessage, lsn: int, include_columns: Optional[Sequence[str]] = None + row_msg: RowMessage, + column_schema: TTableSchemaColumns, + *, + lsn: int, + include_columns: Optional[Sequence[str]] = None, ) -> TDataItem: - pass + """Generates data item from a `RowMessage` and corresponding metadata.""" + assert row_msg.op in (Op.INSERT, Op.UPDATE, Op.DELETE) + 
column_data = ( + row_msg.new_tuple if row_msg.op in (Op.INSERT, Op.UPDATE) else row_msg.old_tuple + ) + + data_item = { + data.column_name: getattr(data, data.WhichOneof("datum")) + for schema, data in zip(column_schema.values(), column_data) + if include_columns is None or data.column_name in include_columns + } + + data_item["lsn"] = lsn + if row_msg.op == Op.DELETE: + data_item["deleted_ts"] = _convert_pg_timestamp(row_msg.commit_time) + + return data_item + + +def _convert_pg_timestamp(microseconds_since_2000: int) -> pendulum.DateTime: + epoch_2000 = pendulum.datetime(2000, 1, 1, tz="UTC") + return epoch_2000.add(microseconds=microseconds_since_2000) \ No newline at end of file diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index e3c85ca21..daaad1d8e 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -14,7 +14,7 @@ def test_extract_table_schema(): { "columnName": "id_y", "columnType": "20", - "datumInt64": "2", + "datumInt64": 2, }, { "columnName": "val_y", @@ -79,3 +79,63 @@ def test_extract_table_schema(): }, }, } + + +def test_gen_data_item(): + row_msg = RowMessage() + data = { + "transactionId": 969, + "commitTime": "1728662646949062", + "table": "src_pl_dataset_202410110404048747_staging.tbl_y", + "op": "INSERT", + "newTuple": [ + { + "columnName": "id_y", + "columnType": "20", + "datumInt64": "2", + }, + { + "columnName": "val_y", + "columnType": "16", + "datumBool": False, + }, + { + "columnName": "_dlt_load_id", + "columnType": "1043", + "datumString": "1728662646.2657657", + }, + { + "columnName": "_dlt_id", + "columnType": "1043", + "datumString": "gGjifTMTAUs5ag", + }, + ], + "newTypeinfo": [ + { + "modifier": "bigint", + "valueOptional": False, + }, + { + "modifier": "boolean", + "valueOptional": True, + }, + { + "modifier": "character varying", + "valueOptional": False, + }, + { + "modifier": "character varying", + "valueOptional": False, + }, + ], + "oldTuple": [], + } + parse_dict(data, row_msg) + table_schema = extract_table_schema(row_msg) + assert gen_data_item(row_msg, table_schema["columns"], lsn=27078296) == { + "_dlt_id": "gGjifTMTAUs5ag", + "_dlt_load_id": "1728662646.2657657", + "id_y": 2, + "lsn": 27078296, + "val_y": False, + } From c0df7c9a554ee48926834eabb871bb6b20e661a8 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 14 Oct 2024 09:06:40 +0200 Subject: [PATCH 17/88] wip: saving work --- sources/pg_legacy_replication/helpers.py | 258 ++++++++++++-------- tests/pg_legacy_replication/test_helpers.py | 10 + 2 files changed, 160 insertions(+), 108 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 4a6af9957..33bda95e0 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -53,6 +53,7 @@ from .schema_types import _to_dlt_column_schema, _to_dlt_val from .exceptions import SqlDatabaseSourceImportError from google.protobuf.json_format import MessageToDict +from collections import defaultdict @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) @@ -357,8 +358,8 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: finally: cur.connection.close() self.last_commit_lsn = consumer.last_commit_lsn - for rel_id, data_items in consumer.data_items.items(): - table_name = consumer.last_table_schema[rel_id]["name"] + for qual_table_name, data_items in consumer.data_items.items(): + table_name = 
consumer.last_table_schema[qual_table_name]["name"] yield data_items[0] # meta item with column hints only, no data yield dlt.mark.with_table_name(data_items[1:], table_name) self.generated_all = consumer.consumed_all @@ -392,11 +393,13 @@ def __init__( self.consumed_all: bool = False # data_items attribute maintains all data items self.data_items: Dict[ - int, List[Union[TDataItem, DataItemWithMeta]] - ] = dict() # maps relation_id to list of data items + str, List[Union[TDataItem, DataItemWithMeta]] + ] = defaultdict( + list + ) # maps qualified table names to list of data items # other attributes only maintain last-seen values self.last_table_schema: Dict[ - int, TTableSchema + str, TTableSchema ] = dict() # maps relation_id to table schema self.last_commit_ts: pendulum.DateTime self.last_commit_lsn = None @@ -424,8 +427,24 @@ def process_msg(self, msg: ReplicationMessage) -> None: self.last_commit_ts = convert_pg_ts(row_msg.commit_time) # type: ignore[assignment] elif op == Op.COMMIT: self.process_commit(msg) - # elif op == Op.INSERT: - # column_data = decoded_msg.new_tuple.column_data + elif op == Op.INSERT: + last_table_schema = self.last_table_schema.get(row_msg.table) + table_schema = extract_table_schema(row_msg) + if last_table_schema is not None and last_table_schema != table_schema: + raise StopReplication # table schema change + self.last_table_schema[row_msg.table] = table_schema + schema, table_name = row_msg.table.split(".") + data_item = gen_data_item( + row_msg, + table_schema["columns"], + lsn=msg.data_start, + include_columns=( + None + if self.include_columns is None + else self.include_columns.get(table_name) + ), + ) + self.data_items[row_msg.table].append(data_item) # table_name = row_msg.table # data_item = self.gen_data_item( # data=column_data, @@ -475,71 +494,71 @@ def process_commit(self, msg: ReplicationMessage) -> None: if self.consumed_all or n_items >= self.target_batch_size: raise StopReplication - def process_relation(self, decoded_msg: Relation) -> None: - """Processes a replication message of type Relation. - - Stores table schema in object state. - Creates meta item to emit column hints while yielding data. - - Raises StopReplication when a table's schema changes. 
- """ - if ( - self.data_items.get(decoded_msg.relation_id) is not None - ): # table schema change - raise StopReplication - # get table schema information from source and store in object state - table_name = decoded_msg.relation_name - columns: TTableSchemaColumns = { - c.name: _to_dlt_column_schema(c) for c in decoded_msg.columns - } - self.last_table_schema[decoded_msg.relation_id] = { - "name": table_name, - "columns": columns, - } - - # apply user input - # 1) exclude columns - include_columns = ( - None - if self.include_columns is None - else self.include_columns.get(table_name) - ) - if include_columns is not None: - columns = {k: v for k, v in columns.items() if k in include_columns} - # 2) override source hints - column_hints: TTableSchemaColumns = ( - dict() if self.columns is None else self.columns.get(table_name, dict()) - ) - for column_name, column_val in column_hints.items(): - columns[column_name] = merge_column(columns[column_name], column_val) - - # add hints for replication columns - columns["lsn"] = {"data_type": "bigint", "nullable": True} - if self.pub_ops["update"] or self.pub_ops["delete"]: - columns["lsn"]["dedup_sort"] = "desc" - if self.pub_ops["delete"]: - columns["deleted_ts"] = { - "hard_delete": True, - "data_type": "timestamp", - "nullable": True, - } - - # determine write disposition - write_disposition: TWriteDisposition = "append" - if self.pub_ops["update"] or self.pub_ops["delete"]: - write_disposition = "merge" - - # include meta item to emit hints while yielding data - meta_item = dlt.mark.with_hints( - [], - dlt.mark.make_hints( - table_name=table_name, - write_disposition=write_disposition, - columns=columns, - ), - create_table_variant=True, - ) - self.data_items[decoded_msg.relation_id] = [meta_item] + # def process_relation(self, decoded_msg: Relation) -> None: + # """Processes a replication message of type Relation. + # + # Stores table schema in object state. + # Creates meta item to emit column hints while yielding data. + # + # Raises StopReplication when a table's schema changes. 
+ # """ + # if ( + # self.data_items.get(decoded_msg.relation_id) is not None + # ): # table schema change + # raise StopReplication + # # get table schema information from source and store in object state + # table_name = decoded_msg.relation_name + # columns: TTableSchemaColumns = { + # c.name: _to_dlt_column_schema(c) for c in decoded_msg.columns + # } + # self.last_table_schema[decoded_msg.relation_id] = { + # "name": table_name, + # "columns": columns, + # } + # + # # apply user input + # # 1) exclude columns + # include_columns = ( + # None + # if self.include_columns is None + # else self.include_columns.get(table_name) + # ) + # if include_columns is not None: + # columns = {k: v for k, v in columns.items() if k in include_columns} + # # 2) override source hints + # column_hints: TTableSchemaColumns = ( + # dict() if self.columns is None else self.columns.get(table_name, dict()) + # ) + # for column_name, column_val in column_hints.items(): + # columns[column_name] = merge_column(columns[column_name], column_val) + # + # # add hints for replication columns + # columns["lsn"] = {"data_type": "bigint", "nullable": True} + # if self.pub_ops["update"] or self.pub_ops["delete"]: + # columns["lsn"]["dedup_sort"] = "desc" + # if self.pub_ops["delete"]: + # columns["deleted_ts"] = { + # "hard_delete": True, + # "data_type": "timestamp", + # "nullable": True, + # } + # + # # determine write disposition + # write_disposition: TWriteDisposition = "append" + # if self.pub_ops["update"] or self.pub_ops["delete"]: + # write_disposition = "merge" + # + # # include meta item to emit hints while yielding data + # meta_item = dlt.mark.with_hints( + # [], + # dlt.mark.make_hints( + # table_name=table_name, + # write_disposition=write_disposition, + # columns=columns, + # ), + # create_table_variant=True, + # ) + # self.data_items[decoded_msg.relation_id] = [meta_item] # def process_change( # self, decoded_msg: Union[Insert, Update, Delete], msg_start_lsn: int @@ -566,31 +585,31 @@ def process_relation(self, decoded_msg: Relation) -> None: # ), # ) # self.data_items[decoded_msg.relation_id].append(data_item) - - @staticmethod - def gen_data_item( - data: List[ColumnData], - column_schema: TTableSchemaColumns, - lsn: int, - commit_ts: pendulum.DateTime, - for_delete: bool, - include_columns: Optional[Sequence[str]] = None, - ) -> TDataItem: - """Generates data item from replication message data and corresponding metadata.""" - data_item = { - schema["name"]: _to_dlt_val( - val=data.col_data, - data_type=schema["data_type"], - byte1=data.col_data_category, - for_delete=for_delete, - ) - for (schema, data) in zip(column_schema.values(), data) - if (True if include_columns is None else schema["name"] in include_columns) - } - data_item["lsn"] = lsn - if for_delete: - data_item["deleted_ts"] = commit_ts - return data_item + # + # @staticmethod + # def gen_data_item( + # data: List[ColumnData], + # column_schema: TTableSchemaColumns, + # lsn: int, + # commit_ts: pendulum.DateTime, + # for_delete: bool, + # include_columns: Optional[Sequence[str]] = None, + # ) -> TDataItem: + # """Generates data item from replication message data and corresponding metadata.""" + # data_item = { + # schema["name"]: _to_dlt_val( + # val=data.col_data, + # data_type=schema["data_type"], + # byte1=data.col_data_category, + # for_delete=for_delete, + # ) + # for (schema, data) in zip(column_schema.values(), data) + # if (True if include_columns is None else schema["name"] in include_columns) + # } + # data_item["lsn"] = lsn + # 
if for_delete: + # data_item["deleted_ts"] = commit_ts + # return data_item # FIXME Refactor later @@ -606,25 +625,48 @@ def gen_data_item( """TODO: Add comment here""" -def extract_table_schema(row_msg: RowMessage) -> TTableSchema: - schema_name, table_name = row_msg.table.split(".") - +def extract_table_schema( + row_msg: RowMessage, *, include_columns: Optional[Sequence[str]] = None +) -> TTableSchema: columns: TTableSchemaColumns = {} - for c, c_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): - assert _PG_TYPES[c.column_type] == c_info.modifier - col_type: TColumnType = _type_mapper().from_db_type(c_info.modifier) + type_mapper = _type_mapper() + for col, col_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): + col_name = col.column_name + if include_columns is not None and col_name not in include_columns: + continue + assert ( + _PG_TYPES[col.column_type] == col_info.modifier + ), f"Type mismatch for column {col_name}" + col_type: TColumnType = type_mapper.from_db_type(col_info.modifier) col_schema: TColumnSchema = { - "name": c.column_name, - "nullable": c_info.value_optional, + "name": col_name, + "nullable": col_info.value_optional, **col_type, } - precision = _DATUM_PRECISIONS.get(c.WhichOneof("datum")) + precision = _DATUM_PRECISIONS.get(col.WhichOneof("datum")) if precision is not None: col_schema["precision"] = precision - columns[c.column_name] = col_schema + columns[col_name] = col_schema + + # Add replication columns + columns.update( + { + "lsn": { + "data_type": "bigint", + "nullable": True, + "dedup_sort": "desc", + }, + "deleted_ts": { + "data_type": "timestamp", + "nullable": True, + "hard_delete": True, + }, + } + ) + table_name = row_msg.table.split(".")[1] return {"name": table_name, "columns": columns} @@ -656,4 +698,4 @@ def gen_data_item( def _convert_pg_timestamp(microseconds_since_2000: int) -> pendulum.DateTime: epoch_2000 = pendulum.datetime(2000, 1, 1, tz="UTC") - return epoch_2000.add(microseconds=microseconds_since_2000) \ No newline at end of file + return epoch_2000.add(microseconds=microseconds_since_2000) diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index daaad1d8e..94f7ee8eb 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -77,6 +77,16 @@ def test_extract_table_schema(): "name": "_dlt_id", "nullable": False, }, + "lsn": { + "data_type": "bigint", + "dedup_sort": "desc", + "nullable": True, + }, + "deleted_ts": { + "data_type": "timestamp", + "hard_delete": True, + "nullable": True, + }, }, } From aa464d549e7885b3418d2d54cd36a476ba4fe289 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 17 Oct 2024 14:10:01 +0200 Subject: [PATCH 18/88] wip: saving work --- sources/pg_legacy_replication/__init__.py | 6 +- sources/pg_legacy_replication/helpers.py | 61 ++++++++------- .../test_pg_replication.py | 76 ++++++++++--------- 3 files changed, 77 insertions(+), 66 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 5082e1554..6a123916a 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -13,11 +13,13 @@ @dlt.resource( - name=lambda args: args["slot_name"], + name=lambda args: args["slot_name"] + "_" + args["schema"], standalone=True, ) def replication_resource( slot_name: str, + schema: str = dlt.config.value, + table_names: Sequence[str] = dlt.config.value, credentials: ConnectionStringCredentials = 
dlt.secrets.value, include_columns: Optional[Dict[str, Sequence[str]]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, @@ -87,6 +89,8 @@ def replication_resource( gen = ItemGenerator( credentials=credentials, slot_name=slot_name, + schema=schema, + table_names=table_names, options=options, upto_lsn=upto_lsn, start_lsn=start_lsn, diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 33bda95e0..1a43517ea 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -1,6 +1,7 @@ from typing import ( Optional, Dict, + Set, Iterator, Union, List, @@ -59,8 +60,8 @@ @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) def init_replication( slot_name: str, - schema: Optional[str] = dlt.config.value, - table_names: Optional[List[str]] = dlt.config.value, + schema: str = dlt.config.value, + table_names: Sequence[str] = dlt.config.value, credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = True, include_columns: Optional[Dict[str, Sequence[str]]] = None, @@ -315,6 +316,8 @@ def _get_rep_conn( class ItemGenerator: credentials: ConnectionStringCredentials slot_name: str + schema: str + table_names: Sequence[str] options: Dict[str, str] upto_lsn: int start_lsn: int = 0 @@ -348,6 +351,8 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: consumer = MessageConsumer( upto_lsn=self.upto_lsn, pub_ops=pub_opts, + schema=self.schema, + table_names=set(self.table_names), target_batch_size=self.target_batch_size, include_columns=self.include_columns, columns=self.columns, @@ -358,10 +363,17 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: finally: cur.connection.close() self.last_commit_lsn = consumer.last_commit_lsn - for qual_table_name, data_items in consumer.data_items.items(): - table_name = consumer.last_table_schema[qual_table_name]["name"] - yield data_items[0] # meta item with column hints only, no data - yield dlt.mark.with_table_name(data_items[1:], table_name) + for table_name, data_items in consumer.data_items.items(): + table_schema = consumer.last_table_schema[table_name] + assert table_name == table_schema["name"] + yield dlt.mark.with_hints( # meta item with column hints only, no data + [], + dlt.mark.make_hints( + table_name=table_name, columns=table_schema["columns"] + ), + create_table_variant=True, + ) + yield dlt.mark.with_table_name(data_items, table_name) self.generated_all = consumer.consumed_all @@ -380,12 +392,16 @@ def __init__( self, upto_lsn: int, pub_ops: Dict[str, bool], + schema: str, + table_names: Set[str], target_batch_size: int = 1000, include_columns: Optional[Dict[str, Sequence[str]]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, ) -> None: self.upto_lsn = upto_lsn self.pub_ops = pub_ops + self.schema = schema + self.table_names = table_names self.target_batch_size = target_batch_size self.include_columns = include_columns self.columns = columns @@ -418,22 +434,23 @@ def process_msg(self, msg: ReplicationMessage) -> None: - `target_batch_size` is reached - a table's schema has changed """ - debug(msg) row_msg = RowMessage() row_msg.ParseFromString(msg.payload) - debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] op = row_msg.op if op == Op.BEGIN: self.last_commit_ts = convert_pg_ts(row_msg.commit_time) # type: ignore[assignment] elif op == Op.COMMIT: self.process_commit(msg) elif op == Op.INSERT: - 
last_table_schema = self.last_table_schema.get(row_msg.table) + schema, table_name = row_msg.table.split(".") + if schema != self.schema or table_name not in self.table_names: + return + last_table_schema = self.last_table_schema.get(table_name) table_schema = extract_table_schema(row_msg) - if last_table_schema is not None and last_table_schema != table_schema: + if last_table_schema is None: + self.last_table_schema[table_name] = table_schema + elif last_table_schema != table_schema: raise StopReplication # table schema change - self.last_table_schema[row_msg.table] = table_schema - schema, table_name = row_msg.table.split(".") data_item = gen_data_item( row_msg, table_schema["columns"], @@ -444,23 +461,7 @@ def process_msg(self, msg: ReplicationMessage) -> None: else self.include_columns.get(table_name) ), ) - self.data_items[row_msg.table].append(data_item) - # table_name = row_msg.table - # data_item = self.gen_data_item( - # data=column_data, - # column_schema=self.last_table_schema[decoded_msg.relation_id][ - # "columns" - # ], - # lsn=msg.data_start, - # commit_ts=convert_pg_ts(row_msg.commit_time), - # for_delete=False, - # include_columns=( - # None - # if self.include_columns is None - # else self.include_columns.get(table_name) - # ), - # ) - # self.data_items[decoded_msg.relation_id].append(data_item) + self.data_items[table_name].append(data_item) # if op == Op.UPDATE: # self.process_change(row_msg) # op = msg.payload[:1] @@ -478,6 +479,8 @@ def process_msg(self, msg: ReplicationMessage) -> None: # "Truncate replication messages are ignored." # ) else: + debug(msg) + debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] raise AssertionError(f"Unsupported operation : {row_msg}") def process_commit(self, msg: ReplicationMessage) -> None: diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index b52a87bfd..09722eb4f 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -57,7 +57,11 @@ def tbl_y(data): take_snapshots=True, ) - changes = replication_resource(slot_name) + changes = replication_resource( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + ) src_pl.run( [ @@ -83,41 +87,41 @@ def tbl_y(data): # process changes info = dest_pl.run(changes) assert_load_info(info) - # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} - # exp_tbl_x = [ - # {"id_x": 1, "val_x": "foo"}, - # {"id_x": 2, "val_x": "bar"}, - # {"id_x": 3, "val_x": "baz"}, - # ] - # exp_tbl_y = [{"id_y": 1, "val_y": True}, {"id_y": 2, "val_y": False}] - # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") - # - # # change single table - # src_pl.run(tbl_y({"id_y": 3, "val_y": True})) - # - # # process changes - # info = dest_pl.run(changes) - # assert_load_info(info) - # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} - # exp_tbl_y = [ - # {"id_y": 1, "val_y": True}, - # {"id_y": 2, "val_y": False}, - # {"id_y": 3, "val_y": True}, - # ] - # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") - # - # # update tables - # with src_pl.sql_client() as c: - # qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") - # c.execute_sql(f"UPDATE 
{qual_name} SET val_x = 'foo_updated' WHERE id_x = 1;") - # qual_name = src_pl.sql_client().make_qualified_table_name("tbl_y") - # c.execute_sql(f"UPDATE {qual_name} SET val_y = false WHERE id_y = 1;") - # - # # process changes - # info = dest_pl.run(changes) - # assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} + exp_tbl_x = [ + {"id_x": 1, "val_x": "foo"}, + {"id_x": 2, "val_x": "bar"}, + {"id_x": 3, "val_x": "baz"}, + ] + exp_tbl_y = [{"id_y": 1, "val_y": True}, {"id_y": 2, "val_y": False}] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # change single table + src_pl.run(tbl_y({"id_y": 3, "val_y": True})) + + # process changes + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} + exp_tbl_y = [ + {"id_y": 1, "val_y": True}, + {"id_y": 2, "val_y": False}, + {"id_y": 3, "val_y": True}, + ] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # update tables + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + c.execute_sql(f"UPDATE {qual_name} SET val_x = 'foo_updated' WHERE id_x = 1;") + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_y") + c.execute_sql(f"UPDATE {qual_name} SET val_y = false WHERE id_y = 1;") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info) # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} # exp_tbl_x = [ # {"id_x": 1, "val_x": "foo_updated"}, From db09568c23aaa3099ade758e5b5810aee43926a5 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 17 Oct 2024 14:34:25 +0200 Subject: [PATCH 19/88] wip: making progress --- sources/pg_legacy_replication/__init__.py | 5 +++-- sources/pg_legacy_replication/helpers.py | 18 +++++++----------- .../test_pg_replication.py | 6 +++--- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 6a123916a..3b74295dc 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -84,13 +84,14 @@ def replication_resource( if upto_lsn is None: return + table_qnames = {f"{schema}.{table_name}" for table_name in table_names} + # generate items in batches while True: gen = ItemGenerator( credentials=credentials, slot_name=slot_name, - schema=schema, - table_names=table_names, + table_qnames=table_qnames, options=options, upto_lsn=upto_lsn, start_lsn=start_lsn, diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 1a43517ea..ca6105d2f 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -316,8 +316,7 @@ def _get_rep_conn( class ItemGenerator: credentials: ConnectionStringCredentials slot_name: str - schema: str - table_names: Sequence[str] + table_qnames: Set[str] options: Dict[str, str] upto_lsn: int start_lsn: int = 0 @@ -351,8 +350,7 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: consumer = MessageConsumer( upto_lsn=self.upto_lsn, pub_ops=pub_opts, - schema=self.schema, - table_names=set(self.table_names), + table_qnames=self.table_qnames, target_batch_size=self.target_batch_size, 
include_columns=self.include_columns, columns=self.columns, @@ -392,16 +390,14 @@ def __init__( self, upto_lsn: int, pub_ops: Dict[str, bool], - schema: str, - table_names: Set[str], + table_qnames: Set[str], target_batch_size: int = 1000, include_columns: Optional[Dict[str, Sequence[str]]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, ) -> None: self.upto_lsn = upto_lsn self.pub_ops = pub_ops - self.schema = schema - self.table_names = table_names + self.table_qnames = table_qnames self.target_batch_size = target_batch_size self.include_columns = include_columns self.columns = columns @@ -442,9 +438,9 @@ def process_msg(self, msg: ReplicationMessage) -> None: elif op == Op.COMMIT: self.process_commit(msg) elif op == Op.INSERT: - schema, table_name = row_msg.table.split(".") - if schema != self.schema or table_name not in self.table_names: + if row_msg.table not in self.table_qnames: return + _, table_name = row_msg.table.split(".") last_table_schema = self.last_table_schema.get(table_name) table_schema = extract_table_schema(row_msg) if last_table_schema is None: @@ -669,7 +665,7 @@ def extract_table_schema( } ) - table_name = row_msg.table.split(".")[1] + _, table_name = row_msg.table.split(".") return {"name": table_name, "columns": columns} diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 09722eb4f..53508d3f8 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -85,7 +85,7 @@ def tbl_y(data): cleanup_snapshot_resources(snapshots) # process changes - info = dest_pl.run(changes) + info = dest_pl.run(changes, write_disposition="merge") assert_load_info(info) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} exp_tbl_x = [ @@ -101,7 +101,7 @@ def tbl_y(data): src_pl.run(tbl_y({"id_y": 3, "val_y": True})) # process changes - info = dest_pl.run(changes) + info = dest_pl.run(changes, write_disposition="merge") assert_load_info(info) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} exp_tbl_y = [ @@ -120,7 +120,7 @@ def tbl_y(data): c.execute_sql(f"UPDATE {qual_name} SET val_y = false WHERE id_y = 1;") # process changes - info = dest_pl.run(changes) + info = dest_pl.run(changes, write_disposition="merge") assert_load_info(info) # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} # exp_tbl_x = [ From c3c051808fdc7cfa6e065cfc4cdd24c656f908d7 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sat, 19 Oct 2024 03:36:04 +0200 Subject: [PATCH 20/88] wip: saving work --- sources/pg_legacy_replication/helpers.py | 100 ++++++++++++++---- tests/pg_legacy_replication/test_helpers.py | 48 ++++++++- .../test_pg_replication.py | 62 +++++------ 3 files changed, 160 insertions(+), 50 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index ca6105d2f..c0ce62b1e 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -362,15 +362,16 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: cur.connection.close() self.last_commit_lsn = consumer.last_commit_lsn for table_name, data_items in consumer.data_items.items(): - table_schema = consumer.last_table_schema[table_name] - assert table_name == table_schema["name"] - yield dlt.mark.with_hints( # meta item with column hints only, no data - [], - dlt.mark.make_hints( - table_name=table_name, 
columns=table_schema["columns"] - ), - create_table_variant=True, - ) + table_schema = consumer.last_table_schema.get(table_name) + if table_schema: + assert table_name == table_schema["name"] + yield dlt.mark.with_hints( # meta item with column hints only, no data + [], + dlt.mark.make_hints( + table_name=table_name, columns=table_schema["columns"] + ), + create_table_variant=True, + ) yield dlt.mark.with_table_name(data_items, table_name) self.generated_all = consumer.consumed_all @@ -458,15 +459,44 @@ def process_msg(self, msg: ReplicationMessage) -> None: ), ) self.data_items[table_name].append(data_item) - # if op == Op.UPDATE: - # self.process_change(row_msg) + elif op == Op.UPDATE: + if row_msg.table not in self.table_qnames: + return + _, table_name = row_msg.table.split(".") + last_table_schema = self.last_table_schema.get(table_name) + table_schema = extract_table_schema(row_msg) + if last_table_schema is None: + self.last_table_schema[table_name] = table_schema + elif last_table_schema != table_schema: + raise StopReplication # table schema change + data_item = gen_data_item( + row_msg, + table_schema["columns"], + lsn=msg.data_start, + include_columns=( + None + if self.include_columns is None + else self.include_columns.get(table_name) + ), + ) + self.data_items[table_name].append(data_item) + elif op == Op.DELETE: + debug(msg) + debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] + if row_msg.table not in self.table_qnames: + return + _, table_name = row_msg.table.split(".") + data_item = gen_delete_item( + row_msg, + lsn=msg.data_start, + include_columns=( + None + if self.include_columns is None + else self.include_columns.get(table_name) + ), + ) + self.data_items[table_name].append(data_item) # op = msg.payload[:1] - # if op == b"I": - # self.process_change(Insert(msg.payload), msg.data_start) - # elif op == b"U": - # self.process_change(Update(msg.payload), msg.data_start) - # elif op == b"D": - # self.process_change(Delete(msg.payload), msg.data_start) # elif op == b"R": # self.process_relation(Relation(msg.payload)) # elif op == b"T": @@ -612,7 +642,7 @@ def process_commit(self, msg: ReplicationMessage) -> None: # FIXME Refactor later -from .schema_types import _PG_TYPES, _type_mapper +from .schema_types import _PG_TYPES, _type_mapper, _DUMMY_VALS from dlt.common.schema.typing import TColumnType, TColumnSchema _DATUM_PRECISIONS: Dict[str, int] = { @@ -695,6 +725,40 @@ def gen_data_item( return data_item +def gen_delete_item( + row_msg: RowMessage, + *, + lsn: int, + include_columns: Optional[Sequence[str]] = None, +) -> TDataItem: + """Generates data item from a `RowMessage` and corresponding metadata.""" + assert row_msg.op == Op.DELETE + + column_data = row_msg.old_tuple + type_mapper = _type_mapper() + data_item = {} + + for data in column_data: + if include_columns and data.column_name not in include_columns: + continue + datum_name = data.WhichOneof("datum") + if datum_name: + data_item[data.column_name] = getattr(data, datum_name) + else: + db_type = _PG_TYPES[data.column_type] + col_type: TColumnType = type_mapper.from_db_type(db_type) + data_item[data.column_name] = _DUMMY_VALS[col_type["data_type"]] + + data_item["lsn"] = lsn + data_item["deleted_ts"] = _convert_db_timestamp(row_msg.commit_time) + + return data_item + + def _convert_pg_timestamp(microseconds_since_2000: int) -> pendulum.DateTime: epoch_2000 = pendulum.datetime(2000, 1, 1, tz="UTC") return epoch_2000.add(microseconds=microseconds_since_2000) + + +def 
_convert_db_timestamp(microseconds_since_1970: int) -> pendulum.DateTime: + return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000, tz="UTC") diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 94f7ee8eb..8ea0f8a9d 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -1,4 +1,10 @@ -from sources.pg_legacy_replication.helpers import extract_table_schema, gen_data_item +import pendulum + +from sources.pg_legacy_replication.helpers import ( + extract_table_schema, + gen_data_item, + gen_delete_item, +) from sources.pg_legacy_replication.pg_logicaldec_pb2 import RowMessage from google.protobuf.json_format import ParseDict as parse_dict @@ -149,3 +155,43 @@ def test_gen_data_item(): "lsn": 27078296, "val_y": False, } + + +def test_gen_delete_item(): + row_msg = RowMessage() + data = { + "transactionId": 932, + "commitTime": "1729299383354856", + "table": "src_pl_dataset_202410191256122080.tbl_x", + "op": "DELETE", + "oldTuple": [ + { + "columnName": "id_x", + "columnType": "20", + "datumInt64": "1", + }, + { + "columnName": "val_x", + "columnType": "1043", + }, + { + "columnName": "_dlt_load_id", + "columnType": "1043", + }, + { + "columnName": "_dlt_id", + "columnType": "1043", + }, + ], + "newTuple": [], + "newTypeinfo": [], + } + parse_dict(data, row_msg) + assert gen_delete_item(row_msg, lsn=27078296) == { + "id_x": 1, + "val_x": "", + "_dlt_load_id": "", + "_dlt_id": "", + "lsn": 27078296, + "deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), + } diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 53508d3f8..6437d578a 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -122,37 +122,37 @@ def tbl_y(data): # process changes info = dest_pl.run(changes, write_disposition="merge") assert_load_info(info) - # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} - # exp_tbl_x = [ - # {"id_x": 1, "val_x": "foo_updated"}, - # {"id_x": 2, "val_x": "bar"}, - # {"id_x": 3, "val_x": "baz"}, - # ] - # exp_tbl_y = [ - # {"id_y": 1, "val_y": False}, - # {"id_y": 2, "val_y": False}, - # {"id_y": 3, "val_y": True}, - # ] - # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") - # - # # delete from table - # with src_pl.sql_client() as c: - # qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") - # c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 1;") - # - # # process changes - # info = dest_pl.run(changes) - # assert_load_info(info) - # assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 3} - # exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] - # exp_tbl_y = [ - # {"id_y": 1, "val_y": False}, - # {"id_y": 2, "val_y": False}, - # {"id_y": 3, "val_y": True}, - # ] - # assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") - # assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} + exp_tbl_x = [ + {"id_x": 1, "val_x": "foo_updated"}, + {"id_x": 2, "val_x": "bar"}, + {"id_x": 3, "val_x": "baz"}, + ] + exp_tbl_y = [ + {"id_y": 1, "val_y": False}, + {"id_y": 2, "val_y": False}, + {"id_y": 3, "val_y": True}, + ] + 
assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") + + # delete from table + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("tbl_x") + c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 1;") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info) + assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 3} + exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] + exp_tbl_y = [ + {"id_y": 1, "val_y": False}, + {"id_y": 2, "val_y": False}, + {"id_y": 3, "val_y": True}, + ] + assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") + assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) From a5b1a87e490543d6ebb31efa960e15060fe709b8 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sat, 19 Oct 2024 19:57:11 +0200 Subject: [PATCH 21/88] refactor: some test refactoring --- tests/pg_legacy_replication/conftest.py | 12 +- .../test_pg_replication.py | 191 ++++-------------- .../test_schema_types.py | 63 ------ tests/pg_legacy_replication/utils.py | 2 +- 4 files changed, 48 insertions(+), 220 deletions(-) delete mode 100644 tests/pg_legacy_replication/test_schema_types.py diff --git a/tests/pg_legacy_replication/conftest.py b/tests/pg_legacy_replication/conftest.py index 4bfe6f5bd..b3d5372ae 100644 --- a/tests/pg_legacy_replication/conftest.py +++ b/tests/pg_legacy_replication/conftest.py @@ -7,10 +7,9 @@ @pytest.fixture() -def src_config() -> Iterator[Tuple[dlt.Pipeline, str, str]]: - # random slot and pub to enable parallel runs +def src_config() -> Iterator[Tuple[dlt.Pipeline, str]]: + # random slot to enable parallel runs slot = "test_slot_" + uniq_id(4) - pub = "test_pub" + uniq_id(4) # setup src_pl = dlt.pipeline( pipeline_name="src_pl", @@ -19,7 +18,7 @@ def src_config() -> Iterator[Tuple[dlt.Pipeline, str, str]]: ), dev_mode=True, ) - yield src_pl, slot, pub + yield src_pl, slot # teardown with src_pl.sql_client() as c: # drop tables @@ -37,8 +36,3 @@ def src_config() -> Iterator[Tuple[dlt.Pipeline, str, str]]: c.execute_sql(f"SELECT pg_drop_replication_slot('{slot}');") except Exception as e: print(e) - # drop publication - try: - c.execute_sql(f"DROP PUBLICATION IF EXISTS {pub};") - except Exception as e: - print(e) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 6437d578a..8b6f1eac5 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -1,6 +1,6 @@ import pytest -from typing import Set, Tuple +from typing import Dict, Set, Sequence, Tuple from copy import deepcopy from psycopg2.errors import InsufficientPrivilege @@ -29,7 +29,7 @@ @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_core_functionality( - src_config: Tuple[dlt.Pipeline, str, str], destination_name: str + src_config: Tuple[dlt.Pipeline, str], destination_name: str ) -> None: @dlt.resource(write_disposition="merge", primary_key="id_x") def tbl_x(data): @@ -39,7 +39,7 @@ def tbl_x(data): def tbl_y(data): yield data - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config src_pl.run( [ @@ -157,7 +157,7 @@ def tbl_y(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_without_init_load( - src_config: 
Tuple[dlt.Pipeline, str, str], destination_name: str + src_config: Tuple[dlt.Pipeline, str], destination_name: str ) -> None: @dlt.resource(write_disposition="merge", primary_key="id_x") def tbl_x(data): @@ -167,7 +167,7 @@ def tbl_x(data): def tbl_y(data): yield data - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config # create postgres table # since we're skipping initial load, these records should not be in the replicated table @@ -183,11 +183,10 @@ def tbl_y(data): # initialize replication and create resource for changes init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), ) - changes = replication_resource(slot_name, pub_name) + changes = replication_resource(slot_name) # change postgres table after replication has been initialized # these records should be in the replicated table @@ -225,11 +224,11 @@ def tbl_y(data): assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") -def test_insert_only(src_config: Tuple[dlt.Pipeline, str, str]) -> None: +def test_insert_only(src_config: Tuple[dlt.Pipeline, str]) -> None: def items(data): yield data - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config # create postgres table with single record src_pl.run(items({"id": 1, "foo": "bar"})) @@ -237,12 +236,11 @@ def items(data): # initialize replication and create resource for changes init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names="items", publish="insert", ) - changes = replication_resource(slot_name, pub_name) + changes = replication_resource(slot_name) # insert a record in postgres table src_pl.run(items({"id": 2, "foo": "bar"})) @@ -267,7 +265,7 @@ def items(data): @pytest.mark.parametrize("give_hints", [True, False]) @pytest.mark.parametrize("init_load", [True, False]) def test_mapped_data_types( - src_config: Tuple[dlt.Pipeline, str, str], + src_config: Tuple[dlt.Pipeline, str], destination_name: str, give_hints: bool, init_load: bool, @@ -282,7 +280,7 @@ def test_mapped_data_types( def items(data): yield data - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config # create postgres table with single record containing all data types src_pl.run(items(data)) @@ -291,8 +289,7 @@ def items(data): # initialize replication and create resources snapshot = init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names="items", take_snapshots=init_load, columns={"items": column_schema} if give_hints else None, @@ -300,7 +297,6 @@ def items(data): changes = replication_resource( slot_name=slot_name, - pub_name=pub_name, columns={"items": column_schema} if give_hints else None, ) @@ -377,10 +373,10 @@ def items(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_unmapped_data_types( - src_config: Tuple[dlt.Pipeline, str, str], destination_name: str + src_config: Tuple[dlt.Pipeline, str], destination_name: str ) -> None: """Assert postgres data types that aren't explicitly mapped default to "text" type.""" - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config # create postgres table with some unmapped types with src_pl.sql_client() as c: @@ -392,12 +388,11 @@ def test_unmapped_data_types( # initialize replication and create resource init_replication( slot_name=slot_name, - pub_name=pub_name, - 
schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names="data_types", publish="insert", ) - changes = replication_resource(slot_name, pub_name) + changes = replication_resource(slot_name) # insert record in source table to create replication item with src_pl.sql_client() as c: @@ -418,14 +413,12 @@ def test_unmapped_data_types( @pytest.mark.parametrize("publish", ["insert", "insert, update, delete"]) -def test_write_disposition( - src_config: Tuple[dlt.Pipeline, str, str], publish: str -) -> None: +def test_write_disposition(src_config: Tuple[dlt.Pipeline, str], publish: str) -> None: @dlt.resource def items(data): yield data - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config # create postgres table src_pl.run(items({"id": 1, "val": True})) @@ -433,8 +426,7 @@ def items(data): # create resources snapshot = init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names="items", publish=publish, take_snapshots=True, @@ -445,7 +437,7 @@ def items(data): assert snapshot.write_disposition == expected_write_disposition # assert write disposition on tables dispatched by changes resource - changes = replication_resource(slot_name, pub_name) + changes = replication_resource(slot_name) src_pl.run(items({"id": 2, "val": True})) dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) dest_pl.extract(changes) @@ -458,11 +450,11 @@ def items(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("init_load", [True, False]) def test_include_columns( - src_config: Tuple[dlt.Pipeline, str, str], destination_name: str, init_load: bool + src_config: Tuple[dlt.Pipeline, str], destination_name: str, init_load: bool ) -> None: def get_cols(pipeline: dlt.Pipeline, table_name: str) -> set: with pipeline.destination_client(pipeline.default_schema_name) as client: - client: SqlJobClientBase + assert isinstance(client, SqlJobClientBase) return { k for k in client.get_storage_table(table_name)[1].keys() @@ -481,7 +473,7 @@ def tbl_y(data): def tbl_z(data): yield data - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config # create three postgres tables src_pl.run( @@ -493,23 +485,20 @@ def tbl_z(data): ) # initialize replication and create resources - include_columns = { + include_columns: Dict[str, Sequence[str]] = { "tbl_x": ["id_x", "val_x"], "tbl_y": ["id_y", "val_y"], # tbl_z is not specified, hence all columns should be included } snapshots = init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), publish="insert", take_snapshots=init_load, include_columns=include_columns, ) - changes = replication_resource( - slot_name=slot_name, pub_name=pub_name, include_columns=include_columns - ) + changes = replication_resource(slot_name=slot_name, include_columns=include_columns) # update three postgres tables src_pl.run( @@ -538,7 +527,7 @@ def tbl_z(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("init_load", [True, False]) def test_column_hints( - src_config: Tuple[dlt.Pipeline, str, str], destination_name: str, init_load: bool + src_config: Tuple[dlt.Pipeline, str], destination_name: str, init_load: bool ) -> None: @dlt.resource def tbl_x(data): @@ -552,7 +541,7 @@ def tbl_y(data): def tbl_z(data): yield data - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = 
src_config # create three postgres tables src_pl.run( @@ -571,16 +560,13 @@ def tbl_z(data): } snapshots = init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), publish="insert", take_snapshots=init_load, columns=column_hints, ) - changes = replication_resource( - slot_name=slot_name, pub_name=pub_name, columns=column_hints - ) + changes = replication_resource(slot_name=slot_name, columns=column_hints) # update three postgres tables src_pl.run( @@ -643,9 +629,9 @@ def tbl_z(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_table_schema_change( - src_config: Tuple[dlt.Pipeline, str, str], destination_name: str + src_config: Tuple[dlt.Pipeline, str], destination_name: str ) -> None: - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config # create postgres table src_pl.run([{"c1": 1, "c2": 1}], table_name="items") @@ -653,14 +639,13 @@ def test_table_schema_change( # initialize replication init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names="items", publish="insert", ) # create resource and pipeline - changes = replication_resource(slot_name, pub_name) + changes = replication_resource(slot_name) dest_pl = dlt.pipeline( pipeline_name="dest_pl", destination=destination_name, full_refresh=True ) @@ -688,93 +673,7 @@ def test_table_schema_change( ) -def test_init_replication(src_config: Tuple[dlt.Pipeline, str, str]) -> None: - def get_table_names_in_pub() -> Set[str]: - with src_pl.sql_client() as c: - result = c.execute_sql( - f"SELECT tablename FROM pg_publication_tables WHERE pubname = '{pub_name}';" - ) - return {tup[0] for tup in result} - - @dlt.resource - def tbl_x(data): - yield data - - @dlt.resource - def tbl_y(data): - yield data - - @dlt.resource - def tbl_z(data): - yield data - - src_pl, slot_name, pub_name = src_config - - # create three postgres tables - src_pl.run( - [ - tbl_x({"id_x": 1, "val_x": "foo"}), - tbl_y({"id_y": 1, "val_y": "foo"}), - tbl_z({"id_z": 1, "val_z": "foo"}), - ] - ) - - # initialize replication with a single table - snapshot = init_replication( - slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, - table_names="tbl_x", - take_snapshots=True, - ) - assert snapshot is not None - assert get_table_names_in_pub() == {"tbl_x"} - - # adding another table is supported, but snapshot tables won't be persisted - snapshots = init_replication( - slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, - table_names=("tbl_x", "tbl_y"), - take_snapshots=True, - ) - assert snapshots is None - assert get_table_names_in_pub() == {"tbl_x", "tbl_y"} - - # removing a table is not supported - init_replication( - slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, - table_names="tbl_x", # "tbl_y" is no longer provided - ) - # "tbl_y" is still in the publication - assert get_table_names_in_pub() == {"tbl_x", "tbl_y"} - - # switching to whole schema replication is supported by omitting `table_names`, - # but only for Postgres server versions 15 or higher and with superuser privileges - is_su = is_super_user(src_pl.sql_client) - if get_pg_version() >= 150000 and is_su: - init_replication( - slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, - ) - # includes dlt system tables - assert get_table_names_in_pub() >= {"tbl_x", "tbl_y", "tbl_z"} - 
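# Aside: the `get_pg_version() >= 150000` gate above relies on Postgres encoding its
# version as an integer (major * 10000 + minor, e.g. 150004 for 15.4), and the
# superuser check mirrors the rolsuper lookup in tests/pg_legacy_replication/utils.py.
# A self-contained sketch of both checks follows; the DSN and role name are
# illustrative placeholders, not values used by this test suite.
import psycopg2
from typing import Tuple

def pg_version_and_superuser(dsn: str, rolname: str) -> Tuple[int, bool]:
    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute("SELECT current_setting('server_version_num')::int;")
        version: int = cur.fetchone()[0]
        # assumes the role exists; fetchone() would return None otherwise
        cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = %s;", (rolname,))
        is_super: bool = cur.fetchone()[0]
    return version, is_super

# version, is_super = pg_version_and_superuser("dbname=dlt_data", "loader")
# whole-schema replication is only exercised when version >= 150000 and is_super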
else: - exp_err = ( - InsufficientPrivilege if not is_su else IncompatiblePostgresVersionException - ) - with pytest.raises(exp_err): - init_replication( - slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, - ) - - -def test_replicate_schema(src_config: Tuple[dlt.Pipeline, str, str]) -> None: +def test_replicate_schema(src_config: Tuple[dlt.Pipeline, str]) -> None: if get_pg_version() < 150000: pytest.skip("incompatible Postgres server version") if not is_super_user(src_config[0].sql_client): @@ -792,7 +691,7 @@ def tbl_y(data): def tbl_z(data): yield data - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config # create two postgres tables src_pl.run( @@ -805,11 +704,10 @@ def tbl_z(data): # initialize replication and create resource init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, # we only specify `schema_name`, not `table_names` + schema=src_pl.dataset_name, # we only specify `schema`, not `table_names` publish="insert", ) - changes = replication_resource(slot_name, pub_name) + changes = replication_resource(slot_name) # change source tables and load to destination src_pl.run( @@ -834,10 +732,10 @@ def tbl_z(data): assert set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y", "tbl_z"} -def test_batching(src_config: Tuple[dlt.Pipeline, str, str]) -> None: +def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: # this test asserts the number of data items yielded by the replication resource # is not affected by `target_batch_size` and the number of replication messages per transaction - src_pl, slot_name, pub_name = src_config + src_pl, slot_name = src_config # create postgres table with single record data = {"id": 1000, "val": True} @@ -846,11 +744,10 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str, str]) -> None: # initialize replication and create resource for changes init_replication( slot_name=slot_name, - pub_name=pub_name, - schema_name=src_pl.dataset_name, + schema=src_pl.dataset_name, table_names="items", ) - changes = replication_resource(slot_name, pub_name, target_batch_size=50) + changes = replication_resource(slot_name, target_batch_size=50) # create destination pipeline and resource dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) diff --git a/tests/pg_legacy_replication/test_schema_types.py b/tests/pg_legacy_replication/test_schema_types.py deleted file mode 100644 index d4cc68b5d..000000000 --- a/tests/pg_legacy_replication/test_schema_types.py +++ /dev/null @@ -1,63 +0,0 @@ -from sources.pg_legacy_replication.schema_types import _extract_table_schema -from sources.pg_legacy_replication.pg_logicaldec_pb2 import ( - RowMessage, - Op, - TypeInfo, - DatumMessage, -) - - -def test_extract_table_schema(): - row_msg = RowMessage() - row_msg.transaction_id = 818 - row_msg.commit_time = 1728637822344316 - row_msg.table = "src_pl_dataset_202410110910185374_staging.tbl_x" - row_msg.op = Op.INSERT - row_msg.new_tuple.extend( - [ - DatumMessage(column_name="id_x", column_type=20, datum_int64=2), - DatumMessage(column_name="val_x", column_type=1043, datum_string="bar"), - DatumMessage( - column_name="_dlt_load_id", - column_type=1043, - datum_string="1728637821.2016037", - ), - DatumMessage( - column_name="_dlt_id", column_type=1043, datum_string="q52p9Y2Ac5ZXaA" - ), - ] - ) - row_msg.new_typeinfo.extend( - [ - TypeInfo(modifier="bigint"), - TypeInfo(modifier="character varying", value_optional=True), - TypeInfo(modifier="character varying"), 
- TypeInfo(modifier="character varying"), - ] - ) - assert _extract_table_schema(row_msg) == { - "name": "tbl_x", - "columns": { - "id_x": { - "data_type": "bigint", - "precision": 64, - "name": "id_x", - "nullable": False, - }, - "val_x": { - "data_type": "text", - "name": "val_x", - "nullable": True, - }, - "_dlt_load_id": { - "data_type": "text", - "name": "_dlt_load_id", - "nullable": False, - }, - "_dlt_id": { - "data_type": "text", - "name": "_dlt_id", - "nullable": False, - }, - }, - } diff --git a/tests/pg_legacy_replication/utils.py b/tests/pg_legacy_replication/utils.py index fe7695b91..5deb16af0 100644 --- a/tests/pg_legacy_replication/utils.py +++ b/tests/pg_legacy_replication/utils.py @@ -47,6 +47,6 @@ def is_super_user(sql_client) -> bool: "sources.pg_replication.credentials", ConnectionStringCredentials ).username with sql_client() as c: - return c.execute_sql( + return c.execute_sql( # type: ignore[no-any-return] f"SELECT rolsuper FROM pg_roles WHERE rolname = '{username}';" )[0][0] From 7fad621365c7fb836bc5c7f1c927b39cffd7ca5e Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sun, 20 Oct 2024 14:30:19 +0200 Subject: [PATCH 22/88] wip: saving work --- sources/pg_legacy_replication/helpers.py | 64 ++++--- tests/pg_legacy_replication/test_helpers.py | 164 +++++++++--------- .../test_pg_replication.py | 41 +++-- 3 files changed, 141 insertions(+), 128 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index c0ce62b1e..21511cf67 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -63,7 +63,7 @@ def init_replication( schema: str = dlt.config.value, table_names: Sequence[str] = dlt.config.value, credentials: ConnectionStringCredentials = dlt.secrets.value, - take_snapshots: bool = True, + take_snapshots: bool = False, include_columns: Optional[Dict[str, Sequence[str]]] = None, reset: bool = False, ) -> Optional[List[DltResource]]: @@ -400,8 +400,8 @@ def __init__( self.pub_ops = pub_ops self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.include_columns = include_columns - self.columns = columns + self.include_columns = include_columns or {} + self.columns = columns or {} self.consumed_all: bool = False # data_items attribute maintains all data items @@ -443,7 +443,11 @@ def process_msg(self, msg: ReplicationMessage) -> None: return _, table_name = row_msg.table.split(".") last_table_schema = self.last_table_schema.get(table_name) - table_schema = extract_table_schema(row_msg) + table_schema = extract_table_schema( + row_msg, + column_hints=self.columns.get(table_name), + include_columns=self.include_columns.get(table_name), + ) if last_table_schema is None: self.last_table_schema[table_name] = table_schema elif last_table_schema != table_schema: @@ -464,7 +468,11 @@ def process_msg(self, msg: ReplicationMessage) -> None: return _, table_name = row_msg.table.split(".") last_table_schema = self.last_table_schema.get(table_name) - table_schema = extract_table_schema(row_msg) + table_schema = extract_table_schema( + row_msg, + column_hints=self.columns.get(table_name), + include_columns=self.include_columns.get(table_name), + ) if last_table_schema is None: self.last_table_schema[table_name] = table_schema elif last_table_schema != table_schema: @@ -481,8 +489,6 @@ def process_msg(self, msg: ReplicationMessage) -> None: ) self.data_items[table_name].append(data_item) elif op == Op.DELETE: - debug(msg) - debug(MessageToDict(row_msg, 
including_default_value_fields=True)) # type: ignore[call-arg] if row_msg.table not in self.table_qnames: return _, table_name = row_msg.table.split(".") @@ -655,13 +661,27 @@ def process_commit(self, msg: ReplicationMessage) -> None: def extract_table_schema( - row_msg: RowMessage, *, include_columns: Optional[Sequence[str]] = None + row_msg: RowMessage, + *, + column_hints: Optional[TTableSchemaColumns] = None, + include_columns: Optional[Sequence[str]] = None, ) -> TTableSchema: - columns: TTableSchemaColumns = {} + columns: TTableSchemaColumns = { + "lsn": { + "data_type": "bigint", + "nullable": True, + "dedup_sort": "desc", + }, + "deleted_ts": { + "data_type": "timestamp", + "nullable": True, + "hard_delete": True, + }, + } type_mapper = _type_mapper() for col, col_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): col_name = col.column_name - if include_columns is not None and col_name not in include_columns: + if include_columns and col_name not in include_columns: continue assert ( _PG_TYPES[col.column_type] == col_info.modifier @@ -674,26 +694,14 @@ def extract_table_schema( } precision = _DATUM_PRECISIONS.get(col.WhichOneof("datum")) - if precision is not None: + if precision: col_schema["precision"] = precision - columns[col_name] = col_schema - - # Add replication columns - columns.update( - { - "lsn": { - "data_type": "bigint", - "nullable": True, - "dedup_sort": "desc", - }, - "deleted_ts": { - "data_type": "timestamp", - "nullable": True, - "hard_delete": True, - }, - } - ) + columns[col_name] = ( + merge_column(col_schema, column_hints.get(col_name)) + if column_hints and column_hints.get(col_name) + else col_schema + ) _, table_name = row_msg.table.split(".") return {"name": table_name, "columns": columns} diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 8ea0f8a9d..6603d421e 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -1,4 +1,9 @@ +from typing import Optional + import pendulum +import pytest +from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns +from google.protobuf.json_format import ParseDict as parse_dict from sources.pg_legacy_replication.helpers import ( extract_table_schema, @@ -6,95 +11,84 @@ gen_delete_item, ) from sources.pg_legacy_replication.pg_logicaldec_pb2 import RowMessage -from google.protobuf.json_format import ParseDict as parse_dict -def test_extract_table_schema(): +@pytest.mark.parametrize( + "data, column_hints, expected_schema", + [ + ( + { + "transactionId": 969, + "commitTime": "1728662646949062", + "table": "src_pl_dataset_202410110404048747_staging.tbl_y", + "op": "INSERT", + "newTuple": [ + {"columnName": "id_y", "columnType": "20", "datumInt64": 2}, + {"columnName": "val_y", "columnType": "16", "datumBool": False}, + { + "columnName": "_dlt_load_id", + "columnType": "1043", + "datumString": "1728662646.2657657", + }, + { + "columnName": "_dlt_id", + "columnType": "1043", + "datumString": "gGjifTMTAUs5ag", + }, + ], + "newTypeinfo": [ + {"modifier": "bigint", "valueOptional": False}, + {"modifier": "boolean", "valueOptional": True}, + {"modifier": "character varying", "valueOptional": False}, + {"modifier": "character varying", "valueOptional": False}, + ], + "oldTuple": [], + }, + {"id_y": {"primary_key": True}}, + { + "name": "tbl_y", + "columns": { + "id_y": { + "data_type": "bigint", + "precision": 64, + "name": "id_y", + "nullable": False, + "primary_key": True, + }, + "val_y": {"data_type": 
"bool", "name": "val_y", "nullable": True}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "_dlt_id": { + "data_type": "text", + "name": "_dlt_id", + "nullable": False, + }, + "lsn": { + "data_type": "bigint", + "dedup_sort": "desc", + "nullable": True, + }, + "deleted_ts": { + "data_type": "timestamp", + "hard_delete": True, + "nullable": True, + }, + }, + }, + ), + ], +) +def test_extract_table_schema( + data, + column_hints: Optional[TTableSchemaColumns], + expected_schema: TTableSchema, +): row_msg = RowMessage() - data = { - "transactionId": 969, - "commitTime": "1728662646949062", - "table": "src_pl_dataset_202410110404048747_staging.tbl_y", - "op": "INSERT", - "newTuple": [ - { - "columnName": "id_y", - "columnType": "20", - "datumInt64": 2, - }, - { - "columnName": "val_y", - "columnType": "16", - "datumBool": False, - }, - { - "columnName": "_dlt_load_id", - "columnType": "1043", - "datumString": "1728662646.2657657", - }, - { - "columnName": "_dlt_id", - "columnType": "1043", - "datumString": "gGjifTMTAUs5ag", - }, - ], - "newTypeinfo": [ - { - "modifier": "bigint", - "valueOptional": False, - }, - { - "modifier": "boolean", - "valueOptional": True, - }, - { - "modifier": "character varying", - "valueOptional": False, - }, - { - "modifier": "character varying", - "valueOptional": False, - }, - ], - "oldTuple": [], - } parse_dict(data, row_msg) - assert extract_table_schema(row_msg) == { - "name": "tbl_y", - "columns": { - "id_y": { - "data_type": "bigint", - "precision": 64, - "name": "id_y", - "nullable": False, - }, - "val_y": { - "data_type": "bool", - "name": "val_y", - "nullable": True, - }, - "_dlt_load_id": { - "data_type": "text", - "name": "_dlt_load_id", - "nullable": False, - }, - "_dlt_id": { - "data_type": "text", - "name": "_dlt_id", - "nullable": False, - }, - "lsn": { - "data_type": "bigint", - "dedup_sort": "desc", - "nullable": True, - }, - "deleted_ts": { - "data_type": "timestamp", - "hard_delete": True, - "nullable": True, - }, - }, - } + assert extract_table_schema(row_msg, column_hints=column_hints) == expected_schema def test_gen_data_item(): diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 8b6f1eac5..b610a4480 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -71,7 +71,7 @@ def tbl_y(data): ) dest_pl = dlt.pipeline( - pipeline_name="dest_pl", destination=destination_name, full_refresh=True + pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) # initial load @@ -186,7 +186,16 @@ def tbl_y(data): schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), ) - changes = replication_resource(slot_name) + + changes = replication_resource( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y"), + columns={ + "tbl_x": {"id_x": {"primary_key": True}}, + "tbl_y": {"id_y": {"primary_key": True}}, + }, + ) # change postgres table after replication has been initialized # these records should be in the replicated table @@ -199,9 +208,9 @@ def tbl_y(data): # load changes to destination and assert expectations dest_pl = dlt.pipeline( - pipeline_name="dest_pl", destination=destination_name, full_refresh=True + pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) - info = dest_pl.run(changes) + info = dest_pl.run(changes, write_disposition="merge") assert_load_info(info) assert load_table_counts(dest_pl, 
"tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 1} exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] @@ -215,7 +224,7 @@ def tbl_y(data): c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 2;") # process change and assert expectations - info = dest_pl.run(changes) + info = dest_pl.run(changes, write_disposition="merge") assert_load_info(info) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} exp_tbl_x = [{"id_x": 3, "val_x": "baz"}] @@ -240,13 +249,15 @@ def items(data): table_names="items", publish="insert", ) - changes = replication_resource(slot_name) + changes = replication_resource( + slot_name=slot_name, schema=src_pl.dataset_name, table_names="items" + ) # insert a record in postgres table src_pl.run(items({"id": 2, "foo": "bar"})) # extract items from resource - dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) + dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) extract_info = dest_pl.extract(changes) assert get_table_metrics(extract_info, "items")["items_count"] == 1 @@ -302,7 +313,7 @@ def items(data): # initial load dest_pl = dlt.pipeline( - pipeline_name="dest_pl", destination=destination_name, full_refresh=True + pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) if init_load: info = dest_pl.run(snapshot) @@ -402,7 +413,7 @@ def test_unmapped_data_types( # run destination pipeline and assert resulting data types dest_pl = dlt.pipeline( - pipeline_name="dest_pl", destination=destination_name, full_refresh=True + pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) dest_pl.extract(changes) dest_pl.normalize() @@ -439,7 +450,7 @@ def items(data): # assert write disposition on tables dispatched by changes resource changes = replication_resource(slot_name) src_pl.run(items({"id": 2, "val": True})) - dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) + dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) dest_pl.extract(changes) assert ( dest_pl.default_schema.get_table("items")["write_disposition"] @@ -511,7 +522,7 @@ def tbl_z(data): # load to destination and assert column expectations dest_pl = dlt.pipeline( - pipeline_name="dest_pl", destination=destination_name, full_refresh=True + pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) if init_load: dest_pl.run(snapshots) @@ -579,7 +590,7 @@ def tbl_z(data): # load to destination and assert column expectations dest_pl = dlt.pipeline( - pipeline_name="dest_pl", destination=destination_name, full_refresh=True + pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) if init_load: dest_pl.run(snapshots) @@ -647,7 +658,7 @@ def test_table_schema_change( # create resource and pipeline changes = replication_resource(slot_name) dest_pl = dlt.pipeline( - pipeline_name="dest_pl", destination=destination_name, full_refresh=True + pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) # add a column in one commit, this will create one Relation message @@ -716,7 +727,7 @@ def tbl_z(data): tbl_y({"id_y": 2, "val_y": "foo"}), ] ) - dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) + dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) dest_pl.extract(changes) assert set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y"} @@ -750,7 +761,7 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: changes = replication_resource(slot_name, target_batch_size=50) # create destination pipeline and resource - 
dest_pl = dlt.pipeline(pipeline_name="dest_pl", full_refresh=True) + dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) # insert 100 records into source table in one transaction batch = [{**r, **{"id": key}} for r in [data] for key in range(1, 101)] From fbc65bc694c7d0409fffa0e4b0505a79d5f726f4 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 21 Oct 2024 01:19:56 +0200 Subject: [PATCH 23/88] wip: saving work --- sources/pg_legacy_replication/__init__.py | 6 +- sources/pg_legacy_replication/helpers.py | 290 +++++++++++------- sources/pg_legacy_replication_pipeline.py | 2 +- .../test_pg_replication.py | 49 +-- 4 files changed, 181 insertions(+), 166 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 3b74295dc..235799451 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -21,7 +21,7 @@ def replication_resource( schema: str = dlt.config.value, table_names: Sequence[str] = dlt.config.value, credentials: ConnectionStringCredentials = dlt.secrets.value, - include_columns: Optional[Dict[str, Sequence[str]]] = None, + included_columns: Optional[Dict[str, Sequence[str]]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, target_batch_size: int = 1000, flush_slot: bool = True, @@ -38,7 +38,7 @@ def replication_resource( Args: slot_name (str): Name of the replication slot to consume replication messages from. credentials (ConnectionStringCredentials): Postgres database credentials. - include_columns (Optional[Dict[str, Sequence[str]]]): Maps table name(s) to + included_columns (Optional[Dict[str, Sequence[str]]]): Maps table name(s) to sequence of names of columns to include in the generated data items. Any column not in the sequence is excluded. If not provided, all columns are included. For example: @@ -96,7 +96,7 @@ def replication_resource( upto_lsn=upto_lsn, start_lsn=start_lsn, target_batch_size=target_batch_size, - include_columns=include_columns, + included_columns=included_columns, columns=columns, ) yield from gen diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 21511cf67..3ccf2b0e7 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -64,7 +64,8 @@ def init_replication( table_names: Sequence[str] = dlt.config.value, credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = False, - include_columns: Optional[Dict[str, Sequence[str]]] = None, + included_columns: Optional[Dict[str, Sequence[str]]] = None, + columns: Optional[Dict[str, TTableSchemaColumns]] = None, reset: bool = False, ) -> Optional[List[DltResource]]: """Initializes replication for one, several, or all tables within a schema. @@ -94,12 +95,12 @@ def init_replication( resources (`DltResource` objects) for these tables are created and returned. The resources can be used to perform an initial load of all data present in the tables at the moment the replication slot got created. - include_columns (Optional[Dict[str, Sequence[str]]]): Maps table name(s) to + included_columns (Optional[Dict[str, Sequence[str]]]): Maps table name(s) to sequence of names of columns to include in the snapshot table(s). Any column not in the sequence is excluded. If not provided, all columns are included. 
For example: ``` - include_columns={ + included_columns={ "table_x": ["col_a", "col_c"], "table_y": ["col_x", "col_y", "col_z"], } @@ -143,13 +144,15 @@ def on_begin(conn: ConnectionSqla) -> None: cur.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ") cur.execute(f"SET TRANSACTION SNAPSHOT '{slot['snapshot_name']}'") - include_columns = include_columns or {} + included_columns = included_columns or {} + columns = columns or {} return [ - sql_table( # type: ignore[name-defined] - credentials=engine, - table=table_name, - schema=schema, - included_columns=include_columns.get(table_name), + _prepare_snapshot_resource( + engine, + table_name, + schema, + included_columns=included_columns.get(table_name), + columns=columns.get(table_name), ) for table_name in table_names ] @@ -173,6 +176,25 @@ def on_engine_disposed(engine: Engine) -> None: return engine +def _prepare_snapshot_resource( + engine: Engine, + table_name: str, + schema: str, + *, + included_columns: Optional[Sequence[str]] = None, + columns: Optional[TTableSchemaColumns] = None, +) -> DltResource: + t_rsrc: DltResource = sql_table( # type: ignore[name-defined] + credentials=engine, + table=table_name, + schema=schema, + included_columns=included_columns, + ) + if columns: + t_rsrc.apply_hints(columns=columns) + return t_rsrc + + def cleanup_snapshot_resources(snapshots: List[DltResource]) -> None: """FIXME Awful hack to release the underlying SQL engine when snapshotting tables""" if not snapshots: @@ -321,7 +343,7 @@ class ItemGenerator: upto_lsn: int start_lsn: int = 0 target_batch_size: int = 1000 - include_columns: Optional[Dict[str, Sequence[str]]] = (None,) # type: ignore[assignment] + included_columns: Optional[Dict[str, Sequence[str]]] = (None,) # type: ignore[assignment] columns: Optional[Dict[str, TTableSchemaColumns]] = (None,) # type: ignore[assignment] last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -352,7 +374,7 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: pub_ops=pub_opts, table_qnames=self.table_qnames, target_batch_size=self.target_batch_size, - include_columns=self.include_columns, + included_columns=self.included_columns, columns=self.columns, ) cur.consume_stream(consumer) @@ -393,14 +415,14 @@ def __init__( pub_ops: Dict[str, bool], table_qnames: Set[str], target_batch_size: int = 1000, - include_columns: Optional[Dict[str, Sequence[str]]] = None, + included_columns: Optional[Dict[str, Sequence[str]]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, ) -> None: self.upto_lsn = upto_lsn self.pub_ops = pub_ops self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.include_columns = include_columns or {} + self.included_columns = included_columns or {} self.columns = columns or {} self.consumed_all: bool = False @@ -432,88 +454,96 @@ def process_msg(self, msg: ReplicationMessage) -> None: - a table's schema has changed """ row_msg = RowMessage() - row_msg.ParseFromString(msg.payload) - op = row_msg.op - if op == Op.BEGIN: - self.last_commit_ts = convert_pg_ts(row_msg.commit_time) # type: ignore[assignment] - elif op == Op.COMMIT: - self.process_commit(msg) - elif op == Op.INSERT: - if row_msg.table not in self.table_qnames: - return - _, table_name = row_msg.table.split(".") - last_table_schema = self.last_table_schema.get(table_name) - table_schema = extract_table_schema( - row_msg, - column_hints=self.columns.get(table_name), - include_columns=self.include_columns.get(table_name), - ) - 
if last_table_schema is None: - self.last_table_schema[table_name] = table_schema - elif last_table_schema != table_schema: - raise StopReplication # table schema change - data_item = gen_data_item( - row_msg, - table_schema["columns"], - lsn=msg.data_start, - include_columns=( - None - if self.include_columns is None - else self.include_columns.get(table_name) - ), - ) - self.data_items[table_name].append(data_item) - elif op == Op.UPDATE: - if row_msg.table not in self.table_qnames: - return - _, table_name = row_msg.table.split(".") - last_table_schema = self.last_table_schema.get(table_name) - table_schema = extract_table_schema( - row_msg, - column_hints=self.columns.get(table_name), - include_columns=self.include_columns.get(table_name), - ) - if last_table_schema is None: - self.last_table_schema[table_name] = table_schema - elif last_table_schema != table_schema: - raise StopReplication # table schema change - data_item = gen_data_item( - row_msg, - table_schema["columns"], - lsn=msg.data_start, - include_columns=( - None - if self.include_columns is None - else self.include_columns.get(table_name) - ), - ) - self.data_items[table_name].append(data_item) - elif op == Op.DELETE: - if row_msg.table not in self.table_qnames: - return - _, table_name = row_msg.table.split(".") - data_item = gen_delete_item( - row_msg, - lsn=msg.data_start, - include_columns=( - None - if self.include_columns is None - else self.include_columns.get(table_name) - ), + try: + row_msg.ParseFromString(msg.payload) + op = row_msg.op + if op == Op.BEGIN: + self.last_commit_ts = convert_pg_ts(row_msg.commit_time) # type: ignore[assignment] + elif op == Op.COMMIT: + self.process_commit(msg) + elif op == Op.INSERT: + if row_msg.table not in self.table_qnames: + return + _, table_name = row_msg.table.split(".") + last_table_schema = self.last_table_schema.get(table_name) + table_schema = extract_table_schema( + row_msg, + column_hints=self.columns.get(table_name), + included_columns=self.included_columns.get(table_name), + ) + if last_table_schema is None: + self.last_table_schema[table_name] = table_schema + elif last_table_schema != table_schema: + raise StopReplication # table schema change + data_item = gen_data_item( + row_msg, + table_schema["columns"], + lsn=msg.data_start, + included_columns=( + None + if self.included_columns is None + else self.included_columns.get(table_name) + ), + ) + self.data_items[table_name].append(data_item) + elif op == Op.UPDATE: + if row_msg.table not in self.table_qnames: + return + _, table_name = row_msg.table.split(".") + last_table_schema = self.last_table_schema.get(table_name) + table_schema = extract_table_schema( + row_msg, + column_hints=self.columns.get(table_name), + included_columns=self.included_columns.get(table_name), + ) + if last_table_schema is None: + self.last_table_schema[table_name] = table_schema + elif last_table_schema != table_schema: + raise StopReplication # table schema change + data_item = gen_data_item( + row_msg, + table_schema["columns"], + lsn=msg.data_start, + included_columns=( + None + if self.included_columns is None + else self.included_columns.get(table_name) + ), + ) + self.data_items[table_name].append(data_item) + elif op == Op.DELETE: + if row_msg.table not in self.table_qnames: + return + _, table_name = row_msg.table.split(".") + data_item = gen_delete_item( + row_msg, + lsn=msg.data_start, + included_columns=( + None + if self.included_columns is None + else self.included_columns.get(table_name) + ), + ) + 
self.data_items[table_name].append(data_item) + # op = msg.payload[:1] + # elif op == b"R": + # self.process_relation(Relation(msg.payload)) + # elif op == b"T": + # logger.warning( + # "The truncate operation is currently not supported. " + # "Truncate replication messages are ignored." + # ) + else: + debug(msg) + debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] + raise AssertionError(f"Unsupported operation : {row_msg}") + except StopReplication: + raise + except Exception: + logger.error( + "A fatal error occured while processing a message: %s", row_msg ) - self.data_items[table_name].append(data_item) - # op = msg.payload[:1] - # elif op == b"R": - # self.process_relation(Relation(msg.payload)) - # elif op == b"T": - # logger.warning( - # "The truncate operation is currently not supported. " - # "Truncate replication messages are ignored." - # ) - else: - debug(msg) - debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] - raise AssertionError(f"Unsupported operation : {row_msg}") + raise def process_commit(self, msg: ReplicationMessage) -> None: """Updates object state when Commit message is observed. @@ -664,7 +694,7 @@ def extract_table_schema( row_msg: RowMessage, *, column_hints: Optional[TTableSchemaColumns] = None, - include_columns: Optional[Sequence[str]] = None, + included_columns: Optional[Sequence[str]] = None, ) -> TTableSchema: columns: TTableSchemaColumns = { "lsn": { @@ -681,21 +711,49 @@ def extract_table_schema( type_mapper = _type_mapper() for col, col_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): col_name = col.column_name - if include_columns and col_name not in include_columns: + if included_columns and col_name not in included_columns: continue - assert ( - _PG_TYPES[col.column_type] == col_info.modifier - ), f"Type mismatch for column {col_name}" - col_type: TColumnType = type_mapper.from_db_type(col_info.modifier) + db_type = _PG_TYPES[col.column_type] + col_type: TColumnType = type_mapper.from_db_type(db_type) col_schema: TColumnSchema = { "name": col_name, "nullable": col_info.value_optional, **col_type, } + if db_type == "character varying": + import re - precision = _DATUM_PRECISIONS.get(col.WhichOneof("datum")) - if precision: + match = re.search(r"character varying\((\d+)\)", col_info.modifier) + if match: + col_schema["precision"] = int(match.group(1)) + elif db_type == "numeric": + import re + + match = re.search(r"numeric\((\d+),(\d+)\)", col_info.modifier) + precision, scale = map(int, match.groups()) col_schema["precision"] = precision + col_schema["scale"] = scale + elif db_type == "timestamp with time zone": + import re + + match = re.search(r"timestamp\((\d+)\) with time zone", col_info.modifier) + if match: + col_schema["precision"] = int(match.group(1)) + # col_schema["timezone"] = True FIXME + elif db_type == "time without time zone": + import re + + match = re.search(r"time\((\d+)\) without time zone", col_info.modifier) + if match: + col_schema["precision"] = int(match.group(1)) + # col_schema["timezone"] = False FIXME + else: + assert ( + _PG_TYPES[col.column_type] == col_info.modifier + ), f"Type mismatch for column {col_name}" + + if precision := _DATUM_PRECISIONS.get(col.WhichOneof("datum")): + col_schema["precision"] = precision columns[col_name] = ( merge_column(col_schema, column_hints.get(col_name)) @@ -712,23 +770,17 @@ def gen_data_item( column_schema: TTableSchemaColumns, *, lsn: int, - include_columns: Optional[Sequence[str]] = None, + 
included_columns: Optional[Sequence[str]] = None, ) -> TDataItem: """Generates data item from a `RowMessage` and corresponding metadata.""" - assert row_msg.op in (Op.INSERT, Op.UPDATE, Op.DELETE) - column_data = ( - row_msg.new_tuple if row_msg.op in (Op.INSERT, Op.UPDATE) else row_msg.old_tuple - ) - - data_item = { - data.column_name: getattr(data, data.WhichOneof("datum")) - for schema, data in zip(column_schema.values(), column_data) - if include_columns is None or data.column_name in include_columns - } - - data_item["lsn"] = lsn - if row_msg.op == Op.DELETE: - data_item["deleted_ts"] = _convert_pg_timestamp(row_msg.commit_time) + assert row_msg.op in (Op.INSERT, Op.UPDATE) + data_item = {"lsn": lsn} + for data in row_msg.new_tuple: + if included_columns and data.column_name not in included_columns: + continue + datum = data.WhichOneof("datum") + assert datum or column_schema[data.column_name]["nullable"] + data_item[data.column_name] = getattr(data, datum) if datum else None return data_item @@ -737,7 +789,7 @@ def gen_delete_item( row_msg: RowMessage, *, lsn: int, - include_columns: Optional[Sequence[str]] = None, + included_columns: Optional[Sequence[str]] = None, ) -> TDataItem: """Generates data item from a `RowMessage` and corresponding metadata.""" assert row_msg.op == Op.DELETE @@ -747,7 +799,7 @@ def gen_delete_item( data_item = {} for data in column_data: - if include_columns and data.column_name not in include_columns: + if included_columns and data.column_name not in included_columns: continue datum_name = data.WhichOneof("datum") if datum_name: diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index 95f02fafc..2f84b9ec4 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -204,7 +204,7 @@ def replicate_with_column_selection() -> None: # create a resource that generates items for each change in the schema's tables changes = replication_resource( slot_name=slot_name, - include_columns={ + included_columns={ "tbl_x": ("c1", "c2") }, # columns not specified here are excluded from generated data items ) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index b610a4480..c8d27be63 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -233,45 +233,6 @@ def tbl_y(data): assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") -def test_insert_only(src_config: Tuple[dlt.Pipeline, str]) -> None: - def items(data): - yield data - - src_pl, slot_name = src_config - - # create postgres table with single record - src_pl.run(items({"id": 1, "foo": "bar"})) - - # initialize replication and create resource for changes - init_replication( - slot_name=slot_name, - schema=src_pl.dataset_name, - table_names="items", - publish="insert", - ) - changes = replication_resource( - slot_name=slot_name, schema=src_pl.dataset_name, table_names="items" - ) - - # insert a record in postgres table - src_pl.run(items({"id": 2, "foo": "bar"})) - - # extract items from resource - dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) - extract_info = dest_pl.extract(changes) - assert get_table_metrics(extract_info, "items")["items_count"] == 1 - - # do an update and a delete—these operations should not lead to items in the resource - with src_pl.sql_client() as c: - qual_name = src_pl.sql_client().make_qualified_table_name("items") - 
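# Aside on the gen_data_item / gen_delete_item changes in this patch: values are read
# from the pg_logicaldec protobuf via the "datum" oneof, so WhichOneof() names whichever
# datum_* field is set and returns None for SQL NULLs. A hand-built DatumMessage
# illustrating that lookup is shown below; the column name, OID and value are made up.
from sources.pg_legacy_replication.pg_logicaldec_pb2 import DatumMessage

datum = DatumMessage(column_name="val_x", column_type=1043, datum_string="bar")
which = datum.WhichOneof("datum")                  # -> "datum_string"
value = getattr(datum, which) if which else None   # -> "bar"; None stands in for NULL
assert (which, value) == ("datum_string", "bar")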
c.execute_sql(f"UPDATE {qual_name} SET foo = 'baz' WHERE id = 2;") - c.execute_sql(f"DELETE FROM {qual_name} WHERE id = 2;") - extract_info = dest_pl.extract(changes) - assert ( - get_table_metrics(extract_info, "items") is None - ) # there should be no metrics for the "items" table - - @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("give_hints", [True, False]) @pytest.mark.parametrize("init_load", [True, False]) @@ -301,13 +262,15 @@ def items(data): snapshot = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, - table_names="items", + table_names=("items",), take_snapshots=init_load, columns={"items": column_schema} if give_hints else None, ) changes = replication_resource( slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("items",), columns={"items": column_schema} if give_hints else None, ) @@ -327,7 +290,7 @@ def items(data): r2["col1"] = 2 src_pl.run(items([r1, r2])) - info = dest_pl.run(changes) + info = dest_pl.run(changes, write_disposition="merge") assert_load_info(info) assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 @@ -351,7 +314,7 @@ def items(data): src_pl.run(items([r1, r2])) # process changes and assert expectations - info = dest_pl.run(changes) + info = dest_pl.run(changes, write_disposition="merge") assert_load_info(info) assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 exp = [ @@ -373,7 +336,7 @@ def items(data): c.execute_sql(f"UPDATE {qual_name} SET col2 = 2.5 WHERE col1 = 2;") # process change and assert expectation - info = dest_pl.run(changes) + info = dest_pl.run(changes, write_disposition="merge") assert_load_info(info) assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 exp = [{"col1": 2, "col2": 2.5, "col3": False}] From 1299b606602b092493ad50b8912b111708f2ac7b Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 21 Oct 2024 11:06:56 +0200 Subject: [PATCH 24/88] wip: cleaning up + refactor --- sources/pg_legacy_replication/helpers.py | 403 +++++++---------------- 1 file changed, 124 insertions(+), 279 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 3ccf2b0e7..a6f7c8a50 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -334,73 +334,6 @@ def _get_rep_conn( return _get_conn(credentials, LogicalReplicationConnection) # type: ignore[return-value] -@dataclass -class ItemGenerator: - credentials: ConnectionStringCredentials - slot_name: str - table_qnames: Set[str] - options: Dict[str, str] - upto_lsn: int - start_lsn: int = 0 - target_batch_size: int = 1000 - included_columns: Optional[Dict[str, Sequence[str]]] = (None,) # type: ignore[assignment] - columns: Optional[Dict[str, TTableSchemaColumns]] = (None,) # type: ignore[assignment] - last_commit_lsn: Optional[int] = field(default=None, init=False) - generated_all: bool = False - - def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: - """Yields replication messages from MessageConsumer. - - Starts replication of messages published by the publication from the replication slot. - Maintains LSN of last consumed Commit message in object state. - Does not advance the slot. 
- """ - try: - cur = _get_rep_conn(self.credentials).cursor() - cur.start_replication( - slot_name=self.slot_name, - start_lsn=self.start_lsn, - decode=False, - options=self.options, - ) - pub_opts = { - "insert": True, - "update": True, - "delete": True, - "truncate": False, - } - consumer = MessageConsumer( - upto_lsn=self.upto_lsn, - pub_ops=pub_opts, - table_qnames=self.table_qnames, - target_batch_size=self.target_batch_size, - included_columns=self.included_columns, - columns=self.columns, - ) - cur.consume_stream(consumer) - except StopReplication: # completed batch or reached `upto_lsn` - pass - finally: - cur.connection.close() - self.last_commit_lsn = consumer.last_commit_lsn - for table_name, data_items in consumer.data_items.items(): - table_schema = consumer.last_table_schema.get(table_name) - if table_schema: - assert table_name == table_schema["name"] - yield dlt.mark.with_hints( # meta item with column hints only, no data - [], - dlt.mark.make_hints( - table_name=table_name, columns=table_schema["columns"] - ), - create_table_variant=True, - ) - yield dlt.mark.with_table_name(data_items, table_name) - self.generated_all = consumer.consumed_all - - -from devtools import debug - - class MessageConsumer: """Consumes messages from a ReplicationCursor sequentially. @@ -412,14 +345,12 @@ class MessageConsumer: def __init__( self, upto_lsn: int, - pub_ops: Dict[str, bool], table_qnames: Set[str], target_batch_size: int = 1000, included_columns: Optional[Dict[str, Sequence[str]]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, ) -> None: self.upto_lsn = upto_lsn - self.pub_ops = pub_ops self.table_qnames = table_qnames self.target_batch_size = target_batch_size self.included_columns = included_columns or {} @@ -435,9 +366,9 @@ def __init__( # other attributes only maintain last-seen values self.last_table_schema: Dict[ str, TTableSchema - ] = dict() # maps relation_id to table schema + ] = dict() # maps table name to table schema self.last_commit_ts: pendulum.DateTime - self.last_commit_lsn = None + self.last_commit_lsn: Optional[int] = None def __call__(self, msg: ReplicationMessage) -> None: """Processes message received from stream.""" @@ -458,84 +389,16 @@ def process_msg(self, msg: ReplicationMessage) -> None: row_msg.ParseFromString(msg.payload) op = row_msg.op if op == Op.BEGIN: - self.last_commit_ts = convert_pg_ts(row_msg.commit_time) # type: ignore[assignment] + self.last_commit_ts = _convert_db_timestamp(row_msg.commit_time) elif op == Op.COMMIT: - self.process_commit(msg) + self.process_commit(msg.data_start) elif op == Op.INSERT: - if row_msg.table not in self.table_qnames: - return - _, table_name = row_msg.table.split(".") - last_table_schema = self.last_table_schema.get(table_name) - table_schema = extract_table_schema( - row_msg, - column_hints=self.columns.get(table_name), - included_columns=self.included_columns.get(table_name), - ) - if last_table_schema is None: - self.last_table_schema[table_name] = table_schema - elif last_table_schema != table_schema: - raise StopReplication # table schema change - data_item = gen_data_item( - row_msg, - table_schema["columns"], - lsn=msg.data_start, - included_columns=( - None - if self.included_columns is None - else self.included_columns.get(table_name) - ), - ) - self.data_items[table_name].append(data_item) + self.process_change(row_msg, msg.data_start) elif op == Op.UPDATE: - if row_msg.table not in self.table_qnames: - return - _, table_name = row_msg.table.split(".") - last_table_schema = 
self.last_table_schema.get(table_name) - table_schema = extract_table_schema( - row_msg, - column_hints=self.columns.get(table_name), - included_columns=self.included_columns.get(table_name), - ) - if last_table_schema is None: - self.last_table_schema[table_name] = table_schema - elif last_table_schema != table_schema: - raise StopReplication # table schema change - data_item = gen_data_item( - row_msg, - table_schema["columns"], - lsn=msg.data_start, - included_columns=( - None - if self.included_columns is None - else self.included_columns.get(table_name) - ), - ) - self.data_items[table_name].append(data_item) + self.process_change(row_msg, msg.data_start) elif op == Op.DELETE: - if row_msg.table not in self.table_qnames: - return - _, table_name = row_msg.table.split(".") - data_item = gen_delete_item( - row_msg, - lsn=msg.data_start, - included_columns=( - None - if self.included_columns is None - else self.included_columns.get(table_name) - ), - ) - self.data_items[table_name].append(data_item) - # op = msg.payload[:1] - # elif op == b"R": - # self.process_relation(Relation(msg.payload)) - # elif op == b"T": - # logger.warning( - # "The truncate operation is currently not supported. " - # "Truncate replication messages are ignored." - # ) + self.process_delete(row_msg, msg.data_start) else: - debug(msg) - debug(MessageToDict(row_msg, including_default_value_fields=True)) # type: ignore[call-arg] raise AssertionError(f"Unsupported operation : {row_msg}") except StopReplication: raise @@ -545,13 +408,13 @@ def process_msg(self, msg: ReplicationMessage) -> None: ) raise - def process_commit(self, msg: ReplicationMessage) -> None: + def process_commit(self, lsn: int) -> None: """Updates object state when Commit message is observed. Raises StopReplication when `upto_lsn` or `target_batch_size` is reached. """ - self.last_commit_lsn = msg.data_start - if msg.data_start >= self.upto_lsn: + self.last_commit_lsn = lsn + if lsn >= self.upto_lsn: self.consumed_all = True n_items = sum( [len(items) for items in self.data_items.values()] @@ -559,122 +422,111 @@ def process_commit(self, msg: ReplicationMessage) -> None: if self.consumed_all or n_items >= self.target_batch_size: raise StopReplication - # def process_relation(self, decoded_msg: Relation) -> None: - # """Processes a replication message of type Relation. - # - # Stores table schema in object state. - # Creates meta item to emit column hints while yielding data. - # - # Raises StopReplication when a table's schema changes. 
- # """ - # if ( - # self.data_items.get(decoded_msg.relation_id) is not None - # ): # table schema change - # raise StopReplication - # # get table schema information from source and store in object state - # table_name = decoded_msg.relation_name - # columns: TTableSchemaColumns = { - # c.name: _to_dlt_column_schema(c) for c in decoded_msg.columns - # } - # self.last_table_schema[decoded_msg.relation_id] = { - # "name": table_name, - # "columns": columns, - # } - # - # # apply user input - # # 1) exclude columns - # include_columns = ( - # None - # if self.include_columns is None - # else self.include_columns.get(table_name) - # ) - # if include_columns is not None: - # columns = {k: v for k, v in columns.items() if k in include_columns} - # # 2) override source hints - # column_hints: TTableSchemaColumns = ( - # dict() if self.columns is None else self.columns.get(table_name, dict()) - # ) - # for column_name, column_val in column_hints.items(): - # columns[column_name] = merge_column(columns[column_name], column_val) - # - # # add hints for replication columns - # columns["lsn"] = {"data_type": "bigint", "nullable": True} - # if self.pub_ops["update"] or self.pub_ops["delete"]: - # columns["lsn"]["dedup_sort"] = "desc" - # if self.pub_ops["delete"]: - # columns["deleted_ts"] = { - # "hard_delete": True, - # "data_type": "timestamp", - # "nullable": True, - # } - # - # # determine write disposition - # write_disposition: TWriteDisposition = "append" - # if self.pub_ops["update"] or self.pub_ops["delete"]: - # write_disposition = "merge" - # - # # include meta item to emit hints while yielding data - # meta_item = dlt.mark.with_hints( - # [], - # dlt.mark.make_hints( - # table_name=table_name, - # write_disposition=write_disposition, - # columns=columns, - # ), - # create_table_variant=True, - # ) - # self.data_items[decoded_msg.relation_id] = [meta_item] - - # def process_change( - # self, decoded_msg: Union[Insert, Update, Delete], msg_start_lsn: int - # ) -> None: - # """Processes replication message of type Insert, Update, or Delete. - # - # Adds data item for inserted/updated/deleted record to instance attribute. 
- # """ - # if isinstance(decoded_msg, (Insert, Update)): - # column_data = decoded_msg.new_tuple.column_data - # elif isinstance(decoded_msg, Delete): - # column_data = decoded_msg.old_tuple.column_data - # table_name = self.last_table_schema[decoded_msg.relation_id]["name"] - # data_item = self.gen_data_item( - # data=column_data, - # column_schema=self.last_table_schema[decoded_msg.relation_id]["columns"], - # lsn=msg_start_lsn, - # commit_ts=self.last_commit_ts, - # for_delete=isinstance(decoded_msg, Delete), - # include_columns=( - # None - # if self.include_columns is None - # else self.include_columns.get(table_name) - # ), - # ) - # self.data_items[decoded_msg.relation_id].append(data_item) - # - # @staticmethod - # def gen_data_item( - # data: List[ColumnData], - # column_schema: TTableSchemaColumns, - # lsn: int, - # commit_ts: pendulum.DateTime, - # for_delete: bool, - # include_columns: Optional[Sequence[str]] = None, - # ) -> TDataItem: - # """Generates data item from replication message data and corresponding metadata.""" - # data_item = { - # schema["name"]: _to_dlt_val( - # val=data.col_data, - # data_type=schema["data_type"], - # byte1=data.col_data_category, - # for_delete=for_delete, - # ) - # for (schema, data) in zip(column_schema.values(), data) - # if (True if include_columns is None else schema["name"] in include_columns) - # } - # data_item["lsn"] = lsn - # if for_delete: - # data_item["deleted_ts"] = commit_ts - # return data_item + def process_change(self, msg: RowMessage, lsn: int) -> None: + """Processes replication message of type Insert or Update""" + if msg.table not in self.table_qnames: + return + _, table_name = msg.table.split(".") + last_table_schema = self.last_table_schema.get(table_name) + table_schema = extract_table_schema( + msg, + column_hints=self.columns.get(table_name), + included_columns=self.included_columns.get(table_name), + ) + if last_table_schema is None: + self.last_table_schema[table_name] = table_schema + elif last_table_schema != table_schema: + raise StopReplication # table schema change + + data_item = gen_data_item( + msg, + table_schema["columns"], + lsn=lsn, + included_columns=( + None + if self.included_columns is None + else self.included_columns.get(table_name) + ), + ) + self.data_items[table_name].append(data_item) + + def process_delete(self, msg: RowMessage, lsn: int) -> None: + """Processes replication message of type Delete""" + if msg.table not in self.table_qnames: + return + _, table_name = msg.table.split(".") + data_item = gen_delete_item( + msg, + lsn=lsn, + included_columns=( + None + if self.included_columns is None + else self.included_columns.get(table_name) + ), + ) + self.data_items[table_name].append(data_item) + + +@dataclass +class ItemGenerator: + credentials: ConnectionStringCredentials + slot_name: str + table_qnames: Set[str] + options: Dict[str, str] + upto_lsn: int + start_lsn: int = 0 + target_batch_size: int = 1000 + included_columns: Optional[Dict[str, Sequence[str]]] = None + columns: Optional[Dict[str, TTableSchemaColumns]] = None + last_commit_lsn: Optional[int] = field(default=None, init=False) + generated_all: bool = False + + def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: + """Yields replication messages from MessageConsumer. + + Starts replication of messages published by the publication from the replication slot. + Maintains LSN of last consumed Commit message in object state. + Does not advance the slot. 
+ """ + try: + cur = _get_rep_conn(self.credentials).cursor() + cur.start_replication( + slot_name=self.slot_name, + start_lsn=self.start_lsn, + decode=False, + options=self.options, + ) + consumer = MessageConsumer( + upto_lsn=self.upto_lsn, + table_qnames=self.table_qnames, + target_batch_size=self.target_batch_size, + included_columns=self.included_columns, + columns=self.columns, + ) + cur.consume_stream(consumer) + except StopReplication: # completed batch or reached `upto_lsn` + pass + finally: + cur.connection.close() + yield from self.flush(consumer) + + def flush( + self, consumer: MessageConsumer + ) -> Iterator[Union[TDataItem, DataItemWithMeta]]: + self.last_commit_lsn = consumer.last_commit_lsn + for table_name, data_items in consumer.data_items.items(): + table_schema = consumer.last_table_schema.get(table_name) + if table_schema: + assert table_name == table_schema["name"] + yield dlt.mark.with_hints( # meta item with column hints only, no data + [], + dlt.mark.make_hints( + table_name=table_name, columns=table_schema["columns"] + ), + create_table_variant=True, + ) + yield dlt.mark.with_table_name(data_items, table_name) + self.generated_all = consumer.consumed_all # FIXME Refactor later @@ -791,34 +643,27 @@ def gen_delete_item( lsn: int, included_columns: Optional[Sequence[str]] = None, ) -> TDataItem: - """Generates data item from a `RowMessage` and corresponding metadata.""" + """Generates DELETE data item from a `RowMessage` and corresponding metadata.""" assert row_msg.op == Op.DELETE column_data = row_msg.old_tuple type_mapper = _type_mapper() - data_item = {} + data_item = {"lsn": lsn, "deleted_ts": _convert_db_timestamp(row_msg.commit_time)} for data in column_data: - if included_columns and data.column_name not in included_columns: + col_name = data.column_name + if included_columns and col_name not in included_columns: continue - datum_name = data.WhichOneof("datum") - if datum_name: - data_item[data.column_name] = getattr(data, datum_name) + datum = data.WhichOneof("datum") + if datum: + data_item[col_name] = getattr(data, datum) else: db_type = _PG_TYPES[data.column_type] col_type: TColumnType = type_mapper.from_db_type(db_type) - data_item[data.column_name] = _DUMMY_VALS[col_type["data_type"]] - - data_item["lsn"] = lsn - data_item["deleted_ts"] = _convert_db_timestamp(row_msg.commit_time) + data_item[col_name] = _DUMMY_VALS[col_type["data_type"]] return data_item -def _convert_pg_timestamp(microseconds_since_2000: int) -> pendulum.DateTime: - epoch_2000 = pendulum.datetime(2000, 1, 1, tz="UTC") - return epoch_2000.add(microseconds=microseconds_since_2000) - - def _convert_db_timestamp(microseconds_since_1970: int) -> pendulum.DateTime: - return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000, tz="UTC") + return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000, tz="UTC") \ No newline at end of file From f44853bb883afa9d0749dd41cb62807033804e7a Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 21 Oct 2024 14:49:17 +0200 Subject: [PATCH 25/88] wip: cleaning up + refactor --- sources/pg_legacy_replication/helpers.py | 108 ++++-- tests/pg_legacy_replication/test_helpers.py | 349 +++++++++++------- .../test_pg_replication.py | 7 +- 3 files changed, 290 insertions(+), 174 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index a6f7c8a50..ce9efe21b 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -22,6 +22,7 @@ import dlt 
from dlt.common import logger +from dlt.common.data_types import coerce_value from dlt.common.typing import TDataItem from dlt.common.pendulum import pendulum from dlt.common.schema.typing import ( @@ -50,8 +51,7 @@ from sqlalchemy import Connection as ConnectionSqla, Engine, event -from .pg_logicaldec_pb2 import Op, RowMessage -from .schema_types import _to_dlt_column_schema, _to_dlt_val +from .pg_logicaldec_pb2 import DatumMessage, Op, RowMessage from .exceptions import SqlDatabaseSourceImportError from google.protobuf.json_format import MessageToDict from collections import defaultdict @@ -357,18 +357,12 @@ def __init__( self.columns = columns or {} self.consumed_all: bool = False - # data_items attribute maintains all data items - self.data_items: Dict[ - str, List[Union[TDataItem, DataItemWithMeta]] - ] = defaultdict( - list - ) # maps qualified table names to list of data items - # other attributes only maintain last-seen values - self.last_table_schema: Dict[ - str, TTableSchema - ] = dict() # maps table name to table schema + # maps table names to list of data items + self.data_items: Dict[str, List[TDataItem]] = defaultdict(list) + # maps table name to table schema + self.last_table_schema: Dict[str, TTableSchema] = dict() self.last_commit_ts: pendulum.DateTime - self.last_commit_lsn: Optional[int] = None + self.last_commit_lsn: int def __call__(self, msg: ReplicationMessage) -> None: """Processes message received from stream.""" @@ -388,16 +382,17 @@ def process_msg(self, msg: ReplicationMessage) -> None: try: row_msg.ParseFromString(msg.payload) op = row_msg.op + lsn = msg.data_start if op == Op.BEGIN: - self.last_commit_ts = _convert_db_timestamp(row_msg.commit_time) + self.last_commit_ts = epoch_micros_to_datetime(row_msg.commit_time) elif op == Op.COMMIT: - self.process_commit(msg.data_start) + self.process_commit(lsn) elif op == Op.INSERT: - self.process_change(row_msg, msg.data_start) + self.process_change(row_msg, lsn) elif op == Op.UPDATE: - self.process_change(row_msg, msg.data_start) + self.process_change(row_msg, lsn) elif op == Op.DELETE: - self.process_delete(row_msg, msg.data_start) + self.process_delete(row_msg, lsn) else: raise AssertionError(f"Unsupported operation : {row_msg}") except StopReplication: @@ -439,7 +434,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: raise StopReplication # table schema change data_item = gen_data_item( - msg, + msg.new_tuple, table_schema["columns"], lsn=lsn, included_columns=( @@ -456,7 +451,8 @@ def process_delete(self, msg: RowMessage, lsn: int) -> None: return _, table_name = msg.table.split(".") data_item = gen_delete_item( - msg, + msg.old_tuple, + msg.commit_time, lsn=lsn, included_columns=( None @@ -531,7 +527,7 @@ def flush( # FIXME Refactor later from .schema_types import _PG_TYPES, _type_mapper, _DUMMY_VALS -from dlt.common.schema.typing import TColumnType, TColumnSchema +from dlt.common.schema.typing import TColumnType, TColumnSchema, TDataType _DATUM_PRECISIONS: Dict[str, int] = { "datum_int32": 32, @@ -539,6 +535,16 @@ def flush( "datum_float": 32, "datum_double": 64, } + +_DATUM_RAW_TYPES: Dict[str, TDataType] = { + "datum_int32": "bigint", + "datum_int64": "bigint", + "datum_float": "double", + "datum_double": "double", + "datum_bool": "bool", + "datum_string": "text", + "datum_bytes": "binary", +} """TODO: Add comment here""" @@ -618,39 +624,57 @@ def extract_table_schema( def gen_data_item( - row_msg: RowMessage, + row: Sequence[DatumMessage], column_schema: TTableSchemaColumns, *, lsn: int, 
included_columns: Optional[Sequence[str]] = None, ) -> TDataItem: - """Generates data item from a `RowMessage` and corresponding metadata.""" - assert row_msg.op in (Op.INSERT, Op.UPDATE) - data_item = {"lsn": lsn} - for data in row_msg.new_tuple: - if included_columns and data.column_name not in included_columns: + """Generates data item from a row and corresponding metadata.""" + data_item: TDataItem = {"lsn": lsn} + for data in row: + col_name = data.column_name + col_schema = column_schema[col_name] + if included_columns and col_name not in included_columns: continue datum = data.WhichOneof("datum") - assert datum or column_schema[data.column_name]["nullable"] - data_item[data.column_name] = getattr(data, datum) if datum else None + assert datum or col_schema["nullable"] + if datum is None: + data_item[col_name] = None + else: + raw_value = getattr(data, datum) + data_type = col_schema["data_type"] + if data_type == "date": + data_item[col_name] = epoch_days_to_date(raw_value) + elif data_type == "time": + data_item[col_name] = microseconds_to_time(raw_value) + elif data_type == "timestamp": + data_item[col_name] = epoch_micros_to_datetime(raw_value) + else: + data_item[col_name] = coerce_value( + to_type=col_schema["data_type"], + from_type=_DATUM_RAW_TYPES[datum], + value=raw_value, + ) return data_item def gen_delete_item( - row_msg: RowMessage, + row: Sequence[DatumMessage], + commit_time: int, *, lsn: int, included_columns: Optional[Sequence[str]] = None, ) -> TDataItem: - """Generates DELETE data item from a `RowMessage` and corresponding metadata.""" - assert row_msg.op == Op.DELETE - - column_data = row_msg.old_tuple + """Generates DELETE data item from a row and corresponding metadata.""" type_mapper = _type_mapper() - data_item = {"lsn": lsn, "deleted_ts": _convert_db_timestamp(row_msg.commit_time)} + data_item: TDataItem = { + "lsn": lsn, + "deleted_ts": epoch_micros_to_datetime(commit_time), + } - for data in column_data: + for data in row: col_name = data.column_name if included_columns and col_name not in included_columns: continue @@ -665,5 +689,13 @@ def gen_delete_item( return data_item -def _convert_db_timestamp(microseconds_since_1970: int) -> pendulum.DateTime: - return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000, tz="UTC") \ No newline at end of file +def epoch_micros_to_datetime(microseconds_since_1970: int) -> pendulum.DateTime: + return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000) + + +def microseconds_to_time(microseconds: int) -> pendulum.Time: + return pendulum.Time(0).add(microseconds=microseconds) + + +def epoch_days_to_date(epoch_days: int) -> pendulum.Date: + return pendulum.Date(1970, 1, 1).add(days=epoch_days) diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 6603d421e..bee57239c 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -1,8 +1,10 @@ +import random from typing import Optional import pendulum import pytest from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns +from dlt.common.typing import TDataItem from google.protobuf.json_format import ParseDict as parse_dict from sources.pg_legacy_replication.helpers import ( @@ -14,7 +16,94 @@ @pytest.mark.parametrize( - "data, column_hints, expected_schema", + "data", + [ + { + "transactionId": 969, + "commitTime": "1728662646949062", + "table": "src_pl_dataset_202410110404048747_staging.tbl_y", + "op": "INSERT", + "newTuple": [ + 
{"columnName": "id_y", "columnType": "20", "datumInt64": 2}, + {"columnName": "val_y", "columnType": "16", "datumBool": False}, + { + "columnName": "_dlt_load_id", + "columnType": "1043", + "datumString": "1728662646.2657657", + }, + { + "columnName": "_dlt_id", + "columnType": "1043", + "datumString": "gGjifTMTAUs5ag", + }, + ], + "newTypeinfo": [ + {"modifier": "bigint", "valueOptional": False}, + {"modifier": "boolean", "valueOptional": True}, + {"modifier": "character varying", "valueOptional": False}, + {"modifier": "character varying", "valueOptional": False}, + ], + "oldTuple": [], + }, + ], +) +@pytest.mark.parametrize( + "column_hints", + [{"id_y": {"primary_key": True}}], +) +@pytest.mark.parametrize( + "expected_schema", + [ + { + "name": "tbl_y", + "columns": { + "id_y": { + "data_type": "bigint", + "precision": 64, + "name": "id_y", + "nullable": False, + "primary_key": True, + }, + "val_y": {"data_type": "bool", "name": "val_y", "nullable": True}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "_dlt_id": { + "data_type": "text", + "name": "_dlt_id", + "nullable": False, + }, + "lsn": { + "data_type": "bigint", + "dedup_sort": "desc", + "nullable": True, + }, + "deleted_ts": { + "data_type": "timestamp", + "hard_delete": True, + "nullable": True, + }, + }, + }, + ], +) +def test_extract_table_schema( + data, + column_hints: Optional[TTableSchemaColumns], + expected_schema: TTableSchema, +): + row_msg = RowMessage() + parse_dict(data, row_msg) + assert extract_table_schema(row_msg, column_hints=column_hints) == expected_schema + + +LSN = random.randint(0, 10000) + + +@pytest.mark.parametrize( + "data, data_item", [ ( { @@ -23,8 +112,16 @@ "table": "src_pl_dataset_202410110404048747_staging.tbl_y", "op": "INSERT", "newTuple": [ - {"columnName": "id_y", "columnType": "20", "datumInt64": 2}, - {"columnName": "val_y", "columnType": "16", "datumBool": False}, + { + "columnName": "id_y", + "columnType": "20", + "datumInt64": "2", + }, + { + "columnName": "val_y", + "columnType": "16", + "datumBool": False, + }, { "columnName": "_dlt_load_id", "columnType": "1043", @@ -37,155 +134,137 @@ }, ], "newTypeinfo": [ - {"modifier": "bigint", "valueOptional": False}, - {"modifier": "boolean", "valueOptional": True}, - {"modifier": "character varying", "valueOptional": False}, - {"modifier": "character varying", "valueOptional": False}, + { + "modifier": "bigint", + "valueOptional": False, + }, + { + "modifier": "boolean", + "valueOptional": True, + }, + { + "modifier": "character varying", + "valueOptional": False, + }, + { + "modifier": "character varying", + "valueOptional": False, + }, ], "oldTuple": [], }, - {"id_y": {"primary_key": True}}, { - "name": "tbl_y", - "columns": { - "id_y": { - "data_type": "bigint", - "precision": 64, - "name": "id_y", - "nullable": False, - "primary_key": True, - }, - "val_y": {"data_type": "bool", "name": "val_y", "nullable": True}, - "_dlt_load_id": { - "data_type": "text", - "name": "_dlt_load_id", - "nullable": False, - }, - "_dlt_id": { - "data_type": "text", - "name": "_dlt_id", - "nullable": False, - }, - "lsn": { - "data_type": "bigint", - "dedup_sort": "desc", - "nullable": True, - }, - "deleted_ts": { - "data_type": "timestamp", - "hard_delete": True, - "nullable": True, + "_dlt_id": "gGjifTMTAUs5ag", + "_dlt_load_id": "1728662646.2657657", + "id_y": 2, + "lsn": LSN, + "val_y": False, + }, + ), + ( + { + "transactionId": 2018, + "commitTime": "1729503423666542", + "table": 
"src_pl_dataset_202410210936594956.items", + "op": "INSERT", + "newTuple": [ + { + "columnName": "col4", + "columnType": 1184, + "datumInt64": 1653312405176451, }, + { + "columnName": "col9", + "columnType": 3802, + "datumString": ( + '{"link": "?commen\\ntU\\nrn=urn%3Ali%3Acomment%3A%28acti\\n \\u0006 \\\\vity%3A69\'08444473\\n\\n551163392' + '%2C6n \\r \x8e9085", "complex": [1, 2, 3, "a"]}' + ), + }, + { + "columnName": "col10", + "columnType": 1082, + "datumInt32": 19415, + }, + { + "columnName": "col11", + "columnType": 1083, + "datumInt64": 48405176451, + }, + ], + "newTypeinfo": [ + {"modifier": "timestamp with time zone", "valueOptional": False}, + {"modifier": "jsonb", "valueOptional": False}, + {"modifier": "date", "valueOptional": False}, + {"modifier": "time without time zone", "valueOptional": False}, + ], + }, + { + "lsn": LSN, + "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), + "col9": { + "complex": [1, 2, 3, "a"], + "link": ( + "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6" + " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" + ), }, + "col10": pendulum.parse("2023-02-27", strict=False).date(), + "col11": pendulum.parse("13:26:45.176451", strict=False).time(), }, ), ], ) -def test_extract_table_schema( - data, - column_hints: Optional[TTableSchemaColumns], - expected_schema: TTableSchema, -): - row_msg = RowMessage() - parse_dict(data, row_msg) - assert extract_table_schema(row_msg, column_hints=column_hints) == expected_schema - - -def test_gen_data_item(): +def test_gen_data_item(data, data_item: TDataItem): row_msg = RowMessage() - data = { - "transactionId": 969, - "commitTime": "1728662646949062", - "table": "src_pl_dataset_202410110404048747_staging.tbl_y", - "op": "INSERT", - "newTuple": [ - { - "columnName": "id_y", - "columnType": "20", - "datumInt64": "2", - }, - { - "columnName": "val_y", - "columnType": "16", - "datumBool": False, - }, - { - "columnName": "_dlt_load_id", - "columnType": "1043", - "datumString": "1728662646.2657657", - }, - { - "columnName": "_dlt_id", - "columnType": "1043", - "datumString": "gGjifTMTAUs5ag", - }, - ], - "newTypeinfo": [ - { - "modifier": "bigint", - "valueOptional": False, - }, - { - "modifier": "boolean", - "valueOptional": True, - }, - { - "modifier": "character varying", - "valueOptional": False, - }, - { - "modifier": "character varying", - "valueOptional": False, - }, - ], - "oldTuple": [], - } parse_dict(data, row_msg) table_schema = extract_table_schema(row_msg) - assert gen_data_item(row_msg, table_schema["columns"], lsn=27078296) == { - "_dlt_id": "gGjifTMTAUs5ag", - "_dlt_load_id": "1728662646.2657657", - "id_y": 2, - "lsn": 27078296, - "val_y": False, - } + assert ( + gen_data_item(row_msg.new_tuple, table_schema["columns"], lsn=LSN) == data_item + ) -def test_gen_delete_item(): - row_msg = RowMessage() - data = { - "transactionId": 932, - "commitTime": "1729299383354856", - "table": "src_pl_dataset_202410191256122080.tbl_x", - "op": "DELETE", - "oldTuple": [ - { - "columnName": "id_x", - "columnType": "20", - "datumInt64": "1", - }, - { - "columnName": "val_x", - "columnType": "1043", - }, +@pytest.mark.parametrize( + "data, data_item", + [ + ( { - "columnName": "_dlt_load_id", - "columnType": "1043", + "transactionId": 932, + "commitTime": "1729299383354856", + "table": "src_pl_dataset_202410191256122080.tbl_x", + "op": "DELETE", + "oldTuple": [ + { + "columnName": "id_x", + "columnType": "20", + "datumInt64": "1", + }, + { + "columnName": "val_x", + "columnType": "1043", + }, + { + 
"columnName": "_dlt_load_id", + "columnType": "1043", + }, + { + "columnName": "_dlt_id", + "columnType": "1043", + }, + ], }, { - "columnName": "_dlt_id", - "columnType": "1043", + "id_x": 1, + "val_x": "", + "_dlt_load_id": "", + "_dlt_id": "", + "lsn": LSN, + "deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), }, - ], - "newTuple": [], - "newTypeinfo": [], - } + ), + ], +) +def test_gen_delete_item(data, data_item: TDataItem): + row_msg = RowMessage() parse_dict(data, row_msg) - assert gen_delete_item(row_msg, lsn=27078296) == { - "id_x": 1, - "val_x": "", - "_dlt_load_id": "", - "_dlt_id": "", - "lsn": 27078296, - "deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), - } + assert gen_delete_item(row_msg.old_tuple, row_msg.commit_time, lsn=LSN) == data_item \ No newline at end of file diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index c8d27be63..5044acc47 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -267,11 +267,16 @@ def items(data): columns={"items": column_schema} if give_hints else None, ) + if give_hints: + column_schema["col1"]["primary_key"] = True + else: + column_schema = {"col1": {"primary_key": True}} + changes = replication_resource( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("items",), - columns={"items": column_schema} if give_hints else None, + columns={"items": column_schema}, ) # initial load From 46200ca5f1c7ced9ebc8d44353e0a86bb30d0f39 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 22 Oct 2024 00:36:36 +0200 Subject: [PATCH 26/88] wip: cleaning up + refactor --- sources/pg_legacy_replication/helpers.py | 96 +++---------------- sources/pg_legacy_replication/schema_types.py | 75 +++++++++++---- tests/pg_legacy_replication/test_helpers.py | 20 +--- 3 files changed, 73 insertions(+), 118 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index ce9efe21b..0ea3d3500 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -37,7 +37,7 @@ from dlt.extract.resource import DltResource from dlt.sources.credentials import ConnectionStringCredentials -from .schema_types import _to_dlt_column_schema, _to_dlt_val +from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val from .exceptions import IncompatiblePostgresVersionException from .decoders import ( Begin, @@ -384,7 +384,7 @@ def process_msg(self, msg: ReplicationMessage) -> None: op = row_msg.op lsn = msg.data_start if op == Op.BEGIN: - self.last_commit_ts = epoch_micros_to_datetime(row_msg.commit_time) + self.last_commit_ts = _epoch_micros_to_datetime(row_msg.commit_time) elif op == Op.COMMIT: self.process_commit(lsn) elif op == Op.INSERT: @@ -435,14 +435,14 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: data_item = gen_data_item( msg.new_tuple, - table_schema["columns"], - lsn=lsn, + column_schema=table_schema["columns"], included_columns=( None if self.included_columns is None else self.included_columns.get(table_name) ), ) + data_item["lsn"] = lsn self.data_items[table_name].append(data_item) def process_delete(self, msg: RowMessage, lsn: int) -> None: @@ -450,16 +450,17 @@ def process_delete(self, msg: RowMessage, lsn: int) -> None: if msg.table not in self.table_qnames: return _, table_name = msg.table.split(".") - data_item = gen_delete_item( + data_item = 
gen_data_item( msg.old_tuple, - msg.commit_time, - lsn=lsn, + for_delete=True, included_columns=( None if self.included_columns is None else self.included_columns.get(table_name) ), ) + data_item["lsn"] = lsn + data_item["deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) self.data_items[table_name].append(data_item) @@ -536,17 +537,6 @@ def flush( "datum_double": 64, } -_DATUM_RAW_TYPES: Dict[str, TDataType] = { - "datum_int32": "bigint", - "datum_int64": "bigint", - "datum_float": "double", - "datum_double": "double", - "datum_bool": "bool", - "datum_string": "text", - "datum_bytes": "binary", -} -"""TODO: Add comment here""" - def extract_table_schema( row_msg: RowMessage, @@ -625,77 +615,21 @@ def extract_table_schema( def gen_data_item( row: Sequence[DatumMessage], - column_schema: TTableSchemaColumns, *, - lsn: int, included_columns: Optional[Sequence[str]] = None, + column_schema: Optional[TTableSchemaColumns] = None, + for_delete: bool = False, ) -> TDataItem: """Generates data item from a row and corresponding metadata.""" - data_item: TDataItem = {"lsn": lsn} - for data in row: - col_name = data.column_name - col_schema = column_schema[col_name] - if included_columns and col_name not in included_columns: - continue - datum = data.WhichOneof("datum") - assert datum or col_schema["nullable"] - if datum is None: - data_item[col_name] = None - else: - raw_value = getattr(data, datum) - data_type = col_schema["data_type"] - if data_type == "date": - data_item[col_name] = epoch_days_to_date(raw_value) - elif data_type == "time": - data_item[col_name] = microseconds_to_time(raw_value) - elif data_type == "timestamp": - data_item[col_name] = epoch_micros_to_datetime(raw_value) - else: - data_item[col_name] = coerce_value( - to_type=col_schema["data_type"], - from_type=_DATUM_RAW_TYPES[datum], - value=raw_value, - ) - - return data_item - - -def gen_delete_item( - row: Sequence[DatumMessage], - commit_time: int, - *, - lsn: int, - included_columns: Optional[Sequence[str]] = None, -) -> TDataItem: - """Generates DELETE data item from a row and corresponding metadata.""" - type_mapper = _type_mapper() - data_item: TDataItem = { - "lsn": lsn, - "deleted_ts": epoch_micros_to_datetime(commit_time), - } + data_item: TDataItem = {} for data in row: col_name = data.column_name if included_columns and col_name not in included_columns: continue - datum = data.WhichOneof("datum") - if datum: - data_item[col_name] = getattr(data, datum) - else: - db_type = _PG_TYPES[data.column_type] - col_type: TColumnType = type_mapper.from_db_type(db_type) - data_item[col_name] = _DUMMY_VALS[col_type["data_type"]] + data_type = ( + column_schema[col_name]["data_type"] if column_schema else data.column_type + ) + data_item[col_name] = _to_dlt_val(data, data_type, for_delete=for_delete) return data_item - - -def epoch_micros_to_datetime(microseconds_since_1970: int) -> pendulum.DateTime: - return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000) - - -def microseconds_to_time(microseconds: int) -> pendulum.Time: - return pendulum.Time(0).add(microseconds=microseconds) - - -def epoch_days_to_date(epoch_days: int) -> pendulum.Date: - return pendulum.Date(1970, 1, 1).add(days=epoch_days) diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 81692e0bb..be375e56b 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -1,6 +1,7 @@ from functools import lru_cache import json -from typing 
import Optional, Any, Dict +import pendulum +from typing import Optional, Any, Dict, Callable, Union from dlt.common import Decimal from dlt.common.data_types.typing import TDataType @@ -8,7 +9,7 @@ from dlt.common.schema.typing import TColumnSchema, TColumnType from .decoders import ColumnType - +from .pg_logicaldec_pb2 import DatumMessage _DUMMY_VALS: Dict[TDataType, Any] = { "bigint": 0, @@ -25,7 +26,6 @@ } """Dummy values used to replace NULLs in NOT NULL columns in key-only delete records.""" - _PG_TYPES: Dict[int, str] = { 16: "boolean", 17: "bytea", @@ -42,6 +42,17 @@ } """Maps postgres type OID to type string. Only includes types present in PostgresTypeMapper.""" +_DATUM_RAW_TYPES: Dict[str, TDataType] = { + "datum_int32": "bigint", + "datum_int64": "bigint", + "datum_float": "double", + "datum_double": "double", + "datum_bool": "bool", + "datum_string": "text", + "datum_bytes": "binary", +} +"""Maps decoderbuf's datum msg type to dlt type.""" + def _get_precision(type_id: int, atttypmod: int) -> Optional[int]: """Get precision from postgres type attributes.""" @@ -109,21 +120,43 @@ def _to_dlt_column_schema(col: ColumnType) -> TColumnSchema: return {**dlt_column_type, **partial_column_schema} # type: ignore[typeddict-item] -def _to_dlt_val(val: str, data_type: TDataType, byte1: str, for_delete: bool) -> Any: - """Converts pgoutput's text-formatted value into dlt-compatible data value.""" - if byte1 == "n": - if for_delete: - # replace None with dummy value to prevent NOT NULL violations in staging table - return _DUMMY_VALS[data_type] - return None - elif byte1 == "t": - if data_type == "binary": - # https://www.postgresql.org/docs/current/datatype-binary.html#DATATYPE-BINARY-BYTEA-HEX-FORMAT - return bytes.fromhex(val.replace("\\x", "")) - elif data_type == "complex": - return json.loads(val) - return coerce_value(data_type, "text", val) - else: - raise ValueError( - f"Byte1 in replication message must be 'n' or 't', not '{byte1}'." 
- ) +def _epoch_micros_to_datetime(microseconds_since_1970: int) -> pendulum.DateTime: + return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000) + + +def _microseconds_to_time(microseconds: int) -> pendulum.Time: + return pendulum.Time(0).add(microseconds=microseconds) + + +def _epoch_days_to_date(epoch_days: int) -> pendulum.Date: + return pendulum.Date(1970, 1, 1).add(days=epoch_days) + + +data_type_handlers: Dict[TDataType, Callable[[Any], Any]] = { + "date": _epoch_days_to_date, + "time": _microseconds_to_time, + "timestamp": _epoch_micros_to_datetime, +} + + +def _to_dlt_val( + val: DatumMessage, data_type: Union[TDataType, int], *, for_delete: bool = False +) -> Any: + """Converts decoderbuf's datum value into dlt-compatible data value.""" + if isinstance(data_type, int): + col_type: TColumnType = _type_mapper().from_db_type(_PG_TYPES[data_type]) + data_type = col_type["data_type"] + + datum = val.WhichOneof("datum") + if datum is None: + return _DUMMY_VALS[data_type] if for_delete else None + + raw_value = getattr(val, datum) + if data_type in data_type_handlers: + return data_type_handlers[data_type](raw_value) + + return coerce_value( + to_type=data_type, + from_type=_DATUM_RAW_TYPES[datum], + value=raw_value, + ) diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index bee57239c..f6bdb04d4 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -10,7 +10,6 @@ from sources.pg_legacy_replication.helpers import ( extract_table_schema, gen_data_item, - gen_delete_item, ) from sources.pg_legacy_replication.pg_logicaldec_pb2 import RowMessage @@ -157,7 +156,6 @@ def test_extract_table_schema( "_dlt_id": "gGjifTMTAUs5ag", "_dlt_load_id": "1728662646.2657657", "id_y": 2, - "lsn": LSN, "val_y": False, }, ), @@ -200,7 +198,6 @@ def test_extract_table_schema( ], }, { - "lsn": LSN, "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), "col9": { "complex": [1, 2, 3, "a"], @@ -218,10 +215,8 @@ def test_extract_table_schema( def test_gen_data_item(data, data_item: TDataItem): row_msg = RowMessage() parse_dict(data, row_msg) - table_schema = extract_table_schema(row_msg) - assert ( - gen_data_item(row_msg.new_tuple, table_schema["columns"], lsn=LSN) == data_item - ) + column_schema = extract_table_schema(row_msg)["columns"] + assert gen_data_item(row_msg.new_tuple, column_schema=column_schema) == data_item @pytest.mark.parametrize( @@ -253,18 +248,11 @@ def test_gen_data_item(data, data_item: TDataItem): }, ], }, - { - "id_x": 1, - "val_x": "", - "_dlt_load_id": "", - "_dlt_id": "", - "lsn": LSN, - "deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), - }, + {"id_x": 1, "val_x": "", "_dlt_load_id": "", "_dlt_id": ""}, ), ], ) def test_gen_delete_item(data, data_item: TDataItem): row_msg = RowMessage() parse_dict(data, row_msg) - assert gen_delete_item(row_msg.old_tuple, row_msg.commit_time, lsn=LSN) == data_item \ No newline at end of file + assert gen_data_item(row_msg.old_tuple, for_delete=True) == data_item From f0f0146652d1e3f179f87c2aa9fcd1b8abb9c0c7 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 22 Oct 2024 13:36:38 +0200 Subject: [PATCH 27/88] wip: slowly progressing --- sources/pg_legacy_replication/helpers.py | 89 +++--------------- sources/pg_legacy_replication/schema_types.py | 91 +++++++++---------- .../test_pg_replication.py | 9 +- 3 files changed, 63 insertions(+), 126 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py 
b/sources/pg_legacy_replication/helpers.py index 0ea3d3500..50ab6fc53 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -1,3 +1,5 @@ +from collections import defaultdict +from dataclasses import dataclass, field from typing import ( Optional, Dict, @@ -8,53 +10,32 @@ Sequence, Any, ) -from dataclasses import dataclass, field - -import psycopg2 -from psycopg2.extensions import cursor, connection as ConnectionExt -from psycopg2.extras import ( - LogicalReplicationConnection, - ReplicationCursor, - ReplicationMessage, - StopReplication, -) import dlt - +import psycopg2 from dlt.common import logger -from dlt.common.data_types import coerce_value -from dlt.common.typing import TDataItem from dlt.common.pendulum import pendulum from dlt.common.schema.typing import ( TTableSchema, TTableSchemaColumns, - TColumnNames, - TWriteDisposition, ) from dlt.common.schema.utils import merge_column -from dlt.common.data_writers.escape import escape_postgres_identifier +from dlt.common.typing import TDataItem from dlt.extract.items import DataItemWithMeta from dlt.extract.resource import DltResource from dlt.sources.credentials import ConnectionStringCredentials - -from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val -from .exceptions import IncompatiblePostgresVersionException -from .decoders import ( - Begin, - Relation, - Insert, - Update, - Delete, - ColumnData, - convert_pg_ts, +from psycopg2.extensions import cursor, connection as ConnectionExt +from psycopg2.extras import ( + LogicalReplicationConnection, + ReplicationCursor, + ReplicationMessage, + StopReplication, ) - from sqlalchemy import Connection as ConnectionSqla, Engine, event -from .pg_logicaldec_pb2 import DatumMessage, Op, RowMessage from .exceptions import SqlDatabaseSourceImportError -from google.protobuf.json_format import MessageToDict -from collections import defaultdict +from .pg_logicaldec_pb2 import DatumMessage, Op, RowMessage +from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) @@ -527,8 +508,6 @@ def flush( # FIXME Refactor later -from .schema_types import _PG_TYPES, _type_mapper, _DUMMY_VALS -from dlt.common.schema.typing import TColumnType, TColumnSchema, TDataType _DATUM_PRECISIONS: Dict[str, int] = { "datum_int32": 32, @@ -556,53 +535,11 @@ def extract_table_schema( "hard_delete": True, }, } - type_mapper = _type_mapper() for col, col_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): col_name = col.column_name if included_columns and col_name not in included_columns: continue - db_type = _PG_TYPES[col.column_type] - col_type: TColumnType = type_mapper.from_db_type(db_type) - col_schema: TColumnSchema = { - "name": col_name, - "nullable": col_info.value_optional, - **col_type, - } - if db_type == "character varying": - import re - - match = re.search(r"character varying\((\d+)\)", col_info.modifier) - if match: - col_schema["precision"] = int(match.group(1)) - elif db_type == "numeric": - import re - - match = re.search(r"numeric\((\d+),(\d+)\)", col_info.modifier) - precision, scale = map(int, match.groups()) - col_schema["precision"] = precision - col_schema["scale"] = scale - elif db_type == "timestamp with time zone": - import re - - match = re.search(r"timestamp\((\d+)\) with time zone", col_info.modifier) - if match: - col_schema["precision"] = int(match.group(1)) - # col_schema["timezone"] = True FIXME - elif 
db_type == "time without time zone": - import re - - match = re.search(r"time\((\d+)\) without time zone", col_info.modifier) - if match: - col_schema["precision"] = int(match.group(1)) - # col_schema["timezone"] = False FIXME - else: - assert ( - _PG_TYPES[col.column_type] == col_info.modifier - ), f"Type mismatch for column {col_name}" - - if precision := _DATUM_PRECISIONS.get(col.WhichOneof("datum")): - col_schema["precision"] = precision - + col_schema = _to_dlt_column_schema(col, col_info) columns[col_name] = ( merge_column(col_schema, column_hints.get(col_name)) if column_hints and column_hints.get(col_name) diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index be375e56b..85ad17712 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -1,15 +1,14 @@ +import re from functools import lru_cache -import json -import pendulum -from typing import Optional, Any, Dict, Callable, Union +from typing import Optional, Any, Dict, Callable, Union, Tuple +import pendulum from dlt.common import Decimal -from dlt.common.data_types.typing import TDataType from dlt.common.data_types.type_helpers import coerce_value +from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import TColumnSchema, TColumnType -from .decoders import ColumnType -from .pg_logicaldec_pb2 import DatumMessage +from .pg_logicaldec_pb2 import DatumMessage, TypeInfo _DUMMY_VALS: Dict[TDataType, Any] = { "bigint": 0, @@ -53,38 +52,37 @@ } """Maps decoderbuf's datum msg type to dlt type.""" +_FIXED_PRECISION_TYPES: Dict[int, Tuple[int, Optional[int]]] = { + 21: (16, None), # smallint + 23: (32, None), # integer + 20: (64, None), # bigint +} +"""Dict for fixed precision types""" + +_VARYING_PRECISION_PATTERNS: Dict[int, str] = { + 1043: r"character varying\((\d+)\)", + 1700: r"numeric\((\d+),(\d+)\)", + 1184: r"timestamp\((\d+)\) with time zone", + 1083: r"time\((\d+)\) without time zone", +} +"""Regex patterns for precision/scale types""" + + +def _get_precision_and_scale( + type_id: int, modifier: str +) -> Optional[Tuple[int, Optional[int]]]: + """Get precision from postgres type attributes and modifiers.""" + if type_id in _FIXED_PRECISION_TYPES: + return _FIXED_PRECISION_TYPES[type_id] + + if pattern := _VARYING_PRECISION_PATTERNS.get(type_id): + if match := re.search(pattern, modifier): + groups = match.groups() + precision = int(groups[0]) + scale = int(groups[1]) if len(groups) > 1 else None + return (precision, scale) -def _get_precision(type_id: int, atttypmod: int) -> Optional[int]: - """Get precision from postgres type attributes.""" - # https://stackoverflow.com/a/3351120 - if type_id == 21: # smallint - return 16 - elif type_id == 23: # integer - return 32 - elif type_id == 20: # bigint - return 64 - if atttypmod != -1: - if type_id == 1700: # numeric - return ((atttypmod - 4) >> 16) & 65535 - elif type_id in ( - 1083, - 1184, - ): # time without time zone, timestamp with time zone - return atttypmod - elif type_id == 1043: # character varying - return atttypmod - 4 - return None - - -def _get_scale(type_id: int, atttypmod: int) -> Optional[int]: - """Get scale from postgres type attributes.""" - # https://stackoverflow.com/a/3351120 - if atttypmod != -1: - if type_id in (21, 23, 20): # smallint, integer, bigint - return 0 - if type_id == 1700: # numeric - return (atttypmod - 4) & 65535 - return None + return (None, None) @lru_cache(maxsize=None) @@ -99,25 +97,23 @@ def 
_type_mapper() -> Any: return PostgresTypeMapper(postgres().capabilities()) -def _to_dlt_column_type(type_id: int, atttypmod: int) -> TColumnType: +def _to_dlt_column_type(type_id: int, modifier: str) -> TColumnType: """Converts postgres type OID to dlt column type. Type OIDs not in _PG_TYPES mapping default to "text" type. """ pg_type = _PG_TYPES.get(type_id) - precision = _get_precision(type_id, atttypmod) - scale = _get_scale(type_id, atttypmod) + precision, scale = _get_precision_and_scale(type_id, modifier) return _type_mapper().from_db_type(pg_type, precision, scale) # type: ignore[no-any-return] -def _to_dlt_column_schema(col: ColumnType) -> TColumnSchema: - """Converts pypgoutput ColumnType to dlt column schema.""" - dlt_column_type = _to_dlt_column_type(col.type_id, col.atttypmod) - partial_column_schema = { - "name": col.name, - "primary_key": bool(col.part_of_pkey), +def _to_dlt_column_schema(datum: DatumMessage, type_info: TypeInfo) -> TColumnSchema: + """Converts decoderbuf's datum value/typeinfo to dlt column schema.""" + return { + "name": datum.column_name, + "nullable": type_info.value_optional, + **_to_dlt_column_type(datum.column_type, type_info.modifier), } - return {**dlt_column_type, **partial_column_schema} # type: ignore[typeddict-item] def _epoch_micros_to_datetime(microseconds_since_1970: int) -> pendulum.DateTime: @@ -137,6 +133,7 @@ def _epoch_days_to_date(epoch_days: int) -> pendulum.Date: "time": _microseconds_to_time, "timestamp": _epoch_micros_to_datetime, } +"""Dispatch table for type conversions""" def _to_dlt_val( diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 5044acc47..0e89a5753 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -368,10 +368,13 @@ def test_unmapped_data_types( init_replication( slot_name=slot_name, schema=src_pl.dataset_name, - table_names="data_types", - publish="insert", + table_names=("data_types",), + ) + changes = replication_resource( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("data_types",), ) - changes = replication_resource(slot_name) # insert record in source table to create replication item with src_pl.sql_client() as c: From cd8d906cfab8f528e71bf2ab647949501496516b Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 22 Oct 2024 15:34:39 +0200 Subject: [PATCH 28/88] wip: all tests pass now to update docs and cleanup --- sources/pg_legacy_replication/__init__.py | 9 +- sources/pg_legacy_replication/decoders.py | 427 ------------------ sources/pg_legacy_replication/exceptions.py | 8 - sources/pg_legacy_replication/helpers.py | 108 ++--- sources/pg_legacy_replication_pipeline.py | 2 +- tests/pg_legacy_replication/test_helpers.py | 8 +- .../test_pg_replication.py | 110 ++--- 7 files changed, 109 insertions(+), 563 deletions(-) delete mode 100644 sources/pg_legacy_replication/decoders.py diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 235799451..682466755 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -1,11 +1,10 @@ """Replicates postgres tables in batch using logical decoding.""" -from typing import Dict, Sequence, Optional, Iterable, Union +from typing import Dict, Sequence, Optional, Iterable, Union, List import dlt - +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns from dlt.common.typing import TDataItem -from 
dlt.common.schema.typing import TTableSchemaColumns from dlt.extract.items import DataItemWithMeta from dlt.sources.credentials import ConnectionStringCredentials @@ -19,9 +18,9 @@ def replication_resource( slot_name: str, schema: str = dlt.config.value, - table_names: Sequence[str] = dlt.config.value, + table_names: List[str] = dlt.config.value, credentials: ConnectionStringCredentials = dlt.secrets.value, - included_columns: Optional[Dict[str, Sequence[str]]] = None, + included_columns: Optional[Dict[str, TColumnNames]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, target_batch_size: int = 1000, flush_slot: bool = True, diff --git a/sources/pg_legacy_replication/decoders.py b/sources/pg_legacy_replication/decoders.py deleted file mode 100644 index c2707b46a..000000000 --- a/sources/pg_legacy_replication/decoders.py +++ /dev/null @@ -1,427 +0,0 @@ -# flake8: noqa -# file copied from https://raw.githubusercontent.com/dgea005/pypgoutput/master/src/pypgoutput/decoders.py -# we do this instead of importing `pypgoutput` because it depends on `psycopg2`, which causes errors when installing on macOS - -import io -from abc import ABC, abstractmethod -from dataclasses import dataclass -from datetime import datetime, timedelta, timezone -from typing import List, Optional, Union - -# integer byte lengths -INT8 = 1 -INT16 = 2 -INT32 = 4 -INT64 = 8 - - -def convert_pg_ts(_ts_in_microseconds: int) -> datetime: - ts = datetime(2000, 1, 1, 0, 0, 0, 0, tzinfo=timezone.utc) - return ts + timedelta(microseconds=_ts_in_microseconds) - - -def convert_bytes_to_int(_in_bytes: bytes) -> int: - return int.from_bytes(_in_bytes, byteorder="big", signed=True) - - -def convert_bytes_to_utf8(_in_bytes: Union[bytes, bytearray]) -> str: - return (_in_bytes).decode("utf-8") - - -@dataclass(frozen=True) -class ColumnData: - # col_data_category is NOT the type. 
it means null value/toasted(not sent)/text formatted - col_data_category: Optional[str] - col_data_length: Optional[int] = None - col_data: Optional[str] = None - - def __repr__(self) -> str: - return f"[col_data_category='{self.col_data_category}', col_data_length={self.col_data_length}, col_data='{self.col_data}']" - - -@dataclass(frozen=True) -class ColumnType: - """https://www.postgresql.org/docs/12/catalog-pg-attribute.html""" - - part_of_pkey: int - name: str - type_id: int - atttypmod: int - - -@dataclass(frozen=True) -class TupleData: - n_columns: int - column_data: List[ColumnData] - - def __repr__(self) -> str: - return f"n_columns: {self.n_columns}, data: {self.column_data}" - - -class PgoutputMessage(ABC): - def __init__(self, buffer: bytes): - self.buffer: io.BytesIO = io.BytesIO(buffer) - self.byte1: str = self.read_utf8(1) - self.decode_buffer() - - @abstractmethod - def decode_buffer(self) -> None: - """Decoding is implemented for each message type""" - - @abstractmethod - def __repr__(self) -> str: - """Implemented for each message type""" - - def read_int8(self) -> int: - return convert_bytes_to_int(self.buffer.read(INT8)) - - def read_int16(self) -> int: - return convert_bytes_to_int(self.buffer.read(INT16)) - - def read_int32(self) -> int: - return convert_bytes_to_int(self.buffer.read(INT32)) - - def read_int64(self) -> int: - return convert_bytes_to_int(self.buffer.read(INT64)) - - def read_utf8(self, n: int = 1) -> str: - return convert_bytes_to_utf8(self.buffer.read(n)) - - def read_timestamp(self) -> datetime: - # 8 chars -> int64 -> timestamp - return convert_pg_ts(_ts_in_microseconds=self.read_int64()) - - def read_string(self) -> str: - output = bytearray() - while (next_char := self.buffer.read(1)) != b"\x00": - output += next_char - return convert_bytes_to_utf8(output) - - def read_tuple_data(self) -> TupleData: - """ - TupleData - Int16 Number of columns. - Next, one of the following submessages appears for each column (except generated columns): - Byte1('n') Identifies the data as NULL value. - Or - Byte1('u') Identifies unchanged TOASTed value (the actual value is not sent). - Or - Byte1('t') Identifies the data as text formatted value. - Int32 Length of the column value. - Byten The value of the column, in text format. (A future release might support additional formats.) n is the above length. - """ - # TODO: investigate what happens with the generated columns - column_data = list() - n_columns = self.read_int16() - for column in range(n_columns): - col_data_category = self.read_utf8() - if col_data_category in ("n", "u"): - # "n"=NULL, "t"=TOASTed - column_data.append(ColumnData(col_data_category=col_data_category)) - elif col_data_category == "t": - # t = tuple - col_data_length = self.read_int32() - col_data = self.read_utf8(col_data_length) - column_data.append( - ColumnData( - col_data_category=col_data_category, - col_data_length=col_data_length, - col_data=col_data, - ) - ) - return TupleData(n_columns=n_columns, column_data=column_data) - - -class Begin(PgoutputMessage): - """ - https://pgpedia.info/x/xlogrecptr.html - https://www.postgresql.org/docs/14/datatype-pg-lsn.html - - byte1 Byte1('B') Identifies the message as a begin message. - lsn Int64 The final LSN of the transaction. - commit_tx_ts Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). - tx_xid Int32 Xid of the transaction. 
- """ - - byte1: str - lsn: int - commit_ts: datetime - tx_xid: int - - def decode_buffer(self) -> None: - if self.byte1 != "B": - raise ValueError("first byte in buffer does not match Begin message") - self.lsn = self.read_int64() - self.commit_ts = self.read_timestamp() - self.tx_xid = self.read_int64() - - def __repr__(self) -> str: - return ( - f"BEGIN \n\tbyte1: '{self.byte1}', \n\tLSN: {self.lsn}, " - f"\n\tcommit_ts {self.commit_ts}, \n\ttx_xid: {self.tx_xid}" - ) - - -class Commit(PgoutputMessage): - """ - byte1: Byte1('C') Identifies the message as a commit message. - flags: Int8 Flags; currently unused (must be 0). - lsn_commit: Int64 The LSN of the commit. - lsn: Int64 The end LSN of the transaction. - Int64 Commit timestamp of the transaction. The value is in number of microseconds since PostgreSQL epoch (2000-01-01). - """ - - byte1: str - flags: int - lsn_commit: int - lsn: int - commit_ts: datetime - - def decode_buffer(self) -> None: - if self.byte1 != "C": - raise ValueError("first byte in buffer does not match Commit message") - self.flags = self.read_int8() - self.lsn_commit = self.read_int64() - self.lsn = self.read_int64() - self.commit_ts = self.read_timestamp() - - def __repr__(self) -> str: - return ( - f"COMMIT \n\tbyte1: {self.byte1}, \n\tflags {self.flags}, \n\tlsn_commit: {self.lsn_commit}" - f"\n\tLSN: {self.lsn}, \n\tcommit_ts {self.commit_ts}" - ) - - -class Origin: - """ - Byte1('O') Identifies the message as an origin message. - Int64 The LSN of the commit on the origin server. - String Name of the origin. - Note that there can be multiple Origin messages inside a single transaction. - This seems to be what origin means: https://www.postgresql.org/docs/12/replication-origins.html - """ - - pass - - -class Relation(PgoutputMessage): - """ - Byte1('R') Identifies the message as a relation message. - Int32 ID of the relation. - String Namespace (empty string for pg_catalog). - String Relation name. - Int8 Replica identity setting for the relation (same as relreplident in pg_class). - # select relreplident from pg_class where relname = 'test_table'; - # from reading the documentation and looking at the tables this is not int8 but a single character - # background: https://www.postgresql.org/docs/10/sql-altertable.html#SQL-CREATETABLE-REPLICA-IDENTITY - Int16 Number of columns. - Next, the following message part appears for each column (except generated columns): - Int8 Flags for the column. Currently can be either 0 for no flags or 1 which marks the column as part of the key. - String Name of the column. - Int32 ID of the column's data type. - Int32 Type modifier of the column (atttypmod). 
- """ - - byte1: str - relation_id: int - namespace: str - relation_name: str - replica_identity_setting: str - n_columns: int - columns: List[ColumnType] - - def decode_buffer(self) -> None: - if self.byte1 != "R": - raise ValueError("first byte in buffer does not match Relation message") - self.relation_id = self.read_int32() - self.namespace = self.read_string() - self.relation_name = self.read_string() - self.replica_identity_setting = self.read_utf8() - self.n_columns = self.read_int16() - self.columns = list() - - for column in range(self.n_columns): - part_of_pkey = self.read_int8() - col_name = self.read_string() - data_type_id = self.read_int32() - # TODO: check on use of signed / unsigned - # check with select oid from pg_type where typname = ; timestamp == 1184, int4 = 23 - col_modifier = self.read_int32() - self.columns.append( - ColumnType( - part_of_pkey=part_of_pkey, - name=col_name, - type_id=data_type_id, - atttypmod=col_modifier, - ) - ) - - def __repr__(self) -> str: - return ( - f"RELATION \n\tbyte1: '{self.byte1}', \n\trelation_id: {self.relation_id}" - f",\n\tnamespace/schema: '{self.namespace}',\n\trelation_name: '{self.relation_name}'" - f",\n\treplica_identity_setting: '{self.replica_identity_setting}',\n\tn_columns: {self.n_columns} " - f",\n\tcolumns: {self.columns}" - ) - - -class PgType: - """ - Renamed to PgType not to collide with "type" - - Byte1('Y') Identifies the message as a type message. - Int32 ID of the data type. - String Namespace (empty string for pg_catalog). - String Name of the data type. - """ - - pass - - -class Insert(PgoutputMessage): - """ - Byte1('I') Identifies the message as an insert message. - Int32 ID of the relation corresponding to the ID in the relation message. - Byte1('N') Identifies the following TupleData message as a new tuple. - TupleData TupleData message part representing the contents of new tuple. - """ - - byte1: str - relation_id: int - new_tuple_byte: str - new_tuple: TupleData - - def decode_buffer(self) -> None: - if self.byte1 != "I": - raise ValueError( - f"first byte in buffer does not match Insert message (expected 'I', got '{self.byte1}'" - ) - self.relation_id = self.read_int32() - self.new_tuple_byte = self.read_utf8() - self.new_tuple = self.read_tuple_data() - - def __repr__(self) -> str: - return ( - f"INSERT \n\tbyte1: '{self.byte1}', \n\trelation_id: {self.relation_id} " - f"\n\tnew tuple byte: '{self.new_tuple_byte}', \n\tnew_tuple: {self.new_tuple}" - ) - - -class Update(PgoutputMessage): - """ - Byte1('U') Identifies the message as an update message. - Int32 ID of the relation corresponding to the ID in the relation message. - Byte1('K') Identifies the following TupleData submessage as a key. This field is optional and is only present if the update changed data in any of the column(s) that are part of the REPLICA IDENTITY index. - Byte1('O') Identifies the following TupleData submessage as an old tuple. This field is optional and is only present if table in which the update happened has REPLICA IDENTITY set to FULL. - TupleData TupleData message part representing the contents of the old tuple or primary key. Only present if the previous 'O' or 'K' part is present. - Byte1('N') Identifies the following TupleData message as a new tuple. - TupleData TupleData message part representing the contents of a new tuple. - - The Update message may contain either a 'K' message part or an 'O' message part or neither of them, but never both of them. 
- """ - - byte1: str - relation_id: int - next_byte_identifier: Optional[str] - optional_tuple_identifier: Optional[str] - old_tuple: Optional[TupleData] - new_tuple_byte: str - new_tuple: TupleData - - def decode_buffer(self) -> None: - self.optional_tuple_identifier = None - self.old_tuple = None - if self.byte1 != "U": - raise ValueError( - f"first byte in buffer does not match Update message (expected 'U', got '{self.byte1}'" - ) - self.relation_id = self.read_int32() - # TODO test update to PK, test update with REPLICA IDENTITY = FULL - self.next_byte_identifier = self.read_utf8() # one of K, O or N - if self.next_byte_identifier == "K" or self.next_byte_identifier == "O": - self.optional_tuple_identifier = self.next_byte_identifier - self.old_tuple = self.read_tuple_data() - self.new_tuple_byte = self.read_utf8() - else: - self.new_tuple_byte = self.next_byte_identifier - if self.new_tuple_byte != "N": - # TODO: test exception handling - raise ValueError( - f"did not find new_tuple_byte ('N') at position: {self.buffer.tell()}, found: '{self.new_tuple_byte}'" - ) - self.new_tuple = self.read_tuple_data() - - def __repr__(self) -> str: - return ( - f"UPDATE \n\tbyte1: '{self.byte1}', \n\trelation_id: {self.relation_id}" - f"\n\toptional_tuple_identifier: '{self.optional_tuple_identifier}', \n\toptional_old_tuple_data: {self.old_tuple}" - f"\n\tnew_tuple_byte: '{self.new_tuple_byte}', \n\tnew_tuple: {self.new_tuple}" - ) - - -class Delete(PgoutputMessage): - """ - Byte1('D') Identifies the message as a delete message. - Int32 ID of the relation corresponding to the ID in the relation message. - Byte1('K') Identifies the following TupleData submessage as a key. This field is present if the table in which the delete has happened uses an index as REPLICA IDENTITY. - Byte1('O') Identifies the following TupleData message as a old tuple. This field is present if the table in which the delete has happened has REPLICA IDENTITY set to FULL. - TupleData TupleData message part representing the contents of the old tuple or primary key, depending on the previous field. - - The Delete message may contain either a 'K' message part or an 'O' message part, but never both of them. - """ - - byte1: str - relation_id: int - message_type: str - old_tuple: TupleData - - def decode_buffer(self) -> None: - if self.byte1 != "D": - raise ValueError( - f"first byte in buffer does not match Delete message (expected 'D', got '{self.byte1}'" - ) - self.relation_id = self.read_int32() - self.message_type = self.read_utf8() - # TODO: test with replica identity full - if self.message_type not in ["K", "O"]: - raise ValueError( - f"message type byte is not 'K' or 'O', got: '{self.message_type}'" - ) - self.old_tuple = self.read_tuple_data() - - def __repr__(self) -> str: - return ( - f"DELETE \n\tbyte1: {self.byte1} \n\trelation_id: {self.relation_id} " - f"\n\tmessage_type: {self.message_type} \n\told_tuple: {self.old_tuple}" - ) - - -class Truncate(PgoutputMessage): - """ - Byte1('T') Identifies the message as a truncate message. - Int32 Number of relations - Int8 Option bits for TRUNCATE: 1 for CASCADE, 2 for RESTART IDENTITY - Int32 ID of the relation corresponding to the ID in the relation message. This field is repeated for each relation. 
- """ - - byte1: str - number_of_relations: int - option_bits: int - relation_ids: List[int] - - def decode_buffer(self) -> None: - if self.byte1 != "T": - raise ValueError( - f"first byte in buffer does not match Truncate message (expected 'T', got '{self.byte1}'" - ) - self.number_of_relations = self.read_int32() - self.option_bits = self.read_int8() - self.relation_ids = [] - for relation in range(self.number_of_relations): - self.relation_ids.append(self.read_int32()) - - def __repr__(self) -> str: - return ( - f"TRUNCATE \n\tbyte1: {self.byte1} \n\tn_relations: {self.number_of_relations} " - f"option_bits: {self.option_bits}, relation_ids: {self.relation_ids}" - ) diff --git a/sources/pg_legacy_replication/exceptions.py b/sources/pg_legacy_replication/exceptions.py index df52c4bab..ea850999d 100644 --- a/sources/pg_legacy_replication/exceptions.py +++ b/sources/pg_legacy_replication/exceptions.py @@ -1,11 +1,3 @@ -class NoPrimaryKeyException(Exception): - pass - - -class IncompatiblePostgresVersionException(Exception): - pass - - class SqlDatabaseSourceImportError(Exception): def __init__(self) -> None: super().__init__( diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 50ab6fc53..d1db38d36 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -16,6 +16,7 @@ from dlt.common import logger from dlt.common.pendulum import pendulum from dlt.common.schema.typing import ( + TColumnNames, TTableSchema, TTableSchemaColumns, ) @@ -42,10 +43,10 @@ def init_replication( slot_name: str, schema: str = dlt.config.value, - table_names: Sequence[str] = dlt.config.value, + table_names: List[str] = dlt.config.value, credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = False, - included_columns: Optional[Dict[str, Sequence[str]]] = None, + included_columns: Optional[Dict[str, TColumnNames]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, reset: bool = False, ) -> Optional[List[DltResource]]: @@ -162,7 +163,7 @@ def _prepare_snapshot_resource( table_name: str, schema: str, *, - included_columns: Optional[Sequence[str]] = None, + included_columns: Optional[TColumnNames] = None, columns: Optional[TTableSchemaColumns] = None, ) -> DltResource: t_rsrc: DltResource = sql_table( # type: ignore[name-defined] @@ -328,13 +329,20 @@ def __init__( upto_lsn: int, table_qnames: Set[str], target_batch_size: int = 1000, - included_columns: Optional[Dict[str, Sequence[str]]] = None, + included_columns: Optional[Dict[str, TColumnNames]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, ) -> None: self.upto_lsn = upto_lsn self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.included_columns = included_columns or {} + self.included_columns = ( + { + table_name: _normalize_included_columns(columns) + for table_name, columns in included_columns.items() + } + if included_columns + else {} + ) self.columns = columns or {} self.consumed_all: bool = False @@ -404,7 +412,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: return _, table_name = msg.table.split(".") last_table_schema = self.last_table_schema.get(table_name) - table_schema = extract_table_schema( + table_schema = infer_table_schema( msg, column_hints=self.columns.get(table_name), included_columns=self.included_columns.get(table_name), @@ -417,11 +425,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: data_item = gen_data_item( msg.new_tuple, 
column_schema=table_schema["columns"], - included_columns=( - None - if self.included_columns is None - else self.included_columns.get(table_name) - ), + included_columns=self.included_columns.get(table_name), ) data_item["lsn"] = lsn self.data_items[table_name].append(data_item) @@ -434,11 +438,7 @@ def process_delete(self, msg: RowMessage, lsn: int) -> None: data_item = gen_data_item( msg.old_tuple, for_delete=True, - included_columns=( - None - if self.included_columns is None - else self.included_columns.get(table_name) - ), + included_columns=self.included_columns.get(table_name), ) data_item["lsn"] = lsn data_item["deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) @@ -454,7 +454,7 @@ class ItemGenerator: upto_lsn: int start_lsn: int = 0 target_batch_size: int = 1000 - included_columns: Optional[Dict[str, Sequence[str]]] = None + included_columns: Optional[Dict[str, TColumnNames]] = None columns: Optional[Dict[str, TTableSchemaColumns]] = None last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -507,53 +507,43 @@ def flush( self.generated_all = consumer.consumed_all -# FIXME Refactor later - -_DATUM_PRECISIONS: Dict[str, int] = { - "datum_int32": 32, - "datum_int64": 64, - "datum_float": 32, - "datum_double": 64, -} - - -def extract_table_schema( - row_msg: RowMessage, +def infer_table_schema( + msg: RowMessage, *, column_hints: Optional[TTableSchemaColumns] = None, - included_columns: Optional[Sequence[str]] = None, + included_columns: Optional[Set[str]] = None, ) -> TTableSchema: + """Infers the table schema from the replication message and optional hints""" columns: TTableSchemaColumns = { - "lsn": { - "data_type": "bigint", - "nullable": True, - "dedup_sort": "desc", - }, - "deleted_ts": { - "data_type": "timestamp", - "nullable": True, - "hard_delete": True, - }, - } - for col, col_info in zip(row_msg.new_tuple, row_msg.new_typeinfo): - col_name = col.column_name - if included_columns and col_name not in included_columns: - continue - col_schema = _to_dlt_column_schema(col, col_info) - columns[col_name] = ( - merge_column(col_schema, column_hints.get(col_name)) - if column_hints and column_hints.get(col_name) - else col_schema + col.column_name: ( + merge_column( + _to_dlt_column_schema(col, col_info), + column_hints.get(col.column_name), + ) + if column_hints and col.column_name in column_hints + else _to_dlt_column_schema(col, col_info) ) + for col, col_info in zip(msg.new_tuple, msg.new_typeinfo) + if not included_columns or col.column_name in included_columns + } - _, table_name = row_msg.table.split(".") - return {"name": table_name, "columns": columns} + columns["lsn"] = {"data_type": "bigint", "nullable": True, "dedup_sort": "desc"} + columns["deleted_ts"] = { + "data_type": "timestamp", + "nullable": True, + "hard_delete": True, + } + + return { + "name": msg.table.split(".")[1], + "columns": columns, + } def gen_data_item( row: Sequence[DatumMessage], *, - included_columns: Optional[Sequence[str]] = None, + included_columns: Optional[Set[str]] = None, column_schema: Optional[TTableSchemaColumns] = None, for_delete: bool = False, ) -> TDataItem: @@ -570,3 +560,15 @@ def gen_data_item( data_item[col_name] = _to_dlt_val(data, data_type, for_delete=for_delete) return data_item + + +def _normalize_included_columns( + included_columns: Optional[TColumnNames], +) -> Optional[Set[str]]: + if included_columns is None: + return None + return ( + {included_columns} + if isinstance(included_columns, str) + else 
set(included_columns) + ) diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index 2f84b9ec4..8b1945233 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -205,7 +205,7 @@ def replicate_with_column_selection() -> None: changes = replication_resource( slot_name=slot_name, included_columns={ - "tbl_x": ("c1", "c2") + "tbl_x": ["c1", "c2"] }, # columns not specified here are excluded from generated data items ) diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index f6bdb04d4..df12e39bc 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -8,7 +8,7 @@ from google.protobuf.json_format import ParseDict as parse_dict from sources.pg_legacy_replication.helpers import ( - extract_table_schema, + infer_table_schema, gen_data_item, ) from sources.pg_legacy_replication.pg_logicaldec_pb2 import RowMessage @@ -88,14 +88,14 @@ }, ], ) -def test_extract_table_schema( +def test_infer_table_schema( data, column_hints: Optional[TTableSchemaColumns], expected_schema: TTableSchema, ): row_msg = RowMessage() parse_dict(data, row_msg) - assert extract_table_schema(row_msg, column_hints=column_hints) == expected_schema + assert infer_table_schema(row_msg, column_hints=column_hints) == expected_schema LSN = random.randint(0, 10000) @@ -215,7 +215,7 @@ def test_extract_table_schema( def test_gen_data_item(data, data_item: TDataItem): row_msg = RowMessage() parse_dict(data, row_msg) - column_schema = extract_table_schema(row_msg)["columns"] + column_schema = infer_table_schema(row_msg)["columns"] assert gen_data_item(row_msg.new_tuple, column_schema=column_schema) == data_item diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 0e89a5753..d85295cf1 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -1,28 +1,21 @@ -import pytest - -from typing import Dict, Set, Sequence, Tuple from copy import deepcopy -from psycopg2.errors import InsufficientPrivilege +from typing import Dict, Set, Tuple import dlt +import pytest from dlt.destinations.job_client_impl import SqlJobClientBase -from tests.utils import ( - ALL_DESTINATIONS, - assert_load_info, - load_table_counts, - get_table_metrics, -) from sources.pg_legacy_replication import replication_resource from sources.pg_legacy_replication.helpers import ( init_replication, get_pg_version, cleanup_snapshot_resources, ) -from sources.pg_legacy_replication.exceptions import ( - IncompatiblePostgresVersionException, +from tests.utils import ( + ALL_DESTINATIONS, + assert_load_info, + load_table_counts, ) - from .cases import TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE_COLUMNS_SCHEMA from .utils import add_pk, assert_loaded_data, is_super_user @@ -394,44 +387,9 @@ def test_unmapped_data_types( assert columns["uuid_col"]["data_type"] == "text" -@pytest.mark.parametrize("publish", ["insert", "insert, update, delete"]) -def test_write_disposition(src_config: Tuple[dlt.Pipeline, str], publish: str) -> None: - @dlt.resource - def items(data): - yield data - - src_pl, slot_name = src_config - - # create postgres table - src_pl.run(items({"id": 1, "val": True})) - - # create resources - snapshot = init_replication( - slot_name=slot_name, - schema=src_pl.dataset_name, - table_names="items", - publish=publish, - take_snapshots=True, - ) - - # 
assert write disposition on snapshot resource - expected_write_disposition = "append" if publish == "insert" else "merge" - assert snapshot.write_disposition == expected_write_disposition - - # assert write disposition on tables dispatched by changes resource - changes = replication_resource(slot_name) - src_pl.run(items({"id": 2, "val": True})) - dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) - dest_pl.extract(changes) - assert ( - dest_pl.default_schema.get_table("items")["write_disposition"] - == expected_write_disposition - ) - - @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("init_load", [True, False]) -def test_include_columns( +def test_included_columns( src_config: Tuple[dlt.Pipeline, str], destination_name: str, init_load: bool ) -> None: def get_cols(pipeline: dlt.Pipeline, table_name: str) -> set: @@ -467,20 +425,24 @@ def tbl_z(data): ) # initialize replication and create resources - include_columns: Dict[str, Sequence[str]] = { - "tbl_x": ["id_x", "val_x"], - "tbl_y": ["id_y", "val_y"], + included_columns: Dict[str, Set[str]] = { + "tbl_x": {"id_x", "val_x"}, + "tbl_y": {"id_y", "val_y"}, # tbl_z is not specified, hence all columns should be included } snapshots = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), - publish="insert", take_snapshots=init_load, - include_columns=include_columns, + included_columns=included_columns, + ) + changes = replication_resource( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y", "tbl_z"), + included_columns=included_columns, ) - changes = replication_resource(slot_name=slot_name, include_columns=include_columns) # update three postgres tables src_pl.run( @@ -501,9 +463,15 @@ def tbl_z(data): assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y"} assert get_cols(dest_pl, "tbl_z") == {"id_z", "val_z", "another_col_z"} dest_pl.run(changes) - assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x", "lsn"} - assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y", "lsn"} - assert get_cols(dest_pl, "tbl_z") == {"id_z", "val_z", "another_col_z", "lsn"} + assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x", "lsn", "deleted_ts"} + assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y", "lsn", "deleted_ts"} + assert get_cols(dest_pl, "tbl_z") == { + "id_z", + "val_z", + "another_col_z", + "lsn", + "deleted_ts", + } @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @@ -544,11 +512,15 @@ def tbl_z(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), - publish="insert", take_snapshots=init_load, columns=column_hints, ) - changes = replication_resource(slot_name=slot_name, columns=column_hints) + changes = replication_resource( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("tbl_x", "tbl_y", "tbl_z"), + columns=column_hints, + ) # update three postgres tables src_pl.run( @@ -622,12 +594,15 @@ def test_table_schema_change( init_replication( slot_name=slot_name, schema=src_pl.dataset_name, - table_names="items", - publish="insert", + table_names=("items",), ) # create resource and pipeline - changes = replication_resource(slot_name) + changes = replication_resource( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("items",), + ) dest_pl = dlt.pipeline( pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) @@ -727,9 +702,14 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: 
init_replication( slot_name=slot_name, schema=src_pl.dataset_name, - table_names="items", + table_names=["items"], + ) + changes = replication_resource( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=["items"], + target_batch_size=50, ) - changes = replication_resource(slot_name, target_batch_size=50) # create destination pipeline and resource dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) From 02851f41c889a165d0d498aa4af07bad2edc8f5e Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 22 Oct 2024 19:28:34 +0200 Subject: [PATCH 29/88] wip: still trying to get it work with all versions of dlt --- sources/pg_legacy_replication/schema_types.py | 42 +++++++++++++++---- tests/pg_legacy_replication/cases.py | 12 ++++-- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 85ad17712..7c84b09e3 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -15,6 +15,7 @@ "binary": b" ", "bool": True, "complex": [0], + "json": [0], # type: ignore[dict-item] "date": "2000-01-01", "decimal": Decimal(0), "double": 0.0, @@ -90,13 +91,26 @@ def _type_mapper() -> Any: from dlt.destinations import postgres try: - from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper - except ImportError: from dlt.destinations.impl.postgres.factory import PostgresTypeMapper # type: ignore + except ImportError: + from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper return PostgresTypeMapper(postgres().capabilities()) +# FIXME Hack to get it to work with 0.5.x and 1.x +def _from_destination_type( + db_type: str, precision: Optional[int] = None, scale: Optional[int] = None +) -> TColumnType: + mapper = _type_mapper() + from_db_type: Callable[[str, Optional[int], Optional[int]], TColumnType] + if hasattr(mapper, "from_destination_type"): + from_db_type = mapper.from_destination_type + else: + from_db_type = mapper.from_db_type + return from_db_type(db_type, precision, scale) + + def _to_dlt_column_type(type_id: int, modifier: str) -> TColumnType: """Converts postgres type OID to dlt column type. 
@@ -104,7 +118,7 @@ def _to_dlt_column_type(type_id: int, modifier: str) -> TColumnType: """ pg_type = _PG_TYPES.get(type_id) precision, scale = _get_precision_and_scale(type_id, modifier) - return _type_mapper().from_db_type(pg_type, precision, scale) # type: ignore[no-any-return] + return _from_destination_type(pg_type, precision, scale) def _to_dlt_column_schema(datum: DatumMessage, type_info: TypeInfo) -> TColumnSchema: @@ -141,7 +155,7 @@ def _to_dlt_val( ) -> Any: """Converts decoderbuf's datum value into dlt-compatible data value.""" if isinstance(data_type, int): - col_type: TColumnType = _type_mapper().from_db_type(_PG_TYPES[data_type]) + col_type: TColumnType = _from_destination_type(_PG_TYPES[data_type]) data_type = col_type["data_type"] datum = val.WhichOneof("datum") @@ -152,8 +166,18 @@ def _to_dlt_val( if data_type in data_type_handlers: return data_type_handlers[data_type](raw_value) - return coerce_value( - to_type=data_type, - from_type=_DATUM_RAW_TYPES[datum], - value=raw_value, - ) + try: + return coerce_value( + to_type=data_type, + from_type=_DATUM_RAW_TYPES[datum], + value=raw_value, + ) + except ValueError: + # FIXME Hack to get it to work with 0.5.x and 1.x + if data_type == "json": + return coerce_value( + "complex", + from_type=_DATUM_RAW_TYPES[datum], + value=raw_value, + ) + raise diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index a17efcad7..12f1a4bf1 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -1,9 +1,9 @@ from typing import List from dlt.common import Decimal +from dlt.common.data_types.typing import DATA_TYPES from dlt.common.schema import TColumnSchema, TTableSchemaColumns - TABLE_ROW_ALL_DATA_TYPES = { "col1": 989127831, "col2": 898912.821982, @@ -49,7 +49,7 @@ {"name": "col6", "data_type": "decimal", "nullable": False}, {"name": "col7", "data_type": "binary", "nullable": False}, # {"name": "col8", "data_type": "wei", "nullable": False}, - {"name": "col9", "data_type": "complex", "nullable": False, "variant": True}, + {"name": "col9", "data_type": "json", "nullable": False, "variant": True}, {"name": "col10", "data_type": "date", "nullable": False}, {"name": "col11", "data_type": "time", "nullable": False}, {"name": "col1_null", "data_type": "bigint", "nullable": True}, @@ -60,7 +60,7 @@ {"name": "col6_null", "data_type": "decimal", "nullable": True}, {"name": "col7_null", "data_type": "binary", "nullable": True}, # {"name": "col8_null", "data_type": "wei", "nullable": True}, - {"name": "col9_null", "data_type": "complex", "nullable": True, "variant": True}, + {"name": "col9_null", "data_type": "json", "nullable": True, "variant": True}, {"name": "col10_null", "data_type": "date", "nullable": True}, {"name": "col11_null", "data_type": "time", "nullable": True}, { @@ -91,4 +91,10 @@ }, {"name": "col11_precision", "data_type": "time", "precision": 3, "nullable": False}, ] + +if "complex" in DATA_TYPES: + for col_schema in TABLE_UPDATE: + if col_schema["data_type"] == "json": + col_schema["data_type"] = "complex" + TABLE_UPDATE_COLUMNS_SCHEMA: TTableSchemaColumns = {t["name"]: t for t in TABLE_UPDATE} From beef6eac6c884c45e871fdca8a717690633c75fd Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 23 Oct 2024 00:10:22 +0200 Subject: [PATCH 30/88] wip --- sources/pg_legacy_replication/__init__.py | 6 +- sources/pg_legacy_replication/helpers.py | 6 +- sources/pg_legacy_replication_pipeline.py | 136 +++++++++-------- tests/pg_legacy_replication/cases.py | 4 +- 
.../test_pg_replication.py | 144 +++++++++--------- 5 files changed, 155 insertions(+), 141 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 682466755..cccd1f896 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -17,8 +17,8 @@ ) def replication_resource( slot_name: str, - schema: str = dlt.config.value, - table_names: List[str] = dlt.config.value, + schema: str, + table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, included_columns: Optional[Dict[str, TColumnNames]] = None, columns: Optional[Dict[str, TTableSchemaColumns]] = None, @@ -83,6 +83,8 @@ def replication_resource( if upto_lsn is None: return + if isinstance(table_names, str): + table_names = [table_names] table_qnames = {f"{schema}.{table_name}" for table_name in table_names} # generate items in batches diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index d1db38d36..263a0db2d 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -42,8 +42,8 @@ @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) def init_replication( slot_name: str, - schema: str = dlt.config.value, - table_names: List[str] = dlt.config.value, + schema: str, + table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = False, included_columns: Optional[Dict[str, TColumnNames]] = None, @@ -126,6 +126,8 @@ def on_begin(conn: ConnectionSqla) -> None: cur.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ") cur.execute(f"SET TRANSACTION SNAPSHOT '{slot['snapshot_name']}'") + if isinstance(table_names, str): + table_names = [table_names] included_columns = included_columns or {} columns = columns or {} return [ diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index 8b1945233..6db9c63fb 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -24,7 +24,7 @@ def replicate_single_table() -> None: pipeline_name="pg_replication_pipeline", destination="duckdb", dataset_name="replicate_single_table", - full_refresh=True, + dev_mode=True, ) # create table "my_source_table" in source to demonstrate replication @@ -37,12 +37,16 @@ def replicate_single_table() -> None: init_replication( # requires the Postgres user to have the REPLICATION attribute assigned slot_name=slot_name, schema=src_pl.dataset_name, - table_names=["my_source_table"], + table_names="my_source_table", reset=True, ) # create a resource that generates items for each change in the source table - changes = replication_resource(slot_name) + changes = replication_resource( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names="my_source_table", + ) # insert two records in source table and propagate changes to destination change_source_table( @@ -74,7 +78,7 @@ def replicate_with_initial_load() -> None: pipeline_name="pg_replication_pipeline", destination="duckdb", dataset_name="replicate_with_initial_load", - full_refresh=True, + dev_mode=True, ) # create table "my_source_table" in source to demonstrate replication @@ -92,7 +96,7 @@ def replicate_with_initial_load() -> None: snapshot = init_replication( # requires the Postgres user to have the REPLICATION attribute assigned slot_name=slot_name, schema=src_pl.dataset_name, - 
table_names=["my_source_table"], + table_names="my_source_table", take_snapshots=True, # persist snapshot table(s) and let function return resource(s) for initial load reset=True, ) @@ -103,67 +107,71 @@ def replicate_with_initial_load() -> None: # insert record in source table and propagate change to destination change_source_table(src_pl, "INSERT INTO {table_name} VALUES (3, true);") - changes = replication_resource(slot_name) - dest_pl.run(changes) - show_destination_table(dest_pl) - - -def replicate_entire_schema() -> None: - """Demonstrates setup and usage of schema replication. - - Schema replication requires a Postgres server version of 15 or higher. An - exception is raised if that's not the case. - """ - # create source and destination pipelines - src_pl = get_postgres_pipeline() - dest_pl = dlt.pipeline( - pipeline_name="pg_replication_pipeline", - destination="duckdb", - dataset_name="replicate_entire_schema", - full_refresh=True, - ) - - # create two source tables to demonstrate schema replication - create_source_table( - src_pl, - "CREATE TABLE {table_name} (id integer PRIMARY KEY, val bool);", - "tbl_x", - ) - create_source_table( - src_pl, - "CREATE TABLE {table_name} (id integer PRIMARY KEY, val varchar);", - "tbl_y", - ) - - # initialize schema replication by omitting the `table_names` argument - slot_name = "example_slot" - init_replication( # initializing schema replication requires the Postgres user to be a superuser + changes = replication_resource( slot_name=slot_name, schema=src_pl.dataset_name, - reset=True, + table_names="my_source_table", ) - - # create a resource that generates items for each change in the schema's tables - changes = replication_resource(slot_name) - - # insert records in source tables and propagate changes to destination - change_source_table( - src_pl, "INSERT INTO {table_name} VALUES (1, true), (2, false);", "tbl_x" - ) - change_source_table(src_pl, "INSERT INTO {table_name} VALUES (1, 'foo');", "tbl_y") dest_pl.run(changes) - show_destination_table(dest_pl, "tbl_x") - show_destination_table(dest_pl, "tbl_y") + show_destination_table(dest_pl) - # tables added to the schema later are also included in the replication - create_source_table( - src_pl, "CREATE TABLE {table_name} (id integer PRIMARY KEY, val date);", "tbl_z" - ) - change_source_table( - src_pl, "INSERT INTO {table_name} VALUES (1, '2023-03-18');", "tbl_z" - ) - dest_pl.run(changes) - show_destination_table(dest_pl, "tbl_z") + +# def replicate_entire_schema() -> None: +# """Demonstrates setup and usage of schema replication. +# +# Schema replication requires a Postgres server version of 15 or higher. An +# exception is raised if that's not the case. 
+# """ +# # create source and destination pipelines +# src_pl = get_postgres_pipeline() +# dest_pl = dlt.pipeline( +# pipeline_name="pg_replication_pipeline", +# destination="duckdb", +# dataset_name="replicate_entire_schema", +# dev_mode=True, +# ) +# +# # create two source tables to demonstrate schema replication +# create_source_table( +# src_pl, +# "CREATE TABLE {table_name} (id integer PRIMARY KEY, val bool);", +# "tbl_x", +# ) +# create_source_table( +# src_pl, +# "CREATE TABLE {table_name} (id integer PRIMARY KEY, val varchar);", +# "tbl_y", +# ) +# +# # initialize schema replication by omitting the `table_names` argument +# slot_name = "example_slot" +# init_replication( # initializing schema replication requires the Postgres user to be a superuser +# slot_name=slot_name, +# schema=src_pl.dataset_name, +# reset=True, +# ) +# +# # create a resource that generates items for each change in the schema's tables +# changes = replication_resource(slot_name) +# +# # insert records in source tables and propagate changes to destination +# change_source_table( +# src_pl, "INSERT INTO {table_name} VALUES (1, true), (2, false);", "tbl_x" +# ) +# change_source_table(src_pl, "INSERT INTO {table_name} VALUES (1, 'foo');", "tbl_y") +# dest_pl.run(changes) +# show_destination_table(dest_pl, "tbl_x") +# show_destination_table(dest_pl, "tbl_y") +# +# # tables added to the schema later are also included in the replication +# create_source_table( +# src_pl, "CREATE TABLE {table_name} (id integer PRIMARY KEY, val date);", "tbl_z" +# ) +# change_source_table( +# src_pl, "INSERT INTO {table_name} VALUES (1, '2023-03-18');", "tbl_z" +# ) +# dest_pl.run(changes) +# show_destination_table(dest_pl, "tbl_z") def replicate_with_column_selection() -> None: @@ -177,7 +185,7 @@ def replicate_with_column_selection() -> None: pipeline_name="pg_replication_pipeline", destination="duckdb", dataset_name="replicate_with_column_selection", - full_refresh=True, + dev_mode=True, ) # create two source tables to demonstrate schema replication @@ -204,6 +212,8 @@ def replicate_with_column_selection() -> None: # create a resource that generates items for each change in the schema's tables changes = replication_resource( slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=["tbl_x", "tbl_y"], included_columns={ "tbl_x": ["c1", "c2"] }, # columns not specified here are excluded from generated data items @@ -240,7 +250,7 @@ def get_postgres_pipeline() -> dlt.Pipeline: pipeline_name="source_pipeline", destination=Destination.from_reference("postgres", credentials=PG_CREDS), dataset_name="source_dataset", - full_refresh=True, + dev_mode=True, ) return pipe diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 12f1a4bf1..bb99ad52e 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -49,7 +49,7 @@ {"name": "col6", "data_type": "decimal", "nullable": False}, {"name": "col7", "data_type": "binary", "nullable": False}, # {"name": "col8", "data_type": "wei", "nullable": False}, - {"name": "col9", "data_type": "json", "nullable": False, "variant": True}, + {"name": "col9", "data_type": "json", "nullable": False, "variant": True}, # type: ignore[typeddict-item] {"name": "col10", "data_type": "date", "nullable": False}, {"name": "col11", "data_type": "time", "nullable": False}, {"name": "col1_null", "data_type": "bigint", "nullable": True}, @@ -60,7 +60,7 @@ {"name": "col6_null", "data_type": "decimal", "nullable": True}, {"name": "col7_null", 
"data_type": "binary", "nullable": True}, # {"name": "col8_null", "data_type": "wei", "nullable": True}, - {"name": "col9_null", "data_type": "json", "nullable": True, "variant": True}, + {"name": "col9_null", "data_type": "json", "nullable": True, "variant": True}, # type: ignore[typeddict-item] {"name": "col10_null", "data_type": "date", "nullable": True}, {"name": "col11_null", "data_type": "time", "nullable": True}, { diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index d85295cf1..d47e0bb9a 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -1,14 +1,14 @@ from copy import deepcopy -from typing import Dict, Set, Tuple +from typing import Dict, Sequence, Tuple import dlt import pytest +from dlt.common.schema.typing import TTableSchemaColumns from dlt.destinations.job_client_impl import SqlJobClientBase from sources.pg_legacy_replication import replication_resource from sources.pg_legacy_replication.helpers import ( init_replication, - get_pg_version, cleanup_snapshot_resources, ) from tests.utils import ( @@ -17,7 +17,7 @@ load_table_counts, ) from .cases import TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE_COLUMNS_SCHEMA -from .utils import add_pk, assert_loaded_data, is_super_user +from .utils import add_pk, assert_loaded_data @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @@ -255,7 +255,7 @@ def items(data): snapshot = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, - table_names=("items",), + table_names="items", take_snapshots=init_load, columns={"items": column_schema} if give_hints else None, ) @@ -268,7 +268,7 @@ def items(data): changes = replication_resource( slot_name=slot_name, schema=src_pl.dataset_name, - table_names=("items",), + table_names="items", columns={"items": column_schema}, ) @@ -361,12 +361,12 @@ def test_unmapped_data_types( init_replication( slot_name=slot_name, schema=src_pl.dataset_name, - table_names=("data_types",), + table_names="data_types", ) changes = replication_resource( slot_name=slot_name, schema=src_pl.dataset_name, - table_names=("data_types",), + table_names="data_types", ) # insert record in source table to create replication item @@ -425,9 +425,9 @@ def tbl_z(data): ) # initialize replication and create resources - included_columns: Dict[str, Set[str]] = { - "tbl_x": {"id_x", "val_x"}, - "tbl_y": {"id_y", "val_y"}, + included_columns: Dict[str, Sequence[str]] = { + "tbl_x": ("id_x", "val_x"), + "tbl_y": ("id_y", "val_y"), # tbl_z is not specified, hence all columns should be included } snapshots = init_replication( @@ -503,7 +503,7 @@ def tbl_z(data): ) # initialize replication and create resources - column_hints = { + column_hints: Dict[str, TTableSchemaColumns] = { "tbl_x": {"another_col_x": {"data_type": "double"}}, "tbl_y": {"another_col_y": {"precision": 32}}, # tbl_z is not specified, hence all columns should be included @@ -594,14 +594,14 @@ def test_table_schema_change( init_replication( slot_name=slot_name, schema=src_pl.dataset_name, - table_names=("items",), + table_names="items", ) # create resource and pipeline changes = replication_resource( slot_name=slot_name, schema=src_pl.dataset_name, - table_names=("items",), + table_names="items", ) dest_pl = dlt.pipeline( pipeline_name="dest_pl", destination=destination_name, dev_mode=True @@ -630,63 +630,63 @@ def test_table_schema_change( ) -def test_replicate_schema(src_config: Tuple[dlt.Pipeline, str]) -> None: - if 
get_pg_version() < 150000: - pytest.skip("incompatible Postgres server version") - if not is_super_user(src_config[0].sql_client): - pytest.skip("Postgres user needs to be superuser") - - @dlt.resource - def tbl_x(data): - yield data - - @dlt.resource - def tbl_y(data): - yield data - - @dlt.resource - def tbl_z(data): - yield data - - src_pl, slot_name = src_config - - # create two postgres tables - src_pl.run( - [ - tbl_x({"id_x": 1, "val_x": "foo"}), - tbl_y({"id_y": 1, "val_y": "foo"}), - ] - ) - - # initialize replication and create resource - init_replication( - slot_name=slot_name, - schema=src_pl.dataset_name, # we only specify `schema`, not `table_names` - publish="insert", - ) - changes = replication_resource(slot_name) - - # change source tables and load to destination - src_pl.run( - [ - tbl_x({"id_x": 2, "val_x": "foo"}), - tbl_y({"id_y": 2, "val_y": "foo"}), - ] - ) - dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) - dest_pl.extract(changes) - assert set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y"} - - # introduce new table in source and assert it gets included in the replication - src_pl.run( - [ - tbl_x({"id_x": 3, "val_x": "foo"}), - tbl_y({"id_y": 3, "val_y": "foo"}), - tbl_z({"id_z": 1, "val_z": "foo"}), - ] - ) - dest_pl.extract(changes) - assert set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y", "tbl_z"} +# def test_replicate_schema(src_config: Tuple[dlt.Pipeline, str]) -> None: +# if get_pg_version() < 150000: +# pytest.skip("incompatible Postgres server version") +# if not is_super_user(src_config[0].sql_client): +# pytest.skip("Postgres user needs to be superuser") +# +# @dlt.resource +# def tbl_x(data): +# yield data +# +# @dlt.resource +# def tbl_y(data): +# yield data +# +# @dlt.resource +# def tbl_z(data): +# yield data +# +# src_pl, slot_name = src_config +# +# # create two postgres tables +# src_pl.run( +# [ +# tbl_x({"id_x": 1, "val_x": "foo"}), +# tbl_y({"id_y": 1, "val_y": "foo"}), +# ] +# ) +# +# # initialize replication and create resource +# init_replication( +# slot_name=slot_name, +# schema=src_pl.dataset_name, # we only specify `schema`, not `table_names` +# publish="insert", +# ) +# changes = replication_resource(slot_name) +# +# # change source tables and load to destination +# src_pl.run( +# [ +# tbl_x({"id_x": 2, "val_x": "foo"}), +# tbl_y({"id_y": 2, "val_y": "foo"}), +# ] +# ) +# dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) +# dest_pl.extract(changes) +# assert set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y"} +# +# # introduce new table in source and assert it gets included in the replication +# src_pl.run( +# [ +# tbl_x({"id_x": 3, "val_x": "foo"}), +# tbl_y({"id_y": 3, "val_y": "foo"}), +# tbl_z({"id_z": 1, "val_z": "foo"}), +# ] +# ) +# dest_pl.extract(changes) +# assert set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y", "tbl_z"} def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: @@ -702,12 +702,12 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: init_replication( slot_name=slot_name, schema=src_pl.dataset_name, - table_names=["items"], + table_names="items", ) changes = replication_resource( slot_name=slot_name, schema=src_pl.dataset_name, - table_names=["items"], + table_names="items", target_batch_size=50, ) From 77242e84207d8f808ba6d7e9d5f0fcba31ef868e Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 23 Oct 2024 02:47:34 +0200 Subject: [PATCH 31/88] wip: changing signature --- 
sources/pg_legacy_replication/__init__.py | 8 +- sources/pg_legacy_replication/helpers.py | 62 +++++--- tests/pg_legacy_replication/test_helpers.py | 134 +++++++++--------- .../test_pg_replication.py | 38 ++--- 4 files changed, 128 insertions(+), 114 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index cccd1f896..b674333d6 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -1,9 +1,9 @@ """Replicates postgres tables in batch using logical decoding.""" -from typing import Dict, Sequence, Optional, Iterable, Union, List +from typing import Dict, Sequence, Optional, Iterable, Union import dlt -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns +from dlt.common.schema.typing import TColumnNames, TTableSchema from dlt.common.typing import TDataItem from dlt.extract.items import DataItemWithMeta from dlt.sources.credentials import ConnectionStringCredentials @@ -21,7 +21,7 @@ def replication_resource( table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, included_columns: Optional[Dict[str, TColumnNames]] = None, - columns: Optional[Dict[str, TTableSchemaColumns]] = None, + table_hints: Optional[Dict[str, TTableSchema]] = None, target_batch_size: int = 1000, flush_slot: bool = True, ) -> Iterable[Union[TDataItem, DataItemWithMeta]]: @@ -98,7 +98,7 @@ def replication_resource( start_lsn=start_lsn, target_batch_size=target_batch_size, included_columns=included_columns, - columns=columns, + table_hints=table_hints, ) yield from gen if gen.generated_all: diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 263a0db2d..a9ec3c1f9 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -20,7 +20,7 @@ TTableSchema, TTableSchemaColumns, ) -from dlt.common.schema.utils import merge_column +from dlt.common.schema.utils import merge_column, merge_table from dlt.common.typing import TDataItem from dlt.extract.items import DataItemWithMeta from dlt.extract.resource import DltResource @@ -47,7 +47,7 @@ def init_replication( credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = False, included_columns: Optional[Dict[str, TColumnNames]] = None, - columns: Optional[Dict[str, TTableSchemaColumns]] = None, + table_hints: Optional[Dict[str, TTableSchema]] = None, reset: bool = False, ) -> Optional[List[DltResource]]: """Initializes replication for one, several, or all tables within a schema. 
@@ -129,14 +129,14 @@ def on_begin(conn: ConnectionSqla) -> None: if isinstance(table_names, str): table_names = [table_names] included_columns = included_columns or {} - columns = columns or {} + table_hints = table_hints or {} return [ _prepare_snapshot_resource( engine, table_name, schema, included_columns=included_columns.get(table_name), - columns=columns.get(table_name), + table_hints=table_hints.get(table_name), ) for table_name in table_names ] @@ -166,7 +166,7 @@ def _prepare_snapshot_resource( schema: str, *, included_columns: Optional[TColumnNames] = None, - columns: Optional[TTableSchemaColumns] = None, + table_hints: Optional[TTableSchema] = None, ) -> DltResource: t_rsrc: DltResource = sql_table( # type: ignore[name-defined] credentials=engine, @@ -174,11 +174,23 @@ def _prepare_snapshot_resource( schema=schema, included_columns=included_columns, ) - if columns: - t_rsrc.apply_hints(columns=columns) + if table_hints: + _apply_hints(t_rsrc, table_hints) return t_rsrc +def _apply_hints(resource: DltResource, table_hints: TTableSchema) -> None: + return resource.apply_hints( + table_name=table_hints.get("name"), + parent_table_name=table_hints.get("parent"), + write_disposition=table_hints.get("write_disposition"), + columns=table_hints.get("columns"), + schema_contract=table_hints.get("schema_contract"), + table_format=table_hints.get("table_format"), + file_format=table_hints.get("file_format"), + ) + + def cleanup_snapshot_resources(snapshots: List[DltResource]) -> None: """FIXME Awful hack to release the underlying SQL engine when snapshotting tables""" if not snapshots: @@ -332,7 +344,7 @@ def __init__( table_qnames: Set[str], target_batch_size: int = 1000, included_columns: Optional[Dict[str, TColumnNames]] = None, - columns: Optional[Dict[str, TTableSchemaColumns]] = None, + table_hints: Optional[Dict[str, TTableSchema]] = None, ) -> None: self.upto_lsn = upto_lsn self.table_qnames = table_qnames @@ -345,7 +357,7 @@ def __init__( if included_columns else {} ) - self.columns = columns or {} + self.table_hints = table_hints or {} self.consumed_all: bool = False # maps table names to list of data items @@ -416,7 +428,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: last_table_schema = self.last_table_schema.get(table_name) table_schema = infer_table_schema( msg, - column_hints=self.columns.get(table_name), + table_hints=self.table_hints.get(table_name), included_columns=self.included_columns.get(table_name), ) if last_table_schema is None: @@ -457,7 +469,7 @@ class ItemGenerator: start_lsn: int = 0 target_batch_size: int = 1000 included_columns: Optional[Dict[str, TColumnNames]] = None - columns: Optional[Dict[str, TTableSchemaColumns]] = None + table_hints: Optional[Dict[str, TTableSchema]] = None last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -481,7 +493,7 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: table_qnames=self.table_qnames, target_batch_size=self.target_batch_size, included_columns=self.included_columns, - columns=self.columns, + table_hints=self.table_hints, ) cur.consume_stream(consumer) except StopReplication: # completed batch or reached `upto_lsn` @@ -512,19 +524,12 @@ def flush( def infer_table_schema( msg: RowMessage, *, - column_hints: Optional[TTableSchemaColumns] = None, + table_hints: Optional[TTableSchema] = None, included_columns: Optional[Set[str]] = None, ) -> TTableSchema: """Infers the table schema from the replication message and optional hints""" columns: 
TTableSchemaColumns = { - col.column_name: ( - merge_column( - _to_dlt_column_schema(col, col_info), - column_hints.get(col.column_name), - ) - if column_hints and col.column_name in column_hints - else _to_dlt_column_schema(col, col_info) - ) + col.column_name: _to_dlt_column_schema(col, col_info) for col, col_info in zip(msg.new_tuple, msg.new_typeinfo) if not included_columns or col.column_name in included_columns } @@ -536,10 +541,21 @@ def infer_table_schema( "hard_delete": True, } - return { - "name": msg.table.split(".")[1], + table_name = msg.table.split(".")[1] + table_schema: TTableSchema = { + "name": table_name, "columns": columns, } + if table_hints: + table_hints["name"] = table_name + # FIXME I dont't know why I have to do this, but merge_table doesn't work right or I'm missing something + if col_hints := table_hints.get("columns"): + table_hints["columns"] = { + col_name: merge_column(columns[col_name], col_schema) + for col_name, col_schema in col_hints.items() + } + merge_table("decoderbufs", table_schema, table_hints) + return table_schema def gen_data_item( diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index df12e39bc..f4b9619a9 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -15,87 +15,81 @@ @pytest.mark.parametrize( - "data", + "data, table_hints, expected_schema", [ - { - "transactionId": 969, - "commitTime": "1728662646949062", - "table": "src_pl_dataset_202410110404048747_staging.tbl_y", - "op": "INSERT", - "newTuple": [ - {"columnName": "id_y", "columnType": "20", "datumInt64": 2}, - {"columnName": "val_y", "columnType": "16", "datumBool": False}, - { - "columnName": "_dlt_load_id", - "columnType": "1043", - "datumString": "1728662646.2657657", - }, - { - "columnName": "_dlt_id", - "columnType": "1043", - "datumString": "gGjifTMTAUs5ag", - }, - ], - "newTypeinfo": [ - {"modifier": "bigint", "valueOptional": False}, - {"modifier": "boolean", "valueOptional": True}, - {"modifier": "character varying", "valueOptional": False}, - {"modifier": "character varying", "valueOptional": False}, - ], - "oldTuple": [], - }, - ], -) -@pytest.mark.parametrize( - "column_hints", - [{"id_y": {"primary_key": True}}], -) -@pytest.mark.parametrize( - "expected_schema", - [ - { - "name": "tbl_y", - "columns": { - "id_y": { - "data_type": "bigint", - "precision": 64, - "name": "id_y", - "nullable": False, - "primary_key": True, - }, - "val_y": {"data_type": "bool", "name": "val_y", "nullable": True}, - "_dlt_load_id": { - "data_type": "text", - "name": "_dlt_load_id", - "nullable": False, - }, - "_dlt_id": { - "data_type": "text", - "name": "_dlt_id", - "nullable": False, - }, - "lsn": { - "data_type": "bigint", - "dedup_sort": "desc", - "nullable": True, - }, - "deleted_ts": { - "data_type": "timestamp", - "hard_delete": True, - "nullable": True, + ( + { + "transactionId": 969, + "commitTime": "1728662646949062", + "table": "src_pl_dataset_202410110404048747_staging.tbl_y", + "op": "INSERT", + "newTuple": [ + {"columnName": "id_y", "columnType": "20", "datumInt64": 2}, + {"columnName": "val_y", "columnType": "16", "datumBool": False}, + { + "columnName": "_dlt_load_id", + "columnType": "1043", + "datumString": "1728662646.2657657", + }, + { + "columnName": "_dlt_id", + "columnType": "1043", + "datumString": "gGjifTMTAUs5ag", + }, + ], + "newTypeinfo": [ + {"modifier": "bigint", "valueOptional": False}, + {"modifier": "boolean", "valueOptional": True}, + {"modifier": 
"character varying", "valueOptional": False}, + {"modifier": "character varying", "valueOptional": False}, + ], + "oldTuple": [], + }, + {"columns": {"id_y": {"primary_key": True}}}, + { + "name": "tbl_y", + "columns": { + "id_y": { + "data_type": "bigint", + "precision": 64, + "name": "id_y", + "nullable": False, + "primary_key": True, + }, + "val_y": {"data_type": "bool", "name": "val_y", "nullable": True}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "_dlt_id": { + "data_type": "text", + "name": "_dlt_id", + "nullable": False, + }, + "lsn": { + "data_type": "bigint", + "dedup_sort": "desc", + "nullable": True, + }, + "deleted_ts": { + "data_type": "timestamp", + "hard_delete": True, + "nullable": True, + }, }, }, - }, + ), ], ) def test_infer_table_schema( data, - column_hints: Optional[TTableSchemaColumns], + table_hints: Optional[TTableSchema], expected_schema: TTableSchema, ): row_msg = RowMessage() parse_dict(data, row_msg) - assert infer_table_schema(row_msg, column_hints=column_hints) == expected_schema + assert infer_table_schema(row_msg, table_hints=table_hints) == expected_schema LSN = random.randint(0, 10000) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index d47e0bb9a..7f6a61139 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -3,7 +3,7 @@ import dlt import pytest -from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns from dlt.destinations.job_client_impl import SqlJobClientBase from sources.pg_legacy_replication import replication_resource @@ -184,9 +184,9 @@ def tbl_y(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - columns={ - "tbl_x": {"id_x": {"primary_key": True}}, - "tbl_y": {"id_y": {"primary_key": True}}, + table_hints={ + "tbl_x": {"columns": {"id_x": {"primary_key": True}}}, + "tbl_y": {"columns": {"id_y": {"primary_key": True}}}, }, ) @@ -251,25 +251,27 @@ def items(data): src_pl.run(items(data)) add_pk(src_pl.sql_client, "items", "col1") + if give_hints: + column_schema["col1"]["primary_key"] = True + else: + column_schema = {"col1": {"primary_key": True}} + + table_hints: Dict[str, TTableSchema] = {"items": {"columns": column_schema}} + # initialize replication and create resources snapshot = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", take_snapshots=init_load, - columns={"items": column_schema} if give_hints else None, + table_hints=table_hints if give_hints else None, ) - if give_hints: - column_schema["col1"]["primary_key"] = True - else: - column_schema = {"col1": {"primary_key": True}} - changes = replication_resource( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", - columns={"items": column_schema}, + table_hints=table_hints, ) # initial load @@ -476,7 +478,7 @@ def tbl_z(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("init_load", [True, False]) -def test_column_hints( +def test_table_hints( src_config: Tuple[dlt.Pipeline, str], destination_name: str, init_load: bool ) -> None: @dlt.resource @@ -503,23 +505,25 @@ def tbl_z(data): ) # initialize replication and create resources - column_hints: Dict[str, TTableSchemaColumns] = { - "tbl_x": {"another_col_x": {"data_type": "double"}}, - "tbl_y": {"another_col_y": {"precision": 32}}, + 
table_hints: Dict[str, TTableSchema] = { + "tbl_x": {"columns": {"another_col_x": {"data_type": "double"}}}, + "tbl_y": {"columns": {"another_col_y": {"precision": 32}}}, # tbl_z is not specified, hence all columns should be included } + snapshots = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), take_snapshots=init_load, - columns=column_hints, + table_hints=table_hints, ) + changes = replication_resource( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), - columns=column_hints, + table_hints=table_hints, ) # update three postgres tables From f9cdf783143175df1f20b27f14bda741a831ad49 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 23 Oct 2024 03:08:50 +0200 Subject: [PATCH 32/88] wip: finally got rid of those errors --- sources/pg_legacy_replication/helpers.py | 36 ++++++++++++++---------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index a9ec3c1f9..94616b2e2 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -534,27 +534,33 @@ def infer_table_schema( if not included_columns or col.column_name in included_columns } - columns["lsn"] = {"data_type": "bigint", "nullable": True, "dedup_sort": "desc"} - columns["deleted_ts"] = { - "data_type": "timestamp", - "nullable": True, - "hard_delete": True, - } + # Add replication columns + columns["lsn"] = {"data_type": "bigint", "nullable": True} + columns["deleted_ts"] = {"data_type": "timestamp", "nullable": True} + + write_disposition = ( + table_hints.get("write_disposition", "append") if table_hints else "append" + ) + + if write_disposition not in ("replace", "append"): + columns["lsn"]["dedup_sort"] = "desc" + columns["deleted_ts"]["hard_delete"] = True + + schema, table = msg.table.split(".") + table_schema: TTableSchema = {"name": table, "columns": columns} - table_name = msg.table.split(".")[1] - table_schema: TTableSchema = { - "name": table_name, - "columns": columns, - } if table_hints: - table_hints["name"] = table_name + table_hints["name"] = table # FIXME I dont't know why I have to do this, but merge_table doesn't work right or I'm missing something - if col_hints := table_hints.get("columns"): - table_hints["columns"] = { + col_hints = table_hints.get("columns") + if col_hints: + col_hints = { col_name: merge_column(columns[col_name], col_schema) for col_name, col_schema in col_hints.items() + if not included_columns or col_name in included_columns } - merge_table("decoderbufs", table_schema, table_hints) + merge_table(schema, table_schema, table_hints) + return table_schema From 327b44cb306d092dad90859ab3788b7e4a9360c4 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 23 Oct 2024 09:08:04 +0200 Subject: [PATCH 33/88] wip: correcting failing tests --- sources/pg_legacy_replication/helpers.py | 24 ++++++------- tests/pg_legacy_replication/test_helpers.py | 3 +- .../test_pg_replication.py | 34 +++++++++++++------ 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 94616b2e2..1f302df2f 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -23,7 +23,7 @@ from dlt.common.schema.utils import merge_column, merge_table from dlt.common.typing import TDataItem from dlt.extract.items import DataItemWithMeta -from 
dlt.extract.resource import DltResource +from dlt.extract.resource import DltResource, TResourceHints from dlt.sources.credentials import ConnectionStringCredentials from psycopg2.extensions import cursor, connection as ConnectionExt from psycopg2.extras import ( @@ -175,14 +175,13 @@ def _prepare_snapshot_resource( included_columns=included_columns, ) if table_hints: - _apply_hints(t_rsrc, table_hints) + t_rsrc.merge_hints(_table_to_resource_hints(table_hints)) return t_rsrc -def _apply_hints(resource: DltResource, table_hints: TTableSchema) -> None: - return resource.apply_hints( +def _table_to_resource_hints(table_hints: TTableSchema) -> TResourceHints: + return dlt.mark.make_hints( table_name=table_hints.get("name"), - parent_table_name=table_hints.get("parent"), write_disposition=table_hints.get("write_disposition"), columns=table_hints.get("columns"), schema_contract=table_hints.get("schema_contract"), @@ -358,6 +357,10 @@ def __init__( else {} ) self.table_hints = table_hints or {} + if table_hints: + for table_schema in table_hints.values(): + if table_schema.get("columns") is None: + table_schema["columns"] = {} self.consumed_all: bool = False # maps table names to list of data items @@ -507,14 +510,11 @@ def flush( ) -> Iterator[Union[TDataItem, DataItemWithMeta]]: self.last_commit_lsn = consumer.last_commit_lsn for table_name, data_items in consumer.data_items.items(): - table_schema = consumer.last_table_schema.get(table_name) - if table_schema: + if table_schema := consumer.last_table_schema.get(table_name): assert table_name == table_schema["name"] - yield dlt.mark.with_hints( # meta item with column hints only, no data - [], - dlt.mark.make_hints( - table_name=table_name, columns=table_schema["columns"] - ), + yield dlt.mark.with_hints( + [], # meta item with column hints only, no data + _table_to_resource_hints(table_schema), create_table_variant=True, ) yield dlt.mark.with_table_name(data_items, table_name) diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index f4b9619a9..84a2b651e 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -45,7 +45,7 @@ ], "oldTuple": [], }, - {"columns": {"id_y": {"primary_key": True}}}, + {"columns": {"id_y": {"primary_key": True}}, "write_disposition": "merge"}, { "name": "tbl_y", "columns": { @@ -78,6 +78,7 @@ "nullable": True, }, }, + "write_disposition": "merge", }, ), ], diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 7f6a61139..cbd5b17b5 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -54,6 +54,10 @@ def tbl_y(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), + table_hints={ + "tbl_x": {"write_disposition": "merge"}, + "tbl_y": {"write_disposition": "merge"}, + }, ) src_pl.run( @@ -78,7 +82,7 @@ def tbl_y(data): cleanup_snapshot_resources(snapshots) # process changes - info = dest_pl.run(changes, write_disposition="merge") + info = dest_pl.run(changes) assert_load_info(info) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} exp_tbl_x = [ @@ -94,7 +98,7 @@ def tbl_y(data): src_pl.run(tbl_y({"id_y": 3, "val_y": True})) # process changes - info = dest_pl.run(changes, write_disposition="merge") + info = dest_pl.run(changes) assert_load_info(info) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, 
"tbl_y": 3} exp_tbl_y = [ @@ -113,7 +117,7 @@ def tbl_y(data): c.execute_sql(f"UPDATE {qual_name} SET val_y = false WHERE id_y = 1;") # process changes - info = dest_pl.run(changes, write_disposition="merge") + info = dest_pl.run(changes) assert_load_info(info) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} exp_tbl_x = [ @@ -185,8 +189,14 @@ def tbl_y(data): schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), table_hints={ - "tbl_x": {"columns": {"id_x": {"primary_key": True}}}, - "tbl_y": {"columns": {"id_y": {"primary_key": True}}}, + "tbl_x": { + "columns": {"id_x": {"primary_key": True}}, + "write_disposition": "merge", + }, + "tbl_y": { + "columns": {"id_y": {"primary_key": True}}, + "write_disposition": "merge", + }, }, ) @@ -203,7 +213,7 @@ def tbl_y(data): dest_pl = dlt.pipeline( pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) - info = dest_pl.run(changes, write_disposition="merge") + info = dest_pl.run(changes) assert_load_info(info) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 1} exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] @@ -217,7 +227,7 @@ def tbl_y(data): c.execute_sql(f"DELETE FROM {qual_name} WHERE id_x = 2;") # process change and assert expectations - info = dest_pl.run(changes, write_disposition="merge") + info = dest_pl.run(changes) assert_load_info(info) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} exp_tbl_x = [{"id_x": 3, "val_x": "baz"}] @@ -256,7 +266,9 @@ def items(data): else: column_schema = {"col1": {"primary_key": True}} - table_hints: Dict[str, TTableSchema] = {"items": {"columns": column_schema}} + table_hints: Dict[str, TTableSchema] = { + "items": {"columns": column_schema, "write_disposition": "merge"} + } # initialize replication and create resources snapshot = init_replication( @@ -290,7 +302,7 @@ def items(data): r2["col1"] = 2 src_pl.run(items([r1, r2])) - info = dest_pl.run(changes, write_disposition="merge") + info = dest_pl.run(changes) assert_load_info(info) assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 @@ -314,7 +326,7 @@ def items(data): src_pl.run(items([r1, r2])) # process changes and assert expectations - info = dest_pl.run(changes, write_disposition="merge") + info = dest_pl.run(changes) assert_load_info(info) assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 exp = [ @@ -336,7 +348,7 @@ def items(data): c.execute_sql(f"UPDATE {qual_name} SET col2 = 2.5 WHERE col1 = 2;") # process change and assert expectation - info = dest_pl.run(changes, write_disposition="merge") + info = dest_pl.run(changes) assert_load_info(info) assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 exp = [{"col1": 2, "col2": 2.5, "col3": False}] From a9a9bb784bad1931af8a11a65f0cdba25711f156 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 23 Oct 2024 10:04:46 +0200 Subject: [PATCH 34/88] wip: fixed working examples --- sources/pg_legacy_replication_pipeline.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index 6db9c63fb..0fd6abc51 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -46,6 +46,12 @@ def replicate_single_table() -> None: slot_name=slot_name, schema=src_pl.dataset_name, table_names="my_source_table", + table_hints={ + "my_source_table": { + 
"columns": {"id": {"primary_key": True}}, + "write_disposition": "merge", + }, + }, ) # insert two records in source table and propagate changes to destination @@ -97,7 +103,7 @@ def replicate_with_initial_load() -> None: slot_name=slot_name, schema=src_pl.dataset_name, table_names="my_source_table", - take_snapshots=True, # persist snapshot table(s) and let function return resource(s) for initial load + take_snapshots=True, # let function return resource(s) for initial load reset=True, ) @@ -205,7 +211,7 @@ def replicate_with_column_selection() -> None: init_replication( # requires the Postgres user to have the REPLICATION attribute assigned slot_name=slot_name, schema=src_pl.dataset_name, - table_names=["tbl_x", "tbl_y"], + table_names=("tbl_x", "tbl_y"), reset=True, ) @@ -282,8 +288,8 @@ def show_destination_table( ) -> None: with dest_pl.sql_client() as c: dest_qual_name = c.make_qualified_table_name(table_name) - dest_records = c.execute_sql(f"SELECT {column_names} FROM {dest_qual_name};") - print(table_name, ":", dest_records) + with c.execute_query(f"SELECT {column_names} FROM {dest_qual_name}") as curr: + print(table_name, ":\n", curr.df()) if __name__ == "__main__": From 37acc351a7927d20c712d9e071db2bbc3bf0715a Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 23 Oct 2024 12:47:03 +0200 Subject: [PATCH 35/88] wip: more refactoring now docs... -_- --- sources/pg_legacy_replication/__init__.py | 5 +- sources/pg_legacy_replication/helpers.py | 134 ++++++++++---------- tests/pg_legacy_replication/test_helpers.py | 31 ++--- 3 files changed, 79 insertions(+), 91 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index b674333d6..5f4fd150b 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -27,9 +27,8 @@ def replication_resource( ) -> Iterable[Union[TDataItem, DataItemWithMeta]]: """Resource yielding data items for changes in one or more postgres tables. - - Relies on a replication slot and publication that publishes DML operations - (i.e. `insert`, `update`, and/or `delete`). Helper `init_replication` can be - used to set this up. + - Relies on a replication slot that publishes DML operations + (i.e. `insert`, `update`, and `delete`). - Maintains LSN of last consumed message in state to track progress. - At start of the run, advances the slot upto last consumed message in previous run. - Processes in batches to limit memory usage. 
diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 1f302df2f..14b5ee541 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -348,19 +348,9 @@ def __init__( self.upto_lsn = upto_lsn self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.included_columns = ( - { - table_name: _normalize_included_columns(columns) - for table_name, columns in included_columns.items() - } - if included_columns - else {} - ) - self.table_hints = table_hints or {} - if table_hints: - for table_schema in table_hints.values(): - if table_schema.get("columns") is None: - table_schema["columns"] = {} + + self.included_columns = self._normalize_columns(included_columns) + self.table_hints = self._normalize_hints(table_hints) self.consumed_all: bool = False # maps table names to list of data items @@ -387,20 +377,15 @@ def process_msg(self, msg: ReplicationMessage) -> None: row_msg = RowMessage() try: row_msg.ParseFromString(msg.payload) - op = row_msg.op - lsn = msg.data_start - if op == Op.BEGIN: - self.last_commit_ts = _epoch_micros_to_datetime(row_msg.commit_time) - elif op == Op.COMMIT: - self.process_commit(lsn) - elif op == Op.INSERT: - self.process_change(row_msg, lsn) - elif op == Op.UPDATE: - self.process_change(row_msg, lsn) - elif op == Op.DELETE: - self.process_delete(row_msg, lsn) - else: + if row_msg.op == Op.UNKNOWN: raise AssertionError(f"Unsupported operation : {row_msg}") + + if row_msg.op == Op.BEGIN: + self.last_commit_ts = _epoch_micros_to_datetime(row_msg.commit_time) + elif row_msg.op == Op.COMMIT: + self.process_commit(msg.data_start) + else: # INSERT, UPDATE or DELETE + self.process_change(row_msg, msg.data_start) except StopReplication: raise except Exception: @@ -424,42 +409,59 @@ def process_commit(self, lsn: int) -> None: raise StopReplication def process_change(self, msg: RowMessage, lsn: int) -> None: - """Processes replication message of type Insert or Update""" + """Processes replication message of type Insert, Update or Delete""" if msg.table not in self.table_qnames: return - _, table_name = msg.table.split(".") + table_name = msg.table.split(".")[1] + if msg.op == Op.DELETE: + data_item = gen_data_item(msg) + else: + table_schema = self._get_table_schema(msg) + data_item = gen_data_item( + msg, + included_columns=self.included_columns.get(table_name), + column_schema=table_schema["columns"], + ) + data_item["lsn"] = lsn + self.data_items[table_name].append(data_item) + + def _get_table_schema(self, msg: RowMessage) -> TTableSchema: + table_name = msg.table.split(".")[1] last_table_schema = self.last_table_schema.get(table_name) table_schema = infer_table_schema( msg, - table_hints=self.table_hints.get(table_name), included_columns=self.included_columns.get(table_name), + table_hints=self.table_hints.get(table_name), ) if last_table_schema is None: self.last_table_schema[table_name] = table_schema elif last_table_schema != table_schema: raise StopReplication # table schema change + return table_schema + + @staticmethod + def _normalize_columns( + included_columns: Optional[Dict[str, TColumnNames]] + ) -> Dict[str, Set[str]]: + if not included_columns: + return {} + return { + table_name: { + col for col in ([columns] if isinstance(columns, str) else columns) + } + for table_name, columns in included_columns.items() + } - data_item = gen_data_item( - msg.new_tuple, - column_schema=table_schema["columns"], - 
included_columns=self.included_columns.get(table_name), - ) - data_item["lsn"] = lsn - self.data_items[table_name].append(data_item) - - def process_delete(self, msg: RowMessage, lsn: int) -> None: - """Processes replication message of type Delete""" - if msg.table not in self.table_qnames: - return - _, table_name = msg.table.split(".") - data_item = gen_data_item( - msg.old_tuple, - for_delete=True, - included_columns=self.included_columns.get(table_name), - ) - data_item["lsn"] = lsn - data_item["deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) - self.data_items[table_name].append(data_item) + @staticmethod + def _normalize_hints( + table_hints: Optional[Dict[str, TTableSchema]] + ) -> Dict[str, TTableSchema]: + """Normalize the hints by ensuring that each table schema has a 'columns' TTableSchemaColumns.""" + if not table_hints: + return {} + for table_schema in table_hints.values(): + table_schema.setdefault("columns", {}) + return table_hints @dataclass @@ -524,8 +526,8 @@ def flush( def infer_table_schema( msg: RowMessage, *, - table_hints: Optional[TTableSchema] = None, included_columns: Optional[Set[str]] = None, + table_hints: Optional[TTableSchema] = None, ) -> TTableSchema: """Infers the table schema from the replication message and optional hints""" columns: TTableSchemaColumns = { @@ -565,34 +567,30 @@ def infer_table_schema( def gen_data_item( - row: Sequence[DatumMessage], + msg: RowMessage, *, included_columns: Optional[Set[str]] = None, column_schema: Optional[TTableSchemaColumns] = None, - for_delete: bool = False, ) -> TDataItem: - """Generates data item from a row and corresponding metadata.""" + """Generates data item from a row message and corresponding metadata.""" data_item: TDataItem = {} + if msg.op != Op.DELETE: + row = msg.new_tuple + else: + row = msg.old_tuple + data_item["deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) for data in row: col_name = data.column_name if included_columns and col_name not in included_columns: continue data_type = ( - column_schema[col_name]["data_type"] if column_schema else data.column_type + column_schema[col_name]["data_type"] + if column_schema and column_schema.get(col_name) + else data.column_type + ) + data_item[col_name] = _to_dlt_val( + data, data_type, for_delete=msg.op == Op.DELETE ) - data_item[col_name] = _to_dlt_val(data, data_type, for_delete=for_delete) return data_item - - -def _normalize_included_columns( - included_columns: Optional[TColumnNames], -) -> Optional[Set[str]]: - if included_columns is None: - return None - return ( - {included_columns} - if isinstance(included_columns, str) - else set(included_columns) - ) diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 84a2b651e..4cef1523f 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -1,9 +1,8 @@ -import random from typing import Optional import pendulum import pytest -from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns +from dlt.common.schema.typing import TTableSchema from dlt.common.typing import TDataItem from google.protobuf.json_format import ParseDict as parse_dict @@ -93,9 +92,6 @@ def test_infer_table_schema( assert infer_table_schema(row_msg, table_hints=table_hints) == expected_schema -LSN = random.randint(0, 10000) - - @pytest.mark.parametrize( "data, data_item", [ @@ -205,18 +201,6 @@ def test_infer_table_schema( "col11": pendulum.parse("13:26:45.176451", strict=False).time(), }, ), - ], -) 
-def test_gen_data_item(data, data_item: TDataItem): - row_msg = RowMessage() - parse_dict(data, row_msg) - column_schema = infer_table_schema(row_msg)["columns"] - assert gen_data_item(row_msg.new_tuple, column_schema=column_schema) == data_item - - -@pytest.mark.parametrize( - "data, data_item", - [ ( { "transactionId": 932, @@ -243,11 +227,18 @@ def test_gen_data_item(data, data_item: TDataItem): }, ], }, - {"id_x": 1, "val_x": "", "_dlt_load_id": "", "_dlt_id": ""}, + { + "id_x": 1, + "val_x": "", + "_dlt_load_id": "", + "_dlt_id": "", + "deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), + }, ), ], ) -def test_gen_delete_item(data, data_item: TDataItem): +def test_gen_data_item(data, data_item: TDataItem): row_msg = RowMessage() parse_dict(data, row_msg) - assert gen_data_item(row_msg.old_tuple, for_delete=True) == data_item + column_schema = infer_table_schema(row_msg)["columns"] + assert gen_data_item(row_msg, column_schema=column_schema) == data_item From a90acee3f1515ed127c41f1ae839371c5fc03f74 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 23 Oct 2024 14:06:31 +0200 Subject: [PATCH 36/88] wip: cleaning up --- pyproject.toml | 3 +- .../pg_legacy_replication/requirements.txt | 2 +- sources/pg_legacy_replication/schema_types.py | 34 ++++++++----------- sources/pg_replication/helpers.py | 16 --------- 4 files changed, 18 insertions(+), 37 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 60f6a965c..5ebf64bd4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,8 +59,9 @@ connectorx = ">=0.3.1" psycopg2-binary = ">=2.9.9" [tool.poetry.group.pg_legacy_replication.dependencies] -psycopg2-binary = ">=2.9.9" protobuf = ">=4.25" +psycopg2-binary = ">=2.9.9" +sqlalchemy = ">=1.4" [tool.poetry.group.google_sheets.dependencies] google-api-python-client = "^2.78.0" diff --git a/sources/pg_legacy_replication/requirements.txt b/sources/pg_legacy_replication/requirements.txt index 432a62270..1ad8e04f0 100644 --- a/sources/pg_legacy_replication/requirements.txt +++ b/sources/pg_legacy_replication/requirements.txt @@ -1,4 +1,4 @@ -dlt>=0.4.13 +dlt>=0.5.12 psycopg2-binary>=2.9.9 protobuf>=5 sqlalchemy>=1.4 \ No newline at end of file diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 7c84b09e3..c376d10fb 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -7,6 +7,7 @@ from dlt.common.data_types.type_helpers import coerce_value from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import TColumnSchema, TColumnType +from dlt.destinations import postgres from .pg_logicaldec_pb2 import DatumMessage, TypeInfo @@ -86,29 +87,24 @@ def _get_precision_and_scale( return (None, None) -@lru_cache(maxsize=None) -def _type_mapper() -> Any: - from dlt.destinations import postgres - - try: - from dlt.destinations.impl.postgres.factory import PostgresTypeMapper # type: ignore - except ImportError: - from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper - - return PostgresTypeMapper(postgres().capabilities()) - - # FIXME Hack to get it to work with 0.5.x and 1.x def _from_destination_type( db_type: str, precision: Optional[int] = None, scale: Optional[int] = None ) -> TColumnType: - mapper = _type_mapper() - from_db_type: Callable[[str, Optional[int], Optional[int]], TColumnType] - if hasattr(mapper, "from_destination_type"): - from_db_type = mapper.from_destination_type - else: - from_db_type = 
mapper.from_db_type - return from_db_type(db_type, precision, scale) + @lru_cache(maxsize=None) + def _from_db_type() -> Callable[[str, Optional[int], Optional[int]], TColumnType]: + try: + from dlt.destinations.impl.postgres.factory import PostgresTypeMapper # type: ignore + + type_mapper = PostgresTypeMapper(postgres().capabilities()) + return type_mapper.from_destination_type # type: ignore[no-any-return] + except ImportError: + from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper + + type_mapper = PostgresTypeMapper(postgres().capabilities()) + return type_mapper.from_db_type # type: ignore[no-any-return] + + return _from_db_type()(db_type, precision, scale) def _to_dlt_column_type(type_id: int, modifier: str) -> TColumnType: diff --git a/sources/pg_replication/helpers.py b/sources/pg_replication/helpers.py index 99acc3911..3b29f79c3 100644 --- a/sources/pg_replication/helpers.py +++ b/sources/pg_replication/helpers.py @@ -573,9 +573,6 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: self.generated_all = consumer.consumed_all -from devtools import debug - - class MessageConsumer: """Consumes messages from a ReplicationCursor sequentially. @@ -679,8 +676,6 @@ def process_relation(self, decoded_msg: Relation) -> None: "columns": columns, } - debug(self.last_table_schema[decoded_msg.relation_id]) - # apply user input # 1) exclude columns include_columns = ( @@ -723,15 +718,6 @@ def process_relation(self, decoded_msg: Relation) -> None: ), create_table_variant=True, ) - debug(decoded_msg) - debug( - { - "type": "_meta_item", - "table_name": table_name, - "write_disposition": write_disposition, - "columns": columns, - } - ) self.data_items[decoded_msg.relation_id] = [meta_item] def process_change( @@ -759,8 +745,6 @@ def process_change( ), ) self.data_items[decoded_msg.relation_id].append(data_item) - debug(decoded_msg) - debug(data_item) @staticmethod def gen_data_item( From f927f1316d33986a9fa41edf43be92c01a3ec696 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 23 Oct 2024 15:05:30 +0200 Subject: [PATCH 37/88] wip: cleaning up --- sources/pg_legacy_replication/helpers.py | 18 ++++++++++-------- .../test_pg_replication.py | 3 +++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 14b5ee541..f502f86cc 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -510,16 +510,18 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: def flush( self, consumer: MessageConsumer ) -> Iterator[Union[TDataItem, DataItemWithMeta]]: - self.last_commit_lsn = consumer.last_commit_lsn - for table_name, data_items in consumer.data_items.items(): - if table_schema := consumer.last_table_schema.get(table_name): - assert table_name == table_schema["name"] + """Flushes the batches of data items generated by MessageConsumer.""" + for table, items in consumer.data_items.items(): + # Retrieve the table schema if it exists (never for DELETEs) + schema = consumer.last_table_schema.get(table) + if schema: yield dlt.mark.with_hints( - [], # meta item with column hints only, no data - _table_to_resource_hints(table_schema), - create_table_variant=True, + [], _table_to_resource_hints(schema), create_table_variant=True ) - yield dlt.mark.with_table_name(data_items, table_name) + + yield dlt.mark.with_table_name(items, table) + + self.last_commit_lsn = consumer.last_commit_lsn self.generated_all = 
consumer.consumed_all diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index cbd5b17b5..3a537c2b8 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -294,6 +294,7 @@ def items(data): info = dest_pl.run(snapshot) assert_load_info(info) assert load_table_counts(dest_pl, "items")["items"] == 1 + cleanup_snapshot_resources(snapshot) # insert two records in postgres table r1 = deepcopy(data) @@ -476,6 +477,7 @@ def tbl_z(data): assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x"} assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y"} assert get_cols(dest_pl, "tbl_z") == {"id_z", "val_z", "another_col_z"} + cleanup_snapshot_resources(snapshots) dest_pl.run(changes) assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x", "lsn", "deleted_ts"} assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y", "lsn", "deleted_ts"} @@ -571,6 +573,7 @@ def tbl_z(data): ] == "bigint" ) + cleanup_snapshot_resources(snapshots) dest_pl.run(changes) assert ( dest_pl.default_schema.get_table_columns("tbl_x")["another_col_x"]["data_type"] From cc7ad615b73cbc3fb5b470e4340f6bd16967bc7e Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 24 Oct 2024 17:36:03 +0200 Subject: [PATCH 38/88] wip: attempting to refactor to use dlt resources --- sources/pg_legacy_replication/__init__.py | 60 +++++++++++++++++-- sources/pg_legacy_replication/helpers.py | 47 +++++++++++---- .../test_pg_replication.py | 18 +++--- tests/utils.py | 16 +++-- 4 files changed, 108 insertions(+), 33 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 5f4fd150b..0af455574 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -4,11 +4,11 @@ import dlt from dlt.common.schema.typing import TColumnNames, TTableSchema -from dlt.common.typing import TDataItem -from dlt.extract.items import DataItemWithMeta +from dlt.extract import DltResource +from dlt.extract.items import DataItemWithMeta, TDataItem from dlt.sources.credentials import ConnectionStringCredentials -from .helpers import advance_slot, get_max_lsn, ItemGenerator +from .helpers import advance_slot, get_max_lsn, ItemGenerator, table_wal_handler @dlt.resource( @@ -27,8 +27,7 @@ def replication_resource( ) -> Iterable[Union[TDataItem, DataItemWithMeta]]: """Resource yielding data items for changes in one or more postgres tables. - - Relies on a replication slot that publishes DML operations - (i.e. `insert`, `update`, and `delete`). + - Relies on a replication slot that publishes DML operations (i.e. `insert`, `update`, and `delete`). - Maintains LSN of last consumed message in state to track progress. - At start of the run, advances the slot upto last consumed message in previous run. - Processes in batches to limit memory usage. 
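This refactor starts routing the WAL stream through per-table dlt transformers: one reader resource yields a tuple per table batch, and each table's transformer keeps only its own rows, which is what the `replication_source` added below does via `table_wal_handler`. A simplified, self-contained sketch of that fan-out pattern, with toy data and illustrative names only:

import dlt

@dlt.resource
def wal_reader():
    # stand-in for the replication slot reader; yields one (table, batch) tuple at a time
    yield ("tbl_x", [{"id_x": 1, "val_x": "foo"}])
    yield ("tbl_y", [{"id_y": 1, "val_y": True}])

def dispatch_to(table):
    def handle(item):
        table_name, batch = item
        if table_name == table:
            yield from batch  # rows land in the table named after the resource
    return handle

tbl_x = dlt.transformer(dispatch_to("tbl_x"), data_from=wal_reader, name="tbl_x")
tbl_y = dlt.transformer(dispatch_to("tbl_y"), data_from=wal_reader, name="tbl_y")

pipeline = dlt.pipeline(pipeline_name="wal_fanout_demo", destination="duckdb")
print(pipeline.run([tbl_x, tbl_y]))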
@@ -104,3 +103,54 @@ def replication_resource( dlt.current.resource_state()["last_commit_lsn"] = gen.last_commit_lsn break start_lsn = gen.last_commit_lsn + + +@dlt.source +def replication_source( + slot_name: str, + schema: str, + table_names: Union[str, Sequence[str]], + credentials: ConnectionStringCredentials = dlt.secrets.value, + included_columns: Optional[Dict[str, TColumnNames]] = None, + table_hints: Optional[Dict[str, TTableSchema]] = None, + target_batch_size: int = 1000, + flush_slot: bool = True, +) -> Sequence[DltResource]: + resources = [] + + wal_reader = replication_resource( + slot_name=slot_name, + schema=schema, + table_names=table_names, + target_batch_size=target_batch_size, + flush_slot=flush_slot, + ) + + for table in table_names: + xformer = dlt.transformer( + table_wal_handler(table), + data_from=wal_reader, + name=table, + table_name=table, + write_disposition=( + table_hints.get(table).get("write_disposition") if table_hints else None + ), + columns=table_hints.get(table).get("columns") if table_hints else None, + primary_key=None, + merge_key=None, + schema_contract=table_hints.get(table).get("schema_contract") + if table_hints + else None, + table_format=table_hints.get(table).get("table_format") + if table_hints + else None, + file_format=table_hints.get(table).get("file_format") + if table_hints + else None, + selected=True, + spec=None, + parallelized=False, + ) + resources.append(xformer) + + return resources diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index f502f86cc..e218f6d84 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -9,6 +9,10 @@ List, Sequence, Any, + Iterable, + Tuple, + Callable, + TypedDict, ) import dlt @@ -21,7 +25,7 @@ TTableSchemaColumns, ) from dlt.common.schema.utils import merge_column, merge_table -from dlt.common.typing import TDataItem +from dlt.common.typing import TDataItem, TDataItems from dlt.extract.items import DataItemWithMeta from dlt.extract.resource import DltResource, TResourceHints from dlt.sources.credentials import ConnectionStringCredentials @@ -515,16 +519,33 @@ def flush( # Retrieve the table schema if it exists (never for DELETEs) schema = consumer.last_table_schema.get(table) if schema: - yield dlt.mark.with_hints( - [], _table_to_resource_hints(schema), create_table_variant=True - ) - - yield dlt.mark.with_table_name(items, table) + yield (table, schema) + yield (table, items) self.last_commit_lsn = consumer.last_commit_lsn self.generated_all = consumer.consumed_all +def table_wal_handler( + table: str, +) -> Callable[[TDataItem], Iterable[DataItemWithMeta]]: + def handle( + schema_or_batch: Tuple[str, Union[TTableSchema, List[TDataItem]]] + ) -> Iterable[DataItemWithMeta]: + table_name, items = schema_or_batch + if table_name != table: + return + if isinstance(items, Dict): + schema: TTableSchema = items + yield dlt.mark.with_hints( + [], _table_to_resource_hints(schema), create_table_variant=True + ) + else: + yield dlt.mark.with_table_name(items, table) + + return handle + + def infer_table_schema( msg: RowMessage, *, @@ -542,13 +563,13 @@ def infer_table_schema( columns["lsn"] = {"data_type": "bigint", "nullable": True} columns["deleted_ts"] = {"data_type": "timestamp", "nullable": True} - write_disposition = ( - table_hints.get("write_disposition", "append") if table_hints else "append" - ) - - if write_disposition not in ("replace", "append"): - columns["lsn"]["dedup_sort"] = "desc" - 
columns["deleted_ts"]["hard_delete"] = True + # write_disposition = ( + # table_hints.get("write_disposition", "append") if table_hints else "append" + # ) + # + # FIXME if write_disposition not in ("replace", "append"): + columns["lsn"]["dedup_sort"] = "desc" + columns["deleted_ts"]["hard_delete"] = True schema, table = msg.table.split(".") table_schema: TTableSchema = {"name": table, "columns": columns} diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 3a537c2b8..7afc17e55 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -6,7 +6,7 @@ from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns from dlt.destinations.job_client_impl import SqlJobClientBase -from sources.pg_legacy_replication import replication_resource +from sources.pg_legacy_replication import replication_resource, replication_source from sources.pg_legacy_replication.helpers import ( init_replication, cleanup_snapshot_resources, @@ -50,15 +50,13 @@ def tbl_y(data): take_snapshots=True, ) - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - table_hints={ - "tbl_x": {"write_disposition": "merge"}, - "tbl_y": {"write_disposition": "merge"}, - }, ) + changes.tbl_x.apply_hints(write_disposition="merge", primary_key="id_x") + changes.tbl_y.apply_hints(write_disposition="merge", primary_key="id_y") src_pl.run( [ @@ -83,7 +81,7 @@ def tbl_y(data): # process changes info = dest_pl.run(changes) - assert_load_info(info) + assert_load_info(info, expected_load_packages=2) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 2} exp_tbl_x = [ {"id_x": 1, "val_x": "foo"}, @@ -99,7 +97,7 @@ def tbl_y(data): # process changes info = dest_pl.run(changes) - assert_load_info(info) + assert_load_info(info, expected_load_packages=2) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} exp_tbl_y = [ {"id_y": 1, "val_y": True}, @@ -118,7 +116,7 @@ def tbl_y(data): # process changes info = dest_pl.run(changes) - assert_load_info(info) + assert_load_info(info, expected_load_packages=2) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 3, "tbl_y": 3} exp_tbl_x = [ {"id_x": 1, "val_x": "foo_updated"}, @@ -140,7 +138,7 @@ def tbl_y(data): # process changes info = dest_pl.run(changes) - assert_load_info(info) + assert_load_info(info, expected_load_packages=2) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 2, "tbl_y": 3} exp_tbl_x = [{"id_x": 2, "val_x": "bar"}, {"id_x": 3, "val_x": "baz"}] exp_tbl_y = [ diff --git a/tests/utils.py b/tests/utils.py index be845765b..9ba13f974 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -228,11 +228,17 @@ def assert_query_data( def assert_load_info(info: LoadInfo, expected_load_packages: int = 1) -> None: """Asserts that expected number of packages was loaded and there are no failed jobs""" - assert len(info.loads_ids) == expected_load_packages - # all packages loaded - assert all(package.state == "loaded" for package in info.load_packages) is True - # no failed jobs in any of the packages - info.raise_on_failed_jobs() + try: + assert len(info.loads_ids) == expected_load_packages + # all packages loaded + assert all(package.state == "loaded" for package in info.load_packages) is True + # no failed jobs in any of the packages + info.raise_on_failed_jobs() + except AssertionError: + 
from devtools import debug + + debug(info) + raise def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: From f9c7694ecafdfed7426c40fb01b8272d81d4ca4a Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 24 Oct 2024 17:53:16 +0200 Subject: [PATCH 39/88] wip: second test passing --- tests/pg_legacy_replication/test_pg_replication.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 7afc17e55..5c4ed21fa 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -182,21 +182,13 @@ def tbl_y(data): table_names=("tbl_x", "tbl_y"), ) - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - table_hints={ - "tbl_x": { - "columns": {"id_x": {"primary_key": True}}, - "write_disposition": "merge", - }, - "tbl_y": { - "columns": {"id_y": {"primary_key": True}}, - "write_disposition": "merge", - }, - }, ) + changes.tbl_x.apply_hints(write_disposition="merge", primary_key="id_x") + changes.tbl_y.apply_hints(write_disposition="merge", primary_key="id_y") # change postgres table after replication has been initialized # these records should be in the replicated table From 927ae037f45c851c759e91b5536789aaefcf969a Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 25 Oct 2024 15:06:58 +0200 Subject: [PATCH 40/88] wip: all tests pass again now for refactoring --- sources/pg_legacy_replication/__init__.py | 137 +++++++----------- sources/pg_legacy_replication/helpers.py | 17 +-- .../test_pg_replication.py | 37 +++-- 3 files changed, 82 insertions(+), 109 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 0af455574..8f8f5ef85 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -11,11 +11,8 @@ from .helpers import advance_slot, get_max_lsn, ItemGenerator, table_wal_handler -@dlt.resource( - name=lambda args: args["slot_name"] + "_" + args["schema"], - standalone=True, -) -def replication_resource( +@dlt.source +def replication_source( slot_name: str, schema: str, table_names: Union[str, Sequence[str]], @@ -24,8 +21,8 @@ def replication_resource( table_hints: Optional[Dict[str, TTableSchema]] = None, target_batch_size: int = 1000, flush_slot: bool = True, -) -> Iterable[Union[TDataItem, DataItemWithMeta]]: - """Resource yielding data items for changes in one or more postgres tables. +) -> Sequence[DltResource]: + """Source yielding data items for changes in one or more postgres tables. - Relies on a replication slot that publishes DML operations (i.e. `insert`, `update`, and `delete`). - Maintains LSN of last consumed message in state to track progress. @@ -69,88 +66,52 @@ def replication_resource( Yields: Data items for changes published in the publication. 
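With the standalone `replication_resource` collapsed into `replication_source` below, the tests in this series stop passing `table_hints` and instead select the per-table resources from the source and apply hints on them directly. A short usage sketch mirroring that pattern (slot, schema, and destination values are placeholders):

import dlt
from pg_legacy_replication import replication_source

changes = replication_source(
    slot_name="dlt_slot",
    schema="my_schema",
    table_names=("tbl_x", "tbl_y"),
)
# hints are applied per table resource instead of being passed as table_hints
changes.tbl_x.apply_hints(write_disposition="merge", primary_key="id_x")
changes.tbl_y.apply_hints(write_disposition="merge", primary_key="id_y")

dest_pl = dlt.pipeline(pipeline_name="dest_pl", destination="duckdb", dev_mode=True)
dest_pl.run(changes)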
- """ - # start where we left off in previous run - start_lsn = dlt.current.resource_state().get("last_commit_lsn", 0) - if flush_slot: - advance_slot(start_lsn, slot_name, credentials) - - # continue until last message in replication slot - options: Dict[str, str] = {} - upto_lsn = get_max_lsn(slot_name, credentials) - if upto_lsn is None: - return - + """ if isinstance(table_names, str): table_names = [table_names] - table_qnames = {f"{schema}.{table_name}" for table_name in table_names} - - # generate items in batches - while True: - gen = ItemGenerator( - credentials=credentials, - slot_name=slot_name, - table_qnames=table_qnames, - options=options, - upto_lsn=upto_lsn, - start_lsn=start_lsn, - target_batch_size=target_batch_size, - included_columns=included_columns, - table_hints=table_hints, - ) - yield from gen - if gen.generated_all: - dlt.current.resource_state()["last_commit_lsn"] = gen.last_commit_lsn - break - start_lsn = gen.last_commit_lsn - - -@dlt.source -def replication_source( - slot_name: str, - schema: str, - table_names: Union[str, Sequence[str]], - credentials: ConnectionStringCredentials = dlt.secrets.value, - included_columns: Optional[Dict[str, TColumnNames]] = None, - table_hints: Optional[Dict[str, TTableSchema]] = None, - target_batch_size: int = 1000, - flush_slot: bool = True, -) -> Sequence[DltResource]: - resources = [] - wal_reader = replication_resource( - slot_name=slot_name, - schema=schema, - table_names=table_names, - target_batch_size=target_batch_size, - flush_slot=flush_slot, - ) - - for table in table_names: - xformer = dlt.transformer( - table_wal_handler(table), - data_from=wal_reader, - name=table, - table_name=table, - write_disposition=( - table_hints.get(table).get("write_disposition") if table_hints else None - ), - columns=table_hints.get(table).get("columns") if table_hints else None, - primary_key=None, - merge_key=None, - schema_contract=table_hints.get(table).get("schema_contract") - if table_hints - else None, - table_format=table_hints.get(table).get("table_format") - if table_hints - else None, - file_format=table_hints.get(table).get("file_format") - if table_hints - else None, - selected=True, - spec=None, - parallelized=False, + @dlt.resource(name=lambda args: args["slot_name"], standalone=True) + def replication_resource( + slot_name: str, + ) -> Iterable[Union[TDataItem, DataItemWithMeta]]: + + # start where we left off in previous run + start_lsn = dlt.current.resource_state().get("last_commit_lsn", 0) + if flush_slot: + advance_slot(start_lsn, slot_name, credentials) + + # continue until last message in replication slot + options: Dict[str, str] = {} + upto_lsn = get_max_lsn(slot_name, credentials) + if upto_lsn is None: + return + + table_qnames = {f"{schema}.{table_name}" for table_name in table_names} + + # generate items in batches + while True: + gen = ItemGenerator( + credentials=credentials, + slot_name=slot_name, + table_qnames=table_qnames, + options=options, + upto_lsn=upto_lsn, + start_lsn=start_lsn, + target_batch_size=target_batch_size, + included_columns=included_columns, + table_hints=table_hints, + ) + yield from gen + if gen.generated_all: + dlt.current.resource_state()["last_commit_lsn"] = gen.last_commit_lsn + break + start_lsn = gen.last_commit_lsn + + wal_reader = replication_resource(slot_name) + + return [ + dlt.transformer( + table_wal_handler(table), data_from=wal_reader, name=table, table_name=table ) - resources.append(xformer) - - return resources + for table in table_names + ] \ No newline at end 
of file diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index e218f6d84..c5264c398 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -533,15 +533,14 @@ def handle( schema_or_batch: Tuple[str, Union[TTableSchema, List[TDataItem]]] ) -> Iterable[DataItemWithMeta]: table_name, items = schema_or_batch - if table_name != table: - return - if isinstance(items, Dict): - schema: TTableSchema = items - yield dlt.mark.with_hints( - [], _table_to_resource_hints(schema), create_table_variant=True - ) - else: - yield dlt.mark.with_table_name(items, table) + if table_name == table: + if isinstance(items, Dict): + schema: TTableSchema = items + yield dlt.mark.with_hints( + [], _table_to_resource_hints(schema), create_table_variant=True + ) + else: + yield dlt.mark.with_table_name(items, table) return handle diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 5c4ed21fa..088183be5 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -3,14 +3,15 @@ import dlt import pytest -from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns +from dlt.common.schema.typing import TTableSchema from dlt.destinations.job_client_impl import SqlJobClientBase -from sources.pg_legacy_replication import replication_resource, replication_source +from sources.pg_legacy_replication import replication_source from sources.pg_legacy_replication.helpers import ( init_replication, cleanup_snapshot_resources, ) +from sources.rest_api import exclude_keys from tests.utils import ( ALL_DESTINATIONS, assert_load_info, @@ -54,6 +55,10 @@ def tbl_y(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), + table_hints={ + "tbl_x": {"write_disposition": "merge"}, + "tbl_y": {"write_disposition": "merge"}, + }, ) changes.tbl_x.apply_hints(write_disposition="merge", primary_key="id_x") changes.tbl_y.apply_hints(write_disposition="merge", primary_key="id_y") @@ -240,6 +245,11 @@ def test_mapped_data_types( data = deepcopy(TABLE_ROW_ALL_DATA_TYPES) column_schema = deepcopy(TABLE_UPDATE_COLUMNS_SCHEMA) + # FIXME Need to figure out why when creating a snapshot my schema get loaded in another job + expected_load_packages = 1 + if init_load: + expected_load_packages = 2 + # resource to load data into postgres source table @dlt.resource(primary_key="col1", write_disposition="merge", columns=column_schema) def items(data): @@ -269,7 +279,7 @@ def items(data): table_hints=table_hints if give_hints else None, ) - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", @@ -294,7 +304,7 @@ def items(data): src_pl.run(items([r1, r2])) info = dest_pl.run(changes) - assert_load_info(info) + assert_load_info(info, expected_load_packages=expected_load_packages) assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 if give_hints: @@ -318,7 +328,7 @@ def items(data): # process changes and assert expectations info = dest_pl.run(changes) - assert_load_info(info) + assert_load_info(info, expected_load_packages=expected_load_packages) assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 exp = [ {"col1": 1, "col2": 1.5, "col3": True}, @@ -340,7 +350,7 @@ def items(data): # process change and assert expectation info = dest_pl.run(changes) - assert_load_info(info) 
+ assert_load_info(info, expected_load_packages=expected_load_packages) assert load_table_counts(dest_pl, "items")["items"] == 3 if init_load else 2 exp = [{"col1": 2, "col2": 2.5, "col3": False}] assert_loaded_data( @@ -368,7 +378,7 @@ def test_unmapped_data_types( schema=src_pl.dataset_name, table_names="data_types", ) - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names="data_types", @@ -442,7 +452,7 @@ def tbl_z(data): take_snapshots=init_load, included_columns=included_columns, ) - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), @@ -523,7 +533,7 @@ def tbl_z(data): table_hints=table_hints, ) - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), @@ -607,7 +617,7 @@ def test_table_schema_change( ) # create resource and pipeline - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", @@ -713,7 +723,7 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: schema=src_pl.dataset_name, table_names="items", ) - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", @@ -727,6 +737,9 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: batch = [{**r, **{"id": key}} for r in [data] for key in range(1, 101)] src_pl.run(batch, table_name="items") extract_info = dest_pl.extract(changes) + from devtools import debug + + debug(extract_info) assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 # insert 100 records into source table in 5 transactions @@ -741,4 +754,4 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: batch = [{**r, **{"id": key}} for r in [data] for key in range(181, 201)] src_pl.run(batch, table_name="items") extract_info = dest_pl.extract(changes) - assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 + assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 \ No newline at end of file From 3f317526c3259270cda100366adeb5dfd749b5a6 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 25 Oct 2024 16:00:27 +0200 Subject: [PATCH 41/88] wip: init_replication is now a dlt source --- sources/pg_legacy_replication/__init__.py | 5 +- sources/pg_legacy_replication/helpers.py | 60 +++++++++---------- .../test_pg_replication.py | 24 ++++---- 3 files changed, 42 insertions(+), 47 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 8f8f5ef85..49ca66a23 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -66,7 +66,7 @@ def replication_source( Yields: Data items for changes published in the publication. 
- """ + """ if isinstance(table_names, str): table_names = [table_names] @@ -74,7 +74,6 @@ def replication_source( def replication_resource( slot_name: str, ) -> Iterable[Union[TDataItem, DataItemWithMeta]]: - # start where we left off in previous run start_lsn = dlt.current.resource_state().get("last_commit_lsn", 0) if flush_slot: @@ -114,4 +113,4 @@ def replication_resource( table_wal_handler(table), data_from=wal_reader, name=table, table_name=table ) for table in table_names - ] \ No newline at end of file + ] diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index c5264c398..d9448ecf0 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -27,7 +27,8 @@ from dlt.common.schema.utils import merge_column, merge_table from dlt.common.typing import TDataItem, TDataItems from dlt.extract.items import DataItemWithMeta -from dlt.extract.resource import DltResource, TResourceHints +from dlt.extract import DltSource, DltResource +from dlt.extract.resource import TResourceHints from dlt.sources.credentials import ConnectionStringCredentials from psycopg2.extensions import cursor, connection as ConnectionExt from psycopg2.extras import ( @@ -44,6 +45,7 @@ @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) +@dlt.source def init_replication( slot_name: str, schema: str, @@ -51,9 +53,8 @@ def init_replication( credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = False, included_columns: Optional[Dict[str, TColumnNames]] = None, - table_hints: Optional[Dict[str, TTableSchema]] = None, reset: bool = False, -) -> Optional[List[DltResource]]: +) -> Iterable[DltResource]: """Initializes replication for one, several, or all tables within a schema. 
Can be called repeatedly with the same `slot_name`: @@ -112,7 +113,7 @@ def init_replication( # Close connection if no snapshots are needed if not take_snapshots: rep_conn.close() - return None + return # Ensure `sqlalchemy` and `sql_table` are available _import_sql_table_resource() @@ -133,17 +134,13 @@ def on_begin(conn: ConnectionSqla) -> None: if isinstance(table_names, str): table_names = [table_names] included_columns = included_columns or {} - table_hints = table_hints or {} - return [ - _prepare_snapshot_resource( + for table in table_names: + yield _prepare_snapshot_resource( engine, - table_name, + table, schema, - included_columns=included_columns.get(table_name), - table_hints=table_hints.get(table_name), + included_columns=included_columns.get(table), ) - for table_name in table_names - ] def _configure_engine( @@ -170,36 +167,22 @@ def _prepare_snapshot_resource( schema: str, *, included_columns: Optional[TColumnNames] = None, - table_hints: Optional[TTableSchema] = None, ) -> DltResource: t_rsrc: DltResource = sql_table( # type: ignore[name-defined] credentials=engine, table=table_name, schema=schema, included_columns=included_columns, - ) - if table_hints: - t_rsrc.merge_hints(_table_to_resource_hints(table_hints)) + )f return t_rsrc -def _table_to_resource_hints(table_hints: TTableSchema) -> TResourceHints: - return dlt.mark.make_hints( - table_name=table_hints.get("name"), - write_disposition=table_hints.get("write_disposition"), - columns=table_hints.get("columns"), - schema_contract=table_hints.get("schema_contract"), - table_format=table_hints.get("table_format"), - file_format=table_hints.get("file_format"), - ) - - -def cleanup_snapshot_resources(snapshots: List[DltResource]) -> None: +def cleanup_snapshot_resources(snapshots: DltSource) -> None: """FIXME Awful hack to release the underlying SQL engine when snapshotting tables""" - if not snapshots: - return - engine: Engine = snapshots[0]._explicit_args["credentials"] - engine.dispose() + resources = snapshots.resources + if resources: + engine: Engine = next(iter(resources.values()))._explicit_args["credentials"] + engine.dispose() @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) @@ -545,6 +528,17 @@ def handle( return handle +def _table_to_resource_hints(table_hints: TTableSchema) -> TResourceHints: + return dlt.mark.make_hints( + table_name=table_hints.get("name"), + write_disposition=table_hints.get("write_disposition"), + columns=table_hints.get("columns"), + schema_contract=table_hints.get("schema_contract"), + table_format=table_hints.get("table_format"), + file_format=table_hints.get("file_format"), + ) + + def infer_table_schema( msg: RowMessage, *, @@ -615,4 +609,4 @@ def gen_data_item( data, data_type, for_delete=msg.op == Op.DELETE ) - return data_item + return data_item \ No newline at end of file diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 088183be5..9597fc714 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -55,10 +55,6 @@ def tbl_y(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - table_hints={ - "tbl_x": {"write_disposition": "merge"}, - "tbl_y": {"write_disposition": "merge"}, - }, ) changes.tbl_x.apply_hints(write_disposition="merge", primary_key="id_x") changes.tbl_y.apply_hints(write_disposition="merge", primary_key="id_y") @@ -76,13 +72,13 @@ def tbl_y(data): # initial load 
info = dest_pl.run(snapshots) + cleanup_snapshot_resources(snapshots) assert_load_info(info) assert load_table_counts(dest_pl, "tbl_x", "tbl_y") == {"tbl_x": 1, "tbl_y": 1} exp_tbl_x = [{"id_x": 1, "val_x": "foo"}] exp_tbl_y = [{"id_y": 1, "val_y": True}] assert_loaded_data(dest_pl, "tbl_x", ["id_x", "val_x"], exp_tbl_x, "id_x") assert_loaded_data(dest_pl, "tbl_y", ["id_y", "val_y"], exp_tbl_y, "id_y") - cleanup_snapshot_resources(snapshots) # process changes info = dest_pl.run(changes) @@ -276,8 +272,9 @@ def items(data): schema=src_pl.dataset_name, table_names="items", take_snapshots=init_load, - table_hints=table_hints if give_hints else None, ) + if init_load and give_hints: + snapshot.items.apply_hints(write_disposition="merge", columns=column_schema) changes = replication_source( slot_name=slot_name, @@ -292,9 +289,9 @@ def items(data): ) if init_load: info = dest_pl.run(snapshot) + cleanup_snapshot_resources(snapshot) assert_load_info(info) assert load_table_counts(dest_pl, "items")["items"] == 1 - cleanup_snapshot_resources(snapshot) # insert two records in postgres table r1 = deepcopy(data) @@ -474,10 +471,11 @@ def tbl_z(data): ) if init_load: dest_pl.run(snapshots) + cleanup_snapshot_resources(snapshots) assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x"} assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y"} assert get_cols(dest_pl, "tbl_z") == {"id_z", "val_z", "another_col_z"} - cleanup_snapshot_resources(snapshots) + dest_pl.run(changes) assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x", "lsn", "deleted_ts"} assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y", "lsn", "deleted_ts"} @@ -530,8 +528,10 @@ def tbl_z(data): schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), take_snapshots=init_load, - table_hints=table_hints, ) + if init_load: + snapshots.tbl_x.apply_hints(columns={"another_col_x": {"data_type": "double"}}) + snapshots.tbl_y.apply_hints(columns={"another_col_y": {"precision": 32}}) changes = replication_source( slot_name=slot_name, @@ -553,8 +553,10 @@ def tbl_z(data): dest_pl = dlt.pipeline( pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) + if init_load: dest_pl.run(snapshots) + cleanup_snapshot_resources(snapshots) assert ( dest_pl.default_schema.get_table_columns("tbl_x")["another_col_x"][ "data_type" @@ -573,7 +575,7 @@ def tbl_z(data): ] == "bigint" ) - cleanup_snapshot_resources(snapshots) + dest_pl.run(changes) assert ( dest_pl.default_schema.get_table_columns("tbl_x")["another_col_x"]["data_type"] @@ -754,4 +756,4 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: batch = [{**r, **{"id": key}} for r in [data] for key in range(181, 201)] src_pl.run(batch, table_name="items") extract_info = dest_pl.extract(changes) - assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 \ No newline at end of file + assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 From 637a6e9828e9ac5d4c8f6b3808e437b43e57b375 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 25 Oct 2024 16:29:25 +0200 Subject: [PATCH 42/88] wip: more refactoring --- sources/pg_legacy_replication/helpers.py | 70 ++++++++--------------- sources/pg_legacy_replication_pipeline.py | 8 +-- 2 files changed, 28 insertions(+), 50 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index d9448ecf0..223c7e05d 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -133,13 +133,13 @@ def on_begin(conn: 
ConnectionSqla) -> None: if isinstance(table_names, str): table_names = [table_names] - included_columns = included_columns or {} + for table in table_names: - yield _prepare_snapshot_resource( - engine, - table, - schema, - included_columns=included_columns.get(table), + yield sql_table( # type: ignore[name-defined] + credentials=engine, + table=table, + schema=schema, + included_columns=included_columns.get(table) if included_columns else None, ) @@ -161,22 +161,6 @@ def on_engine_disposed(engine: Engine) -> None: return engine -def _prepare_snapshot_resource( - engine: Engine, - table_name: str, - schema: str, - *, - included_columns: Optional[TColumnNames] = None, -) -> DltResource: - t_rsrc: DltResource = sql_table( # type: ignore[name-defined] - credentials=engine, - table=table_name, - schema=schema, - included_columns=included_columns, - )f - return t_rsrc - - def cleanup_snapshot_resources(snapshots: DltSource) -> None: """FIXME Awful hack to release the underlying SQL engine when snapshotting tables""" resources = snapshots.resources @@ -465,7 +449,7 @@ class ItemGenerator: last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False - def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: + def __iter__(self) -> Iterator[Tuple[Union[str, TTableSchema], List[TDataItem]]]: """Yields replication messages from MessageConsumer. Starts replication of messages published by the publication from the replication slot. @@ -492,38 +476,32 @@ def __iter__(self) -> Iterator[Union[TDataItem, DataItemWithMeta]]: pass finally: cur.connection.close() - yield from self.flush(consumer) - - def flush( - self, consumer: MessageConsumer - ) -> Iterator[Union[TDataItem, DataItemWithMeta]]: - """Flushes the batches of data items generated by MessageConsumer.""" - for table, items in consumer.data_items.items(): - # Retrieve the table schema if it exists (never for DELETEs) - schema = consumer.last_table_schema.get(table) - if schema: - yield (table, schema) - yield (table, items) - - self.last_commit_lsn = consumer.last_commit_lsn - self.generated_all = consumer.consumed_all + for table, data_items in consumer.data_items.items(): + # Yield schema if available; otherwise, yield the table name with items + yield (consumer.last_table_schema.get(table, table), data_items) + # Update state after flush + self.last_commit_lsn = consumer.last_commit_lsn + self.generated_all = consumer.consumed_all def table_wal_handler( table: str, ) -> Callable[[TDataItem], Iterable[DataItemWithMeta]]: def handle( - schema_or_batch: Tuple[str, Union[TTableSchema, List[TDataItem]]] + data_items: Tuple[Union[str, TTableSchema], List[TDataItem]] ) -> Iterable[DataItemWithMeta]: - table_name, items = schema_or_batch - if table_name == table: - if isinstance(items, Dict): - schema: TTableSchema = items + table_or_schema, items = data_items + if isinstance(table_or_schema, Dict): + table_name = table_or_schema["name"] + if table_name == table: yield dlt.mark.with_hints( - [], _table_to_resource_hints(schema), create_table_variant=True + [], + _table_to_resource_hints(table_or_schema), + create_table_variant=True, ) - else: yield dlt.mark.with_table_name(items, table) + elif table_or_schema == table: + yield dlt.mark.with_table_name(items, table) return handle @@ -609,4 +587,4 @@ def gen_data_item( data, data_type, for_delete=msg.op == Op.DELETE ) - return data_item \ No newline at end of file + return data_item diff --git a/sources/pg_legacy_replication_pipeline.py 
b/sources/pg_legacy_replication_pipeline.py index 0fd6abc51..df054da24 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -3,7 +3,7 @@ from dlt.common.destination import Destination from dlt.destinations.impl.postgres.configuration import PostgresCredentials -from pg_legacy_replication import replication_resource +from pg_legacy_replication import replication_source from pg_legacy_replication.helpers import init_replication @@ -42,7 +42,7 @@ def replicate_single_table() -> None: ) # create a resource that generates items for each change in the source table - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names="my_source_table", @@ -113,7 +113,7 @@ def replicate_with_initial_load() -> None: # insert record in source table and propagate change to destination change_source_table(src_pl, "INSERT INTO {table_name} VALUES (3, true);") - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names="my_source_table", @@ -216,7 +216,7 @@ def replicate_with_column_selection() -> None: ) # create a resource that generates items for each change in the schema's tables - changes = replication_resource( + changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names=["tbl_x", "tbl_y"], From 8a8134bb47524d9181e22e9f65b686cb22cefc20 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 25 Oct 2024 17:54:45 +0200 Subject: [PATCH 43/88] wip: saving work until I can get hinting to work --- sources/pg_legacy_replication/__init__.py | 4 +--- .../pg_legacy_replication/test_pg_replication.py | 15 ++++----------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 49ca66a23..f04d8cde1 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -109,8 +109,6 @@ def replication_resource( wal_reader = replication_resource(slot_name) return [ - dlt.transformer( - table_wal_handler(table), data_from=wal_reader, name=table, table_name=table - ) + dlt.transformer(table_wal_handler(table), data_from=wal_reader, name=table) for table in table_names ] diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 9597fc714..e81c0686d 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -257,15 +257,6 @@ def items(data): src_pl.run(items(data)) add_pk(src_pl.sql_client, "items", "col1") - if give_hints: - column_schema["col1"]["primary_key"] = True - else: - column_schema = {"col1": {"primary_key": True}} - - table_hints: Dict[str, TTableSchema] = { - "items": {"columns": column_schema, "write_disposition": "merge"} - } - # initialize replication and create resources snapshot = init_replication( slot_name=slot_name, @@ -274,14 +265,16 @@ def items(data): take_snapshots=init_load, ) if init_load and give_hints: - snapshot.items.apply_hints(write_disposition="merge", columns=column_schema) + snapshot.items.apply_hints(columns=column_schema) changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", - table_hints=table_hints, ) + changes.items.apply_hints(write_disposition="merge", primary_key="col1") + if give_hints: + changes.items.apply_hints(columns=column_schema) # initial load dest_pl = 
dlt.pipeline( From ee3cb9c01778b5a058063f4eea940864e218cf4f Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sat, 26 Oct 2024 19:41:37 +0200 Subject: [PATCH 44/88] wip: finally got something somewhat working --- sources/pg_legacy_replication/__init__.py | 32 +-- sources/pg_legacy_replication/helpers.py | 152 ++++-------- sources/pg_legacy_replication_pipeline.py | 9 +- tests/pg_legacy_replication/cases.py | 185 ++++++++++++++- tests/pg_legacy_replication/test_helpers.py | 220 +----------------- .../test_pg_replication.py | 38 ++- 6 files changed, 276 insertions(+), 360 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index f04d8cde1..89beb91c4 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -3,12 +3,12 @@ from typing import Dict, Sequence, Optional, Iterable, Union import dlt -from dlt.common.schema.typing import TColumnNames, TTableSchema +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns from dlt.extract import DltResource -from dlt.extract.items import DataItemWithMeta, TDataItem +from dlt.extract.items import TDataItem from dlt.sources.credentials import ConnectionStringCredentials -from .helpers import advance_slot, get_max_lsn, ItemGenerator, table_wal_handler +from .helpers import advance_slot, get_max_lsn, ItemGenerator, create_table_dispatch @dlt.source @@ -18,10 +18,10 @@ def replication_source( table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, included_columns: Optional[Dict[str, TColumnNames]] = None, - table_hints: Optional[Dict[str, TTableSchema]] = None, + column_hints: Optional[Dict[str, TTableSchemaColumns]] = None, target_batch_size: int = 1000, flush_slot: bool = True, -) -> Sequence[DltResource]: +) -> Iterable[DltResource]: """Source yielding data items for changes in one or more postgres tables. - Relies on a replication slot that publishes DML operations (i.e. `insert`, `update`, and `delete`). @@ -32,7 +32,7 @@ def replication_source( Args: slot_name (str): Name of the replication slot to consume replication messages from. credentials (ConnectionStringCredentials): Postgres database credentials. - included_columns (Optional[Dict[str, Sequence[str]]]): Maps table name(s) to + included_columns (Optional[Dict[str, TColumnNames]]): Maps table name(s) to sequence of names of columns to include in the generated data items. Any column not in the sequence is excluded. If not provided, all columns are included. For example: @@ -42,7 +42,7 @@ def replication_source( "table_y": ["col_x", "col_y", "col_z"], } ``` - columns (Optional[Dict[str, TTableHintTemplate[TAnySchemaColumns]]]): Maps + columns (Optional[Dict[str, TTableSchemaColumns]]): Maps table name(s) to column hints to apply on the replicated table(s). 
For example: ``` columns={ @@ -73,14 +73,13 @@ def replication_source( @dlt.resource(name=lambda args: args["slot_name"], standalone=True) def replication_resource( slot_name: str, - ) -> Iterable[Union[TDataItem, DataItemWithMeta]]: + ) -> Iterable[TDataItem]: # start where we left off in previous run start_lsn = dlt.current.resource_state().get("last_commit_lsn", 0) if flush_slot: advance_slot(start_lsn, slot_name, credentials) # continue until last message in replication slot - options: Dict[str, str] = {} upto_lsn = get_max_lsn(slot_name, credentials) if upto_lsn is None: return @@ -93,12 +92,10 @@ def replication_resource( credentials=credentials, slot_name=slot_name, table_qnames=table_qnames, - options=options, upto_lsn=upto_lsn, start_lsn=start_lsn, target_batch_size=target_batch_size, included_columns=included_columns, - table_hints=table_hints, ) yield from gen if gen.generated_all: @@ -108,7 +105,12 @@ def replication_resource( wal_reader = replication_resource(slot_name) - return [ - dlt.transformer(table_wal_handler(table), data_from=wal_reader, name=table) - for table in table_names - ] + for table in table_names: + yield dlt.transformer( + create_table_dispatch( + table=table, + column_hints=column_hints.get(table) if column_hints else None, + ), + data_from=wal_reader, + name=table, + ) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 223c7e05d..fc3cde9b7 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -10,9 +10,8 @@ Sequence, Any, Iterable, - Tuple, Callable, - TypedDict, + NamedTuple, ) import dlt @@ -24,13 +23,12 @@ TTableSchema, TTableSchemaColumns, ) -from dlt.common.schema.utils import merge_column, merge_table -from dlt.common.typing import TDataItem, TDataItems -from dlt.extract.items import DataItemWithMeta +from dlt.common.schema.utils import merge_column +from dlt.common.typing import TDataItem from dlt.extract import DltSource, DltResource -from dlt.extract.resource import TResourceHints +from dlt.extract.items import DataItemWithMeta from dlt.sources.credentials import ConnectionStringCredentials -from psycopg2.extensions import cursor, connection as ConnectionExt +from psycopg2.extensions import connection as ConnectionExt from psycopg2.extras import ( LogicalReplicationConnection, ReplicationCursor, @@ -40,7 +38,7 @@ from sqlalchemy import Connection as ConnectionSqla, Engine, event from .exceptions import SqlDatabaseSourceImportError -from .pg_logicaldec_pb2 import DatumMessage, Op, RowMessage +from .pg_logicaldec_pb2 import Op, RowMessage from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val @@ -169,17 +167,6 @@ def cleanup_snapshot_resources(snapshots: DltSource) -> None: engine.dispose() -@dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) -def get_pg_version( - cur: cursor = None, - credentials: ConnectionStringCredentials = dlt.secrets.value, -) -> int: - """Returns Postgres server version as int.""" - if cur is not None: - return cur.connection.server_version - return _get_conn(credentials).server_version - - def create_replication_slot( # type: ignore[return] name: str, cur: ReplicationCursor, output_plugin: str = "decoderbufs" ) -> Optional[Dict[str, str]]: @@ -314,14 +301,11 @@ def __init__( table_qnames: Set[str], target_batch_size: int = 1000, included_columns: Optional[Dict[str, TColumnNames]] = None, - table_hints: Optional[Dict[str, TTableSchema]] = None, ) -> None: 
self.upto_lsn = upto_lsn self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.included_columns = self._normalize_columns(included_columns) - self.table_hints = self._normalize_hints(table_hints) self.consumed_all: bool = False # maps table names to list of data items @@ -389,9 +373,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: else: table_schema = self._get_table_schema(msg) data_item = gen_data_item( - msg, - included_columns=self.included_columns.get(table_name), - column_schema=table_schema["columns"], + msg, self.included_columns.get(table_name), table_schema["columns"] ) data_item["lsn"] = lsn self.data_items[table_name].append(data_item) @@ -399,11 +381,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: def _get_table_schema(self, msg: RowMessage) -> TTableSchema: table_name = msg.table.split(".")[1] last_table_schema = self.last_table_schema.get(table_name) - table_schema = infer_table_schema( - msg, - included_columns=self.included_columns.get(table_name), - table_hints=self.table_hints.get(table_name), - ) + table_schema = infer_table_schema(msg, self.included_columns.get(table_name)) if last_table_schema is None: self.last_table_schema[table_name] = table_schema elif last_table_schema != table_schema: @@ -423,16 +401,11 @@ def _normalize_columns( for table_name, columns in included_columns.items() } - @staticmethod - def _normalize_hints( - table_hints: Optional[Dict[str, TTableSchema]] - ) -> Dict[str, TTableSchema]: - """Normalize the hints by ensuring that each table schema has a 'columns' TTableSchemaColumns.""" - if not table_hints: - return {} - for table_schema in table_hints.values(): - table_schema.setdefault("columns", {}) - return table_hints + +class TableItems(NamedTuple): + table: str + schema: Optional[TTableSchema] + items: List[TDataItem] @dataclass @@ -440,16 +413,14 @@ class ItemGenerator: credentials: ConnectionStringCredentials slot_name: str table_qnames: Set[str] - options: Dict[str, str] upto_lsn: int start_lsn: int = 0 target_batch_size: int = 1000 included_columns: Optional[Dict[str, TColumnNames]] = None - table_hints: Optional[Dict[str, TTableSchema]] = None last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False - def __iter__(self) -> Iterator[Tuple[Union[str, TTableSchema], List[TDataItem]]]: + def __iter__(self) -> Iterator[TableItems]: """Yields replication messages from MessageConsumer. Starts replication of messages published by the publication from the replication slot. 
@@ -459,17 +430,13 @@ def __iter__(self) -> Iterator[Tuple[Union[str, TTableSchema], List[TDataItem]]] try: cur = _get_rep_conn(self.credentials).cursor() cur.start_replication( - slot_name=self.slot_name, - start_lsn=self.start_lsn, - decode=False, - options=self.options, + slot_name=self.slot_name, start_lsn=self.start_lsn, decode=False ) consumer = MessageConsumer( upto_lsn=self.upto_lsn, table_qnames=self.table_qnames, target_batch_size=self.target_batch_size, included_columns=self.included_columns, - table_hints=self.table_hints, ) cur.consume_stream(consumer) except StopReplication: # completed batch or reached `upto_lsn` @@ -477,51 +444,41 @@ def __iter__(self) -> Iterator[Tuple[Union[str, TTableSchema], List[TDataItem]]] finally: cur.connection.close() for table, data_items in consumer.data_items.items(): - # Yield schema if available; otherwise, yield the table name with items - yield (consumer.last_table_schema.get(table, table), data_items) + yield TableItems( + table, consumer.last_table_schema.get(table), data_items + ) # Update state after flush self.last_commit_lsn = consumer.last_commit_lsn self.generated_all = consumer.consumed_all -def table_wal_handler( - table: str, -) -> Callable[[TDataItem], Iterable[DataItemWithMeta]]: - def handle( - data_items: Tuple[Union[str, TTableSchema], List[TDataItem]] - ) -> Iterable[DataItemWithMeta]: - table_or_schema, items = data_items - if isinstance(table_or_schema, Dict): - table_name = table_or_schema["name"] - if table_name == table: - yield dlt.mark.with_hints( - [], - _table_to_resource_hints(table_or_schema), - create_table_variant=True, - ) - yield dlt.mark.with_table_name(items, table) - elif table_or_schema == table: - yield dlt.mark.with_table_name(items, table) - - return handle +def create_table_dispatch( + table: str, column_hints: Optional[TTableSchemaColumns] = None +) -> Callable[[TableItems], Iterable[DataItemWithMeta]]: + """Creates a dispatch handler that processes data items based on a specified table and optional column hints.""" + def handle(table_items: TableItems) -> Iterable[DataItemWithMeta]: + if table_items.table != table: + return + if schema := table_items.schema: + columns = schema["columns"] + if column_hints: + for col_name, col_hint in column_hints.items(): + columns[col_name] = merge_column( + columns.get(col_name, {}), col_hint + ) + yield dlt.mark.with_hints( + [], + dlt.mark.make_hints(table_name=table, columns=columns), + create_table_variant=True, + ) + yield dlt.mark.with_table_name(table_items.items, table) -def _table_to_resource_hints(table_hints: TTableSchema) -> TResourceHints: - return dlt.mark.make_hints( - table_name=table_hints.get("name"), - write_disposition=table_hints.get("write_disposition"), - columns=table_hints.get("columns"), - schema_contract=table_hints.get("schema_contract"), - table_format=table_hints.get("table_format"), - file_format=table_hints.get("file_format"), - ) + return handle def infer_table_schema( - msg: RowMessage, - *, - included_columns: Optional[Set[str]] = None, - table_hints: Optional[TTableSchema] = None, + msg: RowMessage, included_columns: Optional[Set[str]] = None ) -> TTableSchema: """Infers the table schema from the replication message and optional hints""" columns: TTableSchemaColumns = { @@ -534,35 +491,14 @@ def infer_table_schema( columns["lsn"] = {"data_type": "bigint", "nullable": True} columns["deleted_ts"] = {"data_type": "timestamp", "nullable": True} - # write_disposition = ( - # table_hints.get("write_disposition", "append") if 
table_hints else "append" - # ) - # - # FIXME if write_disposition not in ("replace", "append"): - columns["lsn"]["dedup_sort"] = "desc" - columns["deleted_ts"]["hard_delete"] = True - - schema, table = msg.table.split(".") - table_schema: TTableSchema = {"name": table, "columns": columns} - - if table_hints: - table_hints["name"] = table - # FIXME I dont't know why I have to do this, but merge_table doesn't work right or I'm missing something - col_hints = table_hints.get("columns") - if col_hints: - col_hints = { - col_name: merge_column(columns[col_name], col_schema) - for col_name, col_schema in col_hints.items() - if not included_columns or col_name in included_columns - } - merge_table(schema, table_schema, table_hints) - - return table_schema + return { + "name": (msg.table.split(".")[1]), + "columns": columns, + } def gen_data_item( msg: RowMessage, - *, included_columns: Optional[Set[str]] = None, column_schema: Optional[TTableSchemaColumns] = None, ) -> TDataItem: diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index df054da24..47f733c43 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -1,12 +1,10 @@ import dlt - from dlt.common.destination import Destination from dlt.destinations.impl.postgres.configuration import PostgresCredentials from pg_legacy_replication import replication_source from pg_legacy_replication.helpers import init_replication - PG_CREDS = dlt.secrets.get("sources.pg_replication.credentials", PostgresCredentials) @@ -46,13 +44,8 @@ def replicate_single_table() -> None: slot_name=slot_name, schema=src_pl.dataset_name, table_names="my_source_table", - table_hints={ - "my_source_table": { - "columns": {"id": {"primary_key": True}}, - "write_disposition": "merge", - }, - }, ) + changes.my_source_table.apply_hints(write_disposition="merge", primary_key="id") # insert two records in source table and propagate changes to destination change_source_table( diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index bb99ad52e..d5fde73ba 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -1,8 +1,10 @@ from typing import List +import pendulum from dlt.common import Decimal from dlt.common.data_types.typing import DATA_TYPES -from dlt.common.schema import TColumnSchema, TTableSchemaColumns +from dlt.common.schema import TColumnSchema, TTableSchema, TTableSchemaColumns +from dlt.common.typing import TDataItem TABLE_ROW_ALL_DATA_TYPES = { "col1": 989127831, @@ -98,3 +100,184 @@ col_schema["data_type"] = "complex" TABLE_UPDATE_COLUMNS_SCHEMA: TTableSchemaColumns = {t["name"]: t for t in TABLE_UPDATE} + +ROW_MESSAGES: List[dict] = [ + { + "transactionId": 969, + "commitTime": "1728662646949062", + "table": "src_pl_dataset_202410110404048747_staging.tbl_y", + "op": "INSERT", + "newTuple": [ + { + "columnName": "id_y", + "columnType": "20", + "datumInt64": "2", + }, + { + "columnName": "val_y", + "columnType": "16", + "datumBool": False, + }, + { + "columnName": "_dlt_load_id", + "columnType": "1043", + "datumString": "1728662646.2657657", + }, + { + "columnName": "_dlt_id", + "columnType": "1043", + "datumString": "gGjifTMTAUs5ag", + }, + ], + "newTypeinfo": [ + { + "modifier": "bigint", + "valueOptional": False, + }, + { + "modifier": "boolean", + "valueOptional": True, + }, + { + "modifier": "character varying", + "valueOptional": False, + }, + { + "modifier": "character varying", + 
"valueOptional": False, + }, + ], + "oldTuple": [], + }, + { + "transactionId": 2018, + "commitTime": "1729503423666542", + "table": "src_pl_dataset_202410210936594956.items", + "op": "INSERT", + "newTuple": [ + { + "columnName": "col4", + "columnType": 1184, + "datumInt64": 1653312405176451, + }, + { + "columnName": "col9", + "columnType": 3802, + "datumString": ( + '{"link": "?commen\\ntU\\nrn=urn%3Ali%3Acomment%3A%28acti\\n \\u0006 \\\\vity%3A69\'08444473\\n\\n551163392' + '%2C6n \\r \x8e9085", "complex": [1, 2, 3, "a"]}' + ), + }, + { + "columnName": "col10", + "columnType": 1082, + "datumInt32": 19415, + }, + { + "columnName": "col11", + "columnType": 1083, + "datumInt64": 48405176451, + }, + ], + "newTypeinfo": [ + {"modifier": "timestamp with time zone", "valueOptional": False}, + {"modifier": "jsonb", "valueOptional": False}, + {"modifier": "date", "valueOptional": False}, + {"modifier": "time without time zone", "valueOptional": False}, + ], + }, + { + "transactionId": 932, + "commitTime": "1729299383354856", + "table": "src_pl_dataset_202410191256122080.tbl_x", + "op": "DELETE", + "oldTuple": [ + { + "columnName": "id_x", + "columnType": "20", + "datumInt64": "1", + }, + { + "columnName": "val_x", + "columnType": "1043", + }, + { + "columnName": "_dlt_load_id", + "columnType": "1043", + }, + { + "columnName": "_dlt_id", + "columnType": "1043", + }, + ], + }, +] + +DATA_ITEMS: List[TDataItem] = [ + { + "_dlt_id": "gGjifTMTAUs5ag", + "_dlt_load_id": "1728662646.2657657", + "id_y": 2, + "val_y": False, + }, + { + "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), + "col9": { + "complex": [1, 2, 3, "a"], + "link": ( + "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6" + " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" + ), + }, + "col10": pendulum.parse("2023-02-27", strict=False).date(), + "col11": pendulum.parse("13:26:45.176451", strict=False).time(), + }, + { + "id_x": 1, + "val_x": "", + "_dlt_load_id": "", + "_dlt_id": "", + "deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), + }, +] + +TABLE_SCHEMAS: List[TTableSchema] = [ + { + "name": "tbl_y", + "columns": { + "id_y": { + "data_type": "bigint", + "name": "id_y", + "nullable": False, + "precision": 64, + }, + "val_y": {"data_type": "bool", "name": "val_y", "nullable": True}, + "_dlt_id": {"data_type": "text", "name": "_dlt_id", "nullable": False}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "lsn": {"data_type": "bigint", "nullable": True}, + "deleted_ts": {"data_type": "timestamp", "nullable": True}, + }, + }, + { + "name": "items", + "columns": { + "col4": {"data_type": "timestamp", "name": "col4", "nullable": False}, + "col9": {"data_type": "complex", "name": "col9", "nullable": False}, + "col10": {"data_type": "date", "name": "col10", "nullable": False}, + "col11": {"data_type": "time", "name": "col11", "nullable": False}, + "lsn": {"data_type": "bigint", "nullable": True}, + "deleted_ts": {"data_type": "timestamp", "nullable": True}, + }, + }, + { + "name": "tbl_x", + "columns": { + "lsn": {"data_type": "bigint", "nullable": True}, + "deleted_ts": {"data_type": "timestamp", "nullable": True}, + }, + }, +] diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 4cef1523f..3482fc768 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -11,232 +11,20 @@ gen_data_item, ) from sources.pg_legacy_replication.pg_logicaldec_pb2 import 
RowMessage +from .cases import ROW_MESSAGES, DATA_ITEMS, TABLE_SCHEMAS -@pytest.mark.parametrize( - "data, table_hints, expected_schema", - [ - ( - { - "transactionId": 969, - "commitTime": "1728662646949062", - "table": "src_pl_dataset_202410110404048747_staging.tbl_y", - "op": "INSERT", - "newTuple": [ - {"columnName": "id_y", "columnType": "20", "datumInt64": 2}, - {"columnName": "val_y", "columnType": "16", "datumBool": False}, - { - "columnName": "_dlt_load_id", - "columnType": "1043", - "datumString": "1728662646.2657657", - }, - { - "columnName": "_dlt_id", - "columnType": "1043", - "datumString": "gGjifTMTAUs5ag", - }, - ], - "newTypeinfo": [ - {"modifier": "bigint", "valueOptional": False}, - {"modifier": "boolean", "valueOptional": True}, - {"modifier": "character varying", "valueOptional": False}, - {"modifier": "character varying", "valueOptional": False}, - ], - "oldTuple": [], - }, - {"columns": {"id_y": {"primary_key": True}}, "write_disposition": "merge"}, - { - "name": "tbl_y", - "columns": { - "id_y": { - "data_type": "bigint", - "precision": 64, - "name": "id_y", - "nullable": False, - "primary_key": True, - }, - "val_y": {"data_type": "bool", "name": "val_y", "nullable": True}, - "_dlt_load_id": { - "data_type": "text", - "name": "_dlt_load_id", - "nullable": False, - }, - "_dlt_id": { - "data_type": "text", - "name": "_dlt_id", - "nullable": False, - }, - "lsn": { - "data_type": "bigint", - "dedup_sort": "desc", - "nullable": True, - }, - "deleted_ts": { - "data_type": "timestamp", - "hard_delete": True, - "nullable": True, - }, - }, - "write_disposition": "merge", - }, - ), - ], -) +@pytest.mark.parametrize("data, expected_schema", zip(ROW_MESSAGES, TABLE_SCHEMAS)) def test_infer_table_schema( data, - table_hints: Optional[TTableSchema], expected_schema: TTableSchema, ): row_msg = RowMessage() parse_dict(data, row_msg) - assert infer_table_schema(row_msg, table_hints=table_hints) == expected_schema + assert infer_table_schema(row_msg) == expected_schema -@pytest.mark.parametrize( - "data, data_item", - [ - ( - { - "transactionId": 969, - "commitTime": "1728662646949062", - "table": "src_pl_dataset_202410110404048747_staging.tbl_y", - "op": "INSERT", - "newTuple": [ - { - "columnName": "id_y", - "columnType": "20", - "datumInt64": "2", - }, - { - "columnName": "val_y", - "columnType": "16", - "datumBool": False, - }, - { - "columnName": "_dlt_load_id", - "columnType": "1043", - "datumString": "1728662646.2657657", - }, - { - "columnName": "_dlt_id", - "columnType": "1043", - "datumString": "gGjifTMTAUs5ag", - }, - ], - "newTypeinfo": [ - { - "modifier": "bigint", - "valueOptional": False, - }, - { - "modifier": "boolean", - "valueOptional": True, - }, - { - "modifier": "character varying", - "valueOptional": False, - }, - { - "modifier": "character varying", - "valueOptional": False, - }, - ], - "oldTuple": [], - }, - { - "_dlt_id": "gGjifTMTAUs5ag", - "_dlt_load_id": "1728662646.2657657", - "id_y": 2, - "val_y": False, - }, - ), - ( - { - "transactionId": 2018, - "commitTime": "1729503423666542", - "table": "src_pl_dataset_202410210936594956.items", - "op": "INSERT", - "newTuple": [ - { - "columnName": "col4", - "columnType": 1184, - "datumInt64": 1653312405176451, - }, - { - "columnName": "col9", - "columnType": 3802, - "datumString": ( - '{"link": "?commen\\ntU\\nrn=urn%3Ali%3Acomment%3A%28acti\\n \\u0006 \\\\vity%3A69\'08444473\\n\\n551163392' - '%2C6n \\r \x8e9085", "complex": [1, 2, 3, "a"]}' - ), - }, - { - "columnName": "col10", - "columnType": 1082, - 
"datumInt32": 19415, - }, - { - "columnName": "col11", - "columnType": 1083, - "datumInt64": 48405176451, - }, - ], - "newTypeinfo": [ - {"modifier": "timestamp with time zone", "valueOptional": False}, - {"modifier": "jsonb", "valueOptional": False}, - {"modifier": "date", "valueOptional": False}, - {"modifier": "time without time zone", "valueOptional": False}, - ], - }, - { - "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), - "col9": { - "complex": [1, 2, 3, "a"], - "link": ( - "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6" - " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" - ), - }, - "col10": pendulum.parse("2023-02-27", strict=False).date(), - "col11": pendulum.parse("13:26:45.176451", strict=False).time(), - }, - ), - ( - { - "transactionId": 932, - "commitTime": "1729299383354856", - "table": "src_pl_dataset_202410191256122080.tbl_x", - "op": "DELETE", - "oldTuple": [ - { - "columnName": "id_x", - "columnType": "20", - "datumInt64": "1", - }, - { - "columnName": "val_x", - "columnType": "1043", - }, - { - "columnName": "_dlt_load_id", - "columnType": "1043", - }, - { - "columnName": "_dlt_id", - "columnType": "1043", - }, - ], - }, - { - "id_x": 1, - "val_x": "", - "_dlt_load_id": "", - "_dlt_id": "", - "deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), - }, - ), - ], -) +@pytest.mark.parametrize("data, data_item", zip(ROW_MESSAGES, DATA_ITEMS)) def test_gen_data_item(data, data_item: TDataItem): row_msg = RowMessage() parse_dict(data, row_msg) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index e81c0686d..35ec12c90 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -3,7 +3,7 @@ import dlt import pytest -from dlt.common.schema.typing import TTableSchema +from dlt.common.schema.typing import TTableSchemaColumns from dlt.destinations.job_client_impl import SqlJobClientBase from sources.pg_legacy_replication import replication_source @@ -11,7 +11,6 @@ init_replication, cleanup_snapshot_resources, ) -from sources.rest_api import exclude_keys from tests.utils import ( ALL_DESTINATIONS, assert_load_info, @@ -20,6 +19,11 @@ from .cases import TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE_COLUMNS_SCHEMA from .utils import add_pk, assert_loaded_data +merge_hints: TTableSchemaColumns = { + "deleted_ts": {"hard_delete": True}, + "lsn": {"dedup_sort": "desc"}, +} + @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) def test_core_functionality( @@ -56,8 +60,12 @@ def tbl_y(data): schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), ) - changes.tbl_x.apply_hints(write_disposition="merge", primary_key="id_x") - changes.tbl_y.apply_hints(write_disposition="merge", primary_key="id_y") + changes.tbl_x.apply_hints( + write_disposition="merge", primary_key="id_x", columns=merge_hints + ) + changes.tbl_y.apply_hints( + write_disposition="merge", primary_key="id_y", columns=merge_hints + ) src_pl.run( [ @@ -188,8 +196,12 @@ def tbl_y(data): schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), ) - changes.tbl_x.apply_hints(write_disposition="merge", primary_key="id_x") - changes.tbl_y.apply_hints(write_disposition="merge", primary_key="id_y") + changes.tbl_x.apply_hints( + write_disposition="merge", primary_key="id_x", columns=merge_hints + ) + changes.tbl_y.apply_hints( + write_disposition="merge", primary_key="id_y", columns=merge_hints + ) # change postgres table after replication has been initialized # 
these records should be in the replicated table @@ -272,7 +284,9 @@ def items(data): schema=src_pl.dataset_name, table_names="items", ) - changes.items.apply_hints(write_disposition="merge", primary_key="col1") + changes.items.apply_hints( + write_disposition="merge", primary_key="col1", columns=merge_hints + ) if give_hints: changes.items.apply_hints(columns=column_schema) @@ -483,7 +497,7 @@ def tbl_z(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("init_load", [True, False]) -def test_table_hints( +def test_column_hints( src_config: Tuple[dlt.Pipeline, str], destination_name: str, init_load: bool ) -> None: @dlt.resource @@ -510,9 +524,9 @@ def tbl_z(data): ) # initialize replication and create resources - table_hints: Dict[str, TTableSchema] = { - "tbl_x": {"columns": {"another_col_x": {"data_type": "double"}}}, - "tbl_y": {"columns": {"another_col_y": {"precision": 32}}}, + column_hints: Dict[str, TTableSchemaColumns] = { + "tbl_x": {"another_col_x": {"data_type": "double"}}, + "tbl_y": {"another_col_y": {"precision": 32}}, # tbl_z is not specified, hence all columns should be included } @@ -530,7 +544,7 @@ def tbl_z(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), - table_hints=table_hints, + column_hints=column_hints, ) # update three postgres tables From 1727456e0afb68456a48cece592a76cf2e9861f4 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sun, 27 Oct 2024 02:45:24 +0200 Subject: [PATCH 45/88] wip: done with coding now docs --- sources/pg_legacy_replication/helpers.py | 29 +++----- sources/pg_legacy_replication_pipeline.py | 74 +++---------------- .../test_pg_replication.py | 59 --------------- 3 files changed, 22 insertions(+), 140 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index fc3cde9b7..8b76d6494 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -72,7 +72,7 @@ def init_replication( schema (str): Name of the schema to replicate tables from. table_names (Optional[Union[str, Sequence[str]]]): Name(s) of the table(s) to include in the publication. If not provided, all tables in the schema - are included (also tables added to the schema after the publication was created). + are included. credentials (ConnectionStringCredentials): Postgres database credentials. take_snapshots (bool): Whether the table states in the snapshot exported during replication slot creation are persisted to tables. 
If true, a @@ -305,7 +305,14 @@ def __init__( self.upto_lsn = upto_lsn self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.included_columns = self._normalize_columns(included_columns) + self.included_columns = ( + { + table: {s for s in ([cols] if isinstance(cols, str) else cols)} + for table, cols in included_columns.items() + } + if included_columns + else {} + ) self.consumed_all: bool = False # maps table names to list of data items @@ -371,15 +378,14 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: if msg.op == Op.DELETE: data_item = gen_data_item(msg) else: - table_schema = self._get_table_schema(msg) + table_schema = self._get_table_schema(msg, table_name) data_item = gen_data_item( msg, self.included_columns.get(table_name), table_schema["columns"] ) data_item["lsn"] = lsn self.data_items[table_name].append(data_item) - def _get_table_schema(self, msg: RowMessage) -> TTableSchema: - table_name = msg.table.split(".")[1] + def _get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: last_table_schema = self.last_table_schema.get(table_name) table_schema = infer_table_schema(msg, self.included_columns.get(table_name)) if last_table_schema is None: @@ -388,19 +394,6 @@ def _get_table_schema(self, msg: RowMessage) -> TTableSchema: raise StopReplication # table schema change return table_schema - @staticmethod - def _normalize_columns( - included_columns: Optional[Dict[str, TColumnNames]] - ) -> Dict[str, Set[str]]: - if not included_columns: - return {} - return { - table_name: { - col for col in ([columns] if isinstance(columns, str) else columns) - } - for table_name, columns in included_columns.items() - } - class TableItems(NamedTuple): table: str diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index 47f733c43..334f7c075 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -45,7 +45,14 @@ def replicate_single_table() -> None: schema=src_pl.dataset_name, table_names="my_source_table", ) - changes.my_source_table.apply_hints(write_disposition="merge", primary_key="id") + changes.my_source_table.apply_hints( + write_disposition="merge", + primary_key="id", + columns={ + "deleted_ts": {"hard_delete": True}, + "lsn": {"dedup_sort": "desc"}, + }, + ) # insert two records in source table and propagate changes to destination change_source_table( @@ -115,64 +122,6 @@ def replicate_with_initial_load() -> None: show_destination_table(dest_pl) -# def replicate_entire_schema() -> None: -# """Demonstrates setup and usage of schema replication. -# -# Schema replication requires a Postgres server version of 15 or higher. An -# exception is raised if that's not the case. 
-# """ -# # create source and destination pipelines -# src_pl = get_postgres_pipeline() -# dest_pl = dlt.pipeline( -# pipeline_name="pg_replication_pipeline", -# destination="duckdb", -# dataset_name="replicate_entire_schema", -# dev_mode=True, -# ) -# -# # create two source tables to demonstrate schema replication -# create_source_table( -# src_pl, -# "CREATE TABLE {table_name} (id integer PRIMARY KEY, val bool);", -# "tbl_x", -# ) -# create_source_table( -# src_pl, -# "CREATE TABLE {table_name} (id integer PRIMARY KEY, val varchar);", -# "tbl_y", -# ) -# -# # initialize schema replication by omitting the `table_names` argument -# slot_name = "example_slot" -# init_replication( # initializing schema replication requires the Postgres user to be a superuser -# slot_name=slot_name, -# schema=src_pl.dataset_name, -# reset=True, -# ) -# -# # create a resource that generates items for each change in the schema's tables -# changes = replication_resource(slot_name) -# -# # insert records in source tables and propagate changes to destination -# change_source_table( -# src_pl, "INSERT INTO {table_name} VALUES (1, true), (2, false);", "tbl_x" -# ) -# change_source_table(src_pl, "INSERT INTO {table_name} VALUES (1, 'foo');", "tbl_y") -# dest_pl.run(changes) -# show_destination_table(dest_pl, "tbl_x") -# show_destination_table(dest_pl, "tbl_y") -# -# # tables added to the schema later are also included in the replication -# create_source_table( -# src_pl, "CREATE TABLE {table_name} (id integer PRIMARY KEY, val date);", "tbl_z" -# ) -# change_source_table( -# src_pl, "INSERT INTO {table_name} VALUES (1, '2023-03-18');", "tbl_z" -# ) -# dest_pl.run(changes) -# show_destination_table(dest_pl, "tbl_z") - - def replicate_with_column_selection() -> None: """Sets up replication with column selection. 
@@ -212,9 +161,9 @@ def replicate_with_column_selection() -> None: changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, - table_names=["tbl_x", "tbl_y"], + table_names=("tbl_x", "tbl_y"), included_columns={ - "tbl_x": ["c1", "c2"] + "tbl_x": ("c1", "c2") }, # columns not specified here are excluded from generated data items ) @@ -288,5 +237,4 @@ def show_destination_table( if __name__ == "__main__": replicate_single_table() # replicate_with_initial_load() - # replicate_entire_schema() - # replicate_with_column_selection() + # replicate_with_column_selection() \ No newline at end of file diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 35ec12c90..dc22853dd 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -658,65 +658,6 @@ def test_table_schema_change( ) -# def test_replicate_schema(src_config: Tuple[dlt.Pipeline, str]) -> None: -# if get_pg_version() < 150000: -# pytest.skip("incompatible Postgres server version") -# if not is_super_user(src_config[0].sql_client): -# pytest.skip("Postgres user needs to be superuser") -# -# @dlt.resource -# def tbl_x(data): -# yield data -# -# @dlt.resource -# def tbl_y(data): -# yield data -# -# @dlt.resource -# def tbl_z(data): -# yield data -# -# src_pl, slot_name = src_config -# -# # create two postgres tables -# src_pl.run( -# [ -# tbl_x({"id_x": 1, "val_x": "foo"}), -# tbl_y({"id_y": 1, "val_y": "foo"}), -# ] -# ) -# -# # initialize replication and create resource -# init_replication( -# slot_name=slot_name, -# schema=src_pl.dataset_name, # we only specify `schema`, not `table_names` -# publish="insert", -# ) -# changes = replication_resource(slot_name) -# -# # change source tables and load to destination -# src_pl.run( -# [ -# tbl_x({"id_x": 2, "val_x": "foo"}), -# tbl_y({"id_y": 2, "val_y": "foo"}), -# ] -# ) -# dest_pl = dlt.pipeline(pipeline_name="dest_pl", dev_mode=True) -# dest_pl.extract(changes) -# assert set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y"} -# -# # introduce new table in source and assert it gets included in the replication -# src_pl.run( -# [ -# tbl_x({"id_x": 3, "val_x": "foo"}), -# tbl_y({"id_y": 3, "val_y": "foo"}), -# tbl_z({"id_z": 1, "val_z": "foo"}), -# ] -# ) -# dest_pl.extract(changes) -# assert set(dest_pl.default_schema.data_table_names()) == {"tbl_x", "tbl_y", "tbl_z"} - - def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: # this test asserts the number of data items yielded by the replication resource # is not affected by `target_batch_size` and the number of replication messages per transaction From 81fdce81a6980a4b115587e891a205bb146c81f3 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 28 Oct 2024 19:38:15 +0100 Subject: [PATCH 46/88] fix: various performance improvements --- sources/pg_legacy_replication/__init__.py | 4 +- sources/pg_legacy_replication/helpers.py | 81 +++++++++++++------ sources/pg_legacy_replication/schema_types.py | 47 +++++++---- sources/pg_legacy_replication_pipeline.py | 2 +- tests/pg_legacy_replication/cases.py | 8 ++ 5 files changed, 96 insertions(+), 46 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 89beb91c4..f90f59875 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -71,9 +71,7 @@ def replication_source( table_names = [table_names] 
@dlt.resource(name=lambda args: args["slot_name"], standalone=True) - def replication_resource( - slot_name: str, - ) -> Iterable[TDataItem]: + def replication_resource(slot_name: str) -> Iterable[TDataItem]: # start where we left off in previous run start_lsn = dlt.current.resource_state().get("last_commit_lsn", 0) if flush_slot: diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 8b76d6494..c858b37c6 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -1,5 +1,6 @@ from collections import defaultdict from dataclasses import dataclass, field +from functools import partial from typing import ( Optional, Dict, @@ -15,6 +16,7 @@ ) import dlt +import hashlib import psycopg2 from dlt.common import logger from dlt.common.pendulum import pendulum @@ -38,7 +40,7 @@ from sqlalchemy import Connection as ConnectionSqla, Engine, event from .exceptions import SqlDatabaseSourceImportError -from .pg_logicaldec_pb2 import Op, RowMessage +from .pg_logicaldec_pb2 import Op, RowMessage, TypeInfo from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val @@ -173,7 +175,7 @@ def create_replication_slot( # type: ignore[return] """Creates a replication slot if it doesn't exist yet.""" try: cur.create_replication_slot(name, output_plugin=output_plugin) - logger.info(f'Successfully created replication slot "{name}".') + logger.info("Successfully created replication slot '%s'", name) result = cur.fetchone() return { "slot_name": result[0], @@ -183,7 +185,7 @@ def create_replication_slot( # type: ignore[return] } except psycopg2.errors.DuplicateObject: # the replication slot already exists logger.info( - f'Replication slot "{name}" cannot be created because it already exists.' + "Replication slot '%s' cannot be created because it already exists", name ) @@ -191,10 +193,10 @@ def drop_replication_slot(name: str, cur: ReplicationCursor) -> None: """Drops a replication slot if it exists.""" try: cur.drop_replication_slot(name) - logger.info(f'Successfully dropped replication slot "{name}".') + logger.info("Successfully dropped replication slot '%s'", name) except psycopg2.errors.UndefinedObject: # the replication slot does not exist logger.info( - f'Replication slot "{name}" cannot be dropped because it does not exist.' 
+ "Replication slot '%s' cannot be dropped because it does not exist", name ) @@ -318,7 +320,8 @@ def __init__( # maps table names to list of data items self.data_items: Dict[str, List[TDataItem]] = defaultdict(list) # maps table name to table schema - self.last_table_schema: Dict[str, TTableSchema] = dict() + self.last_table_schema: Dict[str, TTableSchema] = {} + self.last_seen_schemas: Dict[str, int] = {} self.last_commit_ts: pendulum.DateTime self.last_commit_lsn: int @@ -352,7 +355,7 @@ def process_msg(self, msg: ReplicationMessage) -> None: raise except Exception: logger.error( - "A fatal error occured while processing a message: %s", row_msg + "A fatal error occurred while processing a message: %s", row_msg ) raise @@ -386,13 +389,35 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: self.data_items[table_name].append(data_item) def _get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: - last_table_schema = self.last_table_schema.get(table_name) - table_schema = infer_table_schema(msg, self.included_columns.get(table_name)) - if last_table_schema is None: - self.last_table_schema[table_name] = table_schema - elif last_table_schema != table_schema: - raise StopReplication # table schema change - return table_schema + current_hash = hash_typeinfo(msg.new_typeinfo) + cached_hash = self.last_seen_schemas.get(table_name) + + # Return cached schema if hash matches + if cached_hash == current_hash: + return self.last_table_schema[table_name] + + # Infer the current schema + inferred_schema = infer_table_schema(msg, self.included_columns.get(table_name)) + cached_schema = self.last_table_schema.get(table_name) + + if cached_schema is None: + # Cache the inferred schema and hash if it is not already cached + self.last_table_schema[table_name] = inferred_schema + self.last_seen_schemas[table_name] = current_hash + elif cached_schema != inferred_schema: + # Raise an exception if there's a schema mismatch + raise StopReplication("Table schema change detected") + + return inferred_schema + + +def hash_typeinfo(new_typeinfo: Sequence[TypeInfo]) -> int: + """Generate a hash for the entire new_typeinfo list by hashing each TypeInfo message.""" + typeinfo_tuple = tuple( + (info.modifier, info.value_optional) for info in new_typeinfo + ) + hash_obj = hashlib.blake2b(repr(typeinfo_tuple).encode(), digest_size=8) + return int(hash_obj.hexdigest(), 16) class TableItems(NamedTuple): @@ -420,29 +445,33 @@ def __iter__(self) -> Iterator[TableItems]: Maintains LSN of last consumed Commit message in object state. Does not advance the slot. 
""" + cur = _get_rep_conn(self.credentials).cursor() + ack_lsn = partial(cur.send_feedback, reply=True, force=True) + cur.start_replication( + slot_name=self.slot_name, start_lsn=self.start_lsn, decode=False + ) + consumer = MessageConsumer( + upto_lsn=self.upto_lsn, + table_qnames=self.table_qnames, + target_batch_size=self.target_batch_size, + included_columns=self.included_columns, + ) try: - cur = _get_rep_conn(self.credentials).cursor() - cur.start_replication( - slot_name=self.slot_name, start_lsn=self.start_lsn, decode=False - ) - consumer = MessageConsumer( - upto_lsn=self.upto_lsn, - table_qnames=self.table_qnames, - target_batch_size=self.target_batch_size, - included_columns=self.included_columns, - ) cur.consume_stream(consumer) except StopReplication: # completed batch or reached `upto_lsn` pass finally: - cur.connection.close() + last_commit_lsn = consumer.last_commit_lsn + ack_lsn(write_lsn=last_commit_lsn) for table, data_items in consumer.data_items.items(): yield TableItems( table, consumer.last_table_schema.get(table), data_items ) # Update state after flush - self.last_commit_lsn = consumer.last_commit_lsn + self.last_commit_lsn = last_commit_lsn self.generated_all = consumer.consumed_all + ack_lsn(flush_lsn=last_commit_lsn) + cur.connection.close() def create_table_dispatch( diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index c376d10fb..bb8688d9a 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -8,6 +8,7 @@ from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import TColumnSchema, TColumnType from dlt.destinations import postgres +from dlt.common import logger from .pg_logicaldec_pb2 import DatumMessage, TypeInfo @@ -37,12 +38,18 @@ 1043: "character varying", 1082: "date", 1083: "time without time zone", + 1114: "timestamp without time zone", 1184: "timestamp with time zone", 1700: "numeric", 3802: "jsonb", } """Maps postgres type OID to type string. 
Only includes types present in PostgresTypeMapper.""" +_MISSING_TYPES: Dict[str, TDataType] = { + "timestamp without time zone": "timestamp", +} +# FIXME Missing types for old postgres versions + _DATUM_RAW_TYPES: Dict[str, TDataType] = { "datum_int32": "bigint", "datum_int64": "bigint", @@ -72,7 +79,7 @@ def _get_precision_and_scale( type_id: int, modifier: str -) -> Optional[Tuple[int, Optional[int]]]: +) -> Optional[Tuple[Optional[int], Optional[int]]]: """Get precision from postgres type attributes and modifiers.""" if type_id in _FIXED_PRECISION_TYPES: return _FIXED_PRECISION_TYPES[type_id] @@ -82,28 +89,29 @@ def _get_precision_and_scale( groups = match.groups() precision = int(groups[0]) scale = int(groups[1]) if len(groups) > 1 else None - return (precision, scale) + return precision, scale - return (None, None) + return None, None # FIXME Hack to get it to work with 0.5.x and 1.x -def _from_destination_type( - db_type: str, precision: Optional[int] = None, scale: Optional[int] = None -) -> TColumnType: - @lru_cache(maxsize=None) - def _from_db_type() -> Callable[[str, Optional[int], Optional[int]], TColumnType]: - try: - from dlt.destinations.impl.postgres.factory import PostgresTypeMapper # type: ignore +@lru_cache(maxsize=None) +def _from_db_type() -> Callable[[str, Optional[int], Optional[int]], TColumnType]: + try: + from dlt.destinations.impl.postgres.factory import PostgresTypeMapper # type: ignore - type_mapper = PostgresTypeMapper(postgres().capabilities()) - return type_mapper.from_destination_type # type: ignore[no-any-return] - except ImportError: - from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper + type_mapper = PostgresTypeMapper(postgres().capabilities()) + return type_mapper.from_destination_type # type: ignore[no-any-return] + except ImportError: + from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper - type_mapper = PostgresTypeMapper(postgres().capabilities()) - return type_mapper.from_db_type # type: ignore[no-any-return] + type_mapper = PostgresTypeMapper(postgres().capabilities()) + return type_mapper.from_db_type # type: ignore[no-any-return] + +def _from_destination_type( + db_type: str, precision: Optional[int] = None, scale: Optional[int] = None +) -> TColumnType: return _from_db_type()(db_type, precision, scale) @@ -113,6 +121,13 @@ def _to_dlt_column_type(type_id: int, modifier: str) -> TColumnType: Type OIDs not in _PG_TYPES mapping default to "text" type. 
""" pg_type = _PG_TYPES.get(type_id) + if pg_type in _MISSING_TYPES: + return {"data_type": _MISSING_TYPES[pg_type]} + if pg_type is None: + logger.warning( + "No type found for type_id '%s' and modifier '%s'", type_id, modifier + ) + precision, scale = _get_precision_and_scale(type_id, modifier) return _from_destination_type(pg_type, precision, scale) diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index 334f7c075..1e0466ba2 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -237,4 +237,4 @@ def show_destination_table( if __name__ == "__main__": replicate_single_table() # replicate_with_initial_load() - # replicate_with_column_selection() \ No newline at end of file + # replicate_with_column_selection() diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index d5fde73ba..292fa92f3 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -178,12 +178,14 @@ "columnType": 1083, "datumInt64": 48405176451, }, + {"columnName": "alumnized_at", "columnType": 1114}, ], "newTypeinfo": [ {"modifier": "timestamp with time zone", "valueOptional": False}, {"modifier": "jsonb", "valueOptional": False}, {"modifier": "date", "valueOptional": False}, {"modifier": "time without time zone", "valueOptional": False}, + {"modifier": "timestamp without time zone", "valueOptional": True}, ], }, { @@ -221,6 +223,7 @@ "val_y": False, }, { + "alumnized_at": None, "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), "col9": { "complex": [1, 2, 3, "a"], @@ -265,6 +268,11 @@ { "name": "items", "columns": { + "alumnized_at": { + "data_type": "timestamp", + "name": "alumnized_at", + "nullable": True, + }, "col4": {"data_type": "timestamp", "name": "col4", "nullable": False}, "col9": {"data_type": "complex", "name": "col9", "nullable": False}, "col10": {"data_type": "date", "name": "col10", "nullable": False}, From 8fbfc6269a3f1efe74a34b9d0749a4422c01fe7c Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 28 Oct 2024 21:46:07 +0100 Subject: [PATCH 47/88] fix: minor corrections to handle old versions of postgres --- sources/pg_legacy_replication/helpers.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index c858b37c6..d56ecf77e 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -30,7 +30,7 @@ from dlt.extract import DltSource, DltResource from dlt.extract.items import DataItemWithMeta from dlt.sources.credentials import ConnectionStringCredentials -from psycopg2.extensions import connection as ConnectionExt +from psycopg2.extensions import cursor, connection as ConnectionExt from psycopg2.extras import ( LogicalReplicationConnection, ReplicationCursor, @@ -169,6 +169,11 @@ def cleanup_snapshot_resources(snapshots: DltSource) -> None: engine.dispose() +def get_pg_version(cur: cursor) -> int: + """Returns Postgres server version as int.""" + return cur.connection.server_version + + def create_replication_slot( # type: ignore[return] name: str, cur: ReplicationCursor, output_plugin: str = "decoderbufs" ) -> Optional[Dict[str, str]]: @@ -211,8 +216,9 @@ def get_max_lsn( Raises error if the replication slot or publication does not exist. 
""" cur = _get_conn(credentials).cursor() + lsn_field = "location" if get_pg_version(cur) < 100000 else "lsn" cur.execute( - "SELECT MAX(lsn) - '0/0' AS max_lsn " # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) + f"SELECT MAX({lsn_field} - '0/0') AS max_lsn " # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) f"FROM pg_logical_slot_peek_binary_changes('{slot_name}', NULL, NULL);" ) lsn: int = cur.fetchone()[0] @@ -239,9 +245,10 @@ def advance_slot( """ if upto_lsn != 0: cur = _get_conn(credentials).cursor() - cur.execute( - f"SELECT * FROM pg_replication_slot_advance('{slot_name}', '{lsn_int_to_hex(upto_lsn)}');" - ) + if get_pg_version(cur) > 100000: + cur.execute( + f"SELECT * FROM pg_replication_slot_advance('{slot_name}', '{lsn_int_to_hex(upto_lsn)}');" + ) cur.connection.close() From fd4638b56833c3ed3a9c14482ad263610305ba7b Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 29 Oct 2024 13:50:53 +0100 Subject: [PATCH 48/88] fix: small type corrections for pg9.6 --- sources/pg_legacy_replication/README.md | 2 +- sources/pg_legacy_replication/__init__.py | 12 +++++++++++- sources/pg_legacy_replication/helpers.py | 4 +++- sources/pg_legacy_replication/requirements.txt | 2 +- sources/pg_legacy_replication/schema_types.py | 3 +++ tests/pg_legacy_replication/cases.py | 14 +++++++------- 6 files changed, 26 insertions(+), 11 deletions(-) diff --git a/sources/pg_legacy_replication/README.md b/sources/pg_legacy_replication/README.md index f34fcd4d6..d661854ef 100644 --- a/sources/pg_legacy_replication/README.md +++ b/sources/pg_legacy_replication/README.md @@ -76,4 +76,4 @@ sources.pg_replication.credentials="postgresql://loader:password@host.rds.amazon ```bash dlt pipeline pg_replication_pipeline show - ``` \ No newline at end of file + ``` diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index f90f59875..790111f1a 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -8,7 +8,14 @@ from dlt.extract.items import TDataItem from dlt.sources.credentials import ConnectionStringCredentials -from .helpers import advance_slot, get_max_lsn, ItemGenerator, create_table_dispatch +from .helpers import ( + advance_slot, + get_max_lsn, + ItemGenerator, + create_table_dispatch, + init_replication, + cleanup_snapshot_resources, +) @dlt.source @@ -112,3 +119,6 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: data_from=wal_reader, name=table, ) + + +__all__ = ["cleanup_snapshot_resources", "init_replication", "replication_source"] diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index d56ecf77e..63fcd029f 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -49,7 +49,7 @@ def init_replication( slot_name: str, schema: str, - table_names: Union[str, Sequence[str]], + table_names: Optional[Union[str, Sequence[str]]] = None, credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = False, included_columns: Optional[Dict[str, TColumnNames]] = None, @@ -115,6 +115,8 @@ def init_replication( rep_conn.close() return + assert table_names + # Ensure `sqlalchemy` and `sql_table` are available _import_sql_table_resource() engine = _configure_engine(credentials, rep_conn) diff --git a/sources/pg_legacy_replication/requirements.txt b/sources/pg_legacy_replication/requirements.txt index 
1ad8e04f0..98459d020 100644 --- a/sources/pg_legacy_replication/requirements.txt +++ b/sources/pg_legacy_replication/requirements.txt @@ -1,4 +1,4 @@ dlt>=0.5.12 psycopg2-binary>=2.9.9 protobuf>=5 -sqlalchemy>=1.4 \ No newline at end of file +sqlalchemy>=1.4 diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index bb8688d9a..c97556bd3 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -34,6 +34,7 @@ 20: "bigint", 21: "smallint", 23: "integer", + 700: "real", 701: "double precision", 1043: "character varying", 1082: "date", @@ -46,6 +47,7 @@ """Maps postgres type OID to type string. Only includes types present in PostgresTypeMapper.""" _MISSING_TYPES: Dict[str, TDataType] = { + "real": "double", "timestamp without time zone": "timestamp", } # FIXME Missing types for old postgres versions @@ -65,6 +67,7 @@ 21: (16, None), # smallint 23: (32, None), # integer 20: (64, None), # bigint + 700: (32, None), # real } """Dict for fixed precision types""" diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 292fa92f3..b3073eb39 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -178,7 +178,8 @@ "columnType": 1083, "datumInt64": 48405176451, }, - {"columnName": "alumnized_at", "columnType": 1114}, + {"columnName": "col12", "columnType": 1114}, + {"columnName": "col13", "columnType": 700}, ], "newTypeinfo": [ {"modifier": "timestamp with time zone", "valueOptional": False}, @@ -186,6 +187,7 @@ {"modifier": "date", "valueOptional": False}, {"modifier": "time without time zone", "valueOptional": False}, {"modifier": "timestamp without time zone", "valueOptional": True}, + {"modifier": "real", "valueOptional": True}, ], }, { @@ -223,7 +225,6 @@ "val_y": False, }, { - "alumnized_at": None, "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), "col9": { "complex": [1, 2, 3, "a"], @@ -234,6 +235,8 @@ }, "col10": pendulum.parse("2023-02-27", strict=False).date(), "col11": pendulum.parse("13:26:45.176451", strict=False).time(), + "col12": None, + "col13": None, }, { "id_x": 1, @@ -268,15 +271,12 @@ { "name": "items", "columns": { - "alumnized_at": { - "data_type": "timestamp", - "name": "alumnized_at", - "nullable": True, - }, "col4": {"data_type": "timestamp", "name": "col4", "nullable": False}, "col9": {"data_type": "complex", "name": "col9", "nullable": False}, "col10": {"data_type": "date", "name": "col10", "nullable": False}, "col11": {"data_type": "time", "name": "col11", "nullable": False}, + "col12": {"data_type": "timestamp", "name": "col12", "nullable": True}, + "col13": {"data_type": "double", "name": "col13", "nullable": True}, "lsn": {"data_type": "bigint", "nullable": True}, "deleted_ts": {"data_type": "timestamp", "nullable": True}, }, From 526eff3ca84173a96b940283d3a73adae5013357 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 29 Oct 2024 19:36:28 +0100 Subject: [PATCH 49/88] fix: exposing table options for later arrow support --- sources/pg_legacy_replication/__init__.py | 10 +- sources/pg_legacy_replication/exceptions.py | 12 +- sources/pg_legacy_replication/helpers.py | 121 +++++++++++------- sources/pg_legacy_replication_pipeline.py | 6 +- .../test_pg_replication.py | 12 +- 5 files changed, 92 insertions(+), 69 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 790111f1a..d86510c39 100644 --- 
a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -3,7 +3,7 @@ from typing import Dict, Sequence, Optional, Iterable, Union import dlt -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns +from dlt.common.schema.typing import TTableSchemaColumns from dlt.extract import DltResource from dlt.extract.items import TDataItem from dlt.sources.credentials import ConnectionStringCredentials @@ -15,6 +15,7 @@ create_table_dispatch, init_replication, cleanup_snapshot_resources, + ReplicationOptions, ) @@ -24,7 +25,7 @@ def replication_source( schema: str, table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, - included_columns: Optional[Dict[str, TColumnNames]] = None, + table_options: Optional[Dict[str, ReplicationOptions]] = None, column_hints: Optional[Dict[str, TTableSchemaColumns]] = None, target_batch_size: int = 1000, flush_slot: bool = True, @@ -74,8 +75,7 @@ def replication_source( Yields: Data items for changes published in the publication. """ - if isinstance(table_names, str): - table_names = [table_names] + table_names = [table_names] if isinstance(table_names, str) else table_names or [] @dlt.resource(name=lambda args: args["slot_name"], standalone=True) def replication_resource(slot_name: str) -> Iterable[TDataItem]: @@ -100,7 +100,7 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: upto_lsn=upto_lsn, start_lsn=start_lsn, target_batch_size=target_batch_size, - included_columns=included_columns, + table_options=table_options, ) yield from gen if gen.generated_all: diff --git a/sources/pg_legacy_replication/exceptions.py b/sources/pg_legacy_replication/exceptions.py index ea850999d..99e3db420 100644 --- a/sources/pg_legacy_replication/exceptions.py +++ b/sources/pg_legacy_replication/exceptions.py @@ -1,6 +1,6 @@ -class SqlDatabaseSourceImportError(Exception): - def __init__(self) -> None: - super().__init__( - "Could not import `sql_database` source. Run `dlt init sql_database `" - " to download the source code." - ) +# class SqlDatabaseSourceImportError(Exception): +# def __init__(self) -> None: +# super().__init__( +# "Could not import `sql_database` source. Run `dlt init sql_database `" +# " to download the source code." 
+# ) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 63fcd029f..07f8550bb 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -1,3 +1,4 @@ +import hashlib from collections import defaultdict from dataclasses import dataclass, field from functools import partial @@ -13,18 +14,14 @@ Iterable, Callable, NamedTuple, + TypedDict, ) import dlt -import hashlib import psycopg2 from dlt.common import logger from dlt.common.pendulum import pendulum -from dlt.common.schema.typing import ( - TColumnNames, - TTableSchema, - TTableSchemaColumns, -) +from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns from dlt.common.schema.utils import merge_column from dlt.common.typing import TDataItem from dlt.extract import DltSource, DltResource @@ -37,13 +34,65 @@ ReplicationMessage, StopReplication, ) -from sqlalchemy import Connection as ConnectionSqla, Engine, event -from .exceptions import SqlDatabaseSourceImportError +# Favoring 1.x over 0.5.x imports +try: + from dlt.common.libs.sql_alchemy import ( # type: ignore[attr-defined] + Connection as ConnectionSqla, + Engine, + event, + Table, + MetaData, + ) +except ImportError: + from sqlalchemy import ( + Connection as ConnectionSqla, + Engine, + event, + Table, + MetaData, + ) + +try: + from dlt.sources.sql_database import ( # type: ignore[import-not-found] + sql_table, + engine_from_credentials, + TQueryAdapter, + TTypeAdapter, + ReflectionLevel, + TableBackend, + ) +except ImportError: + from ..sql_database import ( # type: ignore[import-untyped] + sql_table, + engine_from_credentials, + TQueryAdapter, + TTypeAdapter, + ReflectionLevel, + TableBackend, + ) + from .pg_logicaldec_pb2 import Op, RowMessage, TypeInfo from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val +class ReplicationOptions(TypedDict, total=False): + included_columns: Optional[Sequence[str]] + + +class SqlTableOptions(ReplicationOptions, total=False): + metadata: Optional[MetaData] + chunk_size: Optional[int] + backend: Optional[TableBackend] + detect_precision_hints: Optional[bool] + reflection_level: Optional[ReflectionLevel] + defer_table_reflect: Optional[bool] + table_adapter_callback: Optional[Callable[[Table], None]] + backend_kwargs: Optional[Dict[str, Any]] + type_adapter_callback: Optional[TTypeAdapter] + query_adapter_callback: Optional[TQueryAdapter] + + @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) @dlt.source def init_replication( @@ -52,7 +101,7 @@ def init_replication( table_names: Optional[Union[str, Sequence[str]]] = None, credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = False, - included_columns: Optional[Dict[str, TColumnNames]] = None, + table_options: Optional[Dict[str, SqlTableOptions]] = None, reset: bool = False, ) -> Iterable[DltResource]: """Initializes replication for one, several, or all tables within a schema. 
@@ -115,10 +164,8 @@ def init_replication( rep_conn.close() return - assert table_names + assert table_names is not None - # Ensure `sqlalchemy` and `sql_table` are available - _import_sql_table_resource() engine = _configure_engine(credentials, rep_conn) @event.listens_for(engine, "begin") @@ -133,16 +180,13 @@ def on_begin(conn: ConnectionSqla) -> None: cur.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ") cur.execute(f"SET TRANSACTION SNAPSHOT '{slot['snapshot_name']}'") - if isinstance(table_names, str): - table_names = [table_names] + table_names = [table_names] if isinstance(table_names, str) else table_names or [] for table in table_names: - yield sql_table( # type: ignore[name-defined] - credentials=engine, - table=table, - schema=schema, - included_columns=included_columns.get(table) if included_columns else None, + table_args = ( + table_options[table] if table_options and table in table_options else {} ) + yield sql_table(credentials=engine, table=table, schema=schema, **table_args) def _configure_engine( @@ -152,7 +196,7 @@ def _configure_engine( Configures the SQLAlchemy engine. Also attaches the replication connection in order to prevent it being garbage collected and closed. """ - engine: Engine = engine_from_credentials(credentials, may_dispose_after_use=False) # type: ignore[name-defined] + engine: Engine = engine_from_credentials(credentials, may_dispose_after_use=False) engine.execution_options(stream_results=True, max_row_buffer=2 * 50000) setattr(engine, "rep_conn", rep_conn) # noqa @@ -254,24 +298,6 @@ def advance_slot( cur.connection.close() -def _import_sql_table_resource() -> None: - """Imports external `sql_table` resource from `sql_database` source. - - Raises error if `sql_database` source is not available. - """ - global sql_table, engine_from_credentials - try: - from ..sql_database import sql_table, engine_from_credentials # type: ignore[import-untyped] - except ImportError: - try: - from dlt.sources.sql_database import sql_table, engine_from_credentials # type: ignore[import-not-found] - except ImportError: - try: - from sql_database import sql_table, engine_from_credentials - except ImportError as e: - raise SqlDatabaseSourceImportError from e - - def _get_conn( credentials: ConnectionStringCredentials, connection_factory: Optional[Any] = None, @@ -311,19 +337,16 @@ def __init__( upto_lsn: int, table_qnames: Set[str], target_batch_size: int = 1000, - included_columns: Optional[Dict[str, TColumnNames]] = None, + table_options: Optional[Dict[str, ReplicationOptions]] = None, ) -> None: self.upto_lsn = upto_lsn self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.included_columns = ( - { - table: {s for s in ([cols] if isinstance(cols, str) else cols)} - for table, cols in included_columns.items() - } - if included_columns - else {} - ) + self.included_columns = { + table: set(options["included_columns"]) + for table, options in (table_options or {}).items() + if options.get("included_columns") + } self.consumed_all: bool = False # maps table names to list of data items @@ -443,7 +466,7 @@ class ItemGenerator: upto_lsn: int start_lsn: int = 0 target_batch_size: int = 1000 - included_columns: Optional[Dict[str, TColumnNames]] = None + table_options: Optional[Dict[str, ReplicationOptions]] = None last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -463,7 +486,7 @@ def __iter__(self) -> Iterator[TableItems]: upto_lsn=self.upto_lsn, table_qnames=self.table_qnames, 
target_batch_size=self.target_batch_size, - included_columns=self.included_columns, + table_options=self.table_options, ) try: cur.consume_stream(consumer) diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index 1e0466ba2..eb5402e6c 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -3,7 +3,7 @@ from dlt.destinations.impl.postgres.configuration import PostgresCredentials from pg_legacy_replication import replication_source -from pg_legacy_replication.helpers import init_replication +from pg_legacy_replication.helpers import init_replication, ReplicationOptions PG_CREDS = dlt.secrets.get("sources.pg_replication.credentials", PostgresCredentials) @@ -162,8 +162,8 @@ def replicate_with_column_selection() -> None: slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - included_columns={ - "tbl_x": ("c1", "c2") + table_options={ + "tbl_x": ReplicationOptions(included_columns=["c1", "c2"]) }, # columns not specified here are excluded from generated data items ) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index dc22853dd..0401fae37 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -1,5 +1,5 @@ from copy import deepcopy -from typing import Dict, Sequence, Tuple +from typing import Dict, Tuple import dlt import pytest @@ -444,9 +444,9 @@ def tbl_z(data): ) # initialize replication and create resources - included_columns: Dict[str, Sequence[str]] = { - "tbl_x": ("id_x", "val_x"), - "tbl_y": ("id_y", "val_y"), + options = { + "tbl_x": {"included_columns": ["id_x", "val_x"]}, + "tbl_y": {"included_columns": ["id_y", "val_y"]}, # tbl_z is not specified, hence all columns should be included } snapshots = init_replication( @@ -454,13 +454,13 @@ def tbl_z(data): schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), take_snapshots=init_load, - included_columns=included_columns, + table_options=options, ) changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), - included_columns=included_columns, + table_options=options, ) # update three postgres tables From 2f5ad1562bb01c6b10e6710df168553840b959eb Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 30 Oct 2024 18:14:47 +0100 Subject: [PATCH 50/88] wip: saving work for arrow --- sources/pg_legacy_replication/__init__.py | 5 +- sources/pg_legacy_replication/helpers.py | 109 ++++++++++-------- sources/pg_legacy_replication/schema_types.py | 46 ++++---- sources/pg_legacy_replication_pipeline.py | 4 +- tests/pg_legacy_replication/cases.py | 4 + tests/pg_legacy_replication/test_helpers.py | 2 +- .../test_pg_replication.py | 16 ++- 7 files changed, 110 insertions(+), 76 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index d86510c39..2f682b2f6 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -15,7 +15,7 @@ create_table_dispatch, init_replication, cleanup_snapshot_resources, - ReplicationOptions, + SqlTableOptions, ) @@ -25,7 +25,7 @@ def replication_source( schema: str, table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, - table_options: Optional[Dict[str, ReplicationOptions]] = None, + table_options: Optional[Dict[str, SqlTableOptions]] 
= None, column_hints: Optional[Dict[str, TTableSchemaColumns]] = None, target_batch_size: int = 1000, flush_slot: bool = True, @@ -115,6 +115,7 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: create_table_dispatch( table=table, column_hints=column_hints.get(table) if column_hints else None, + table_options=table_options.get(table) if table_options else None, ), data_from=wal_reader, name=table, diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 07f8550bb..a5e3222c9 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -37,21 +37,10 @@ # Favoring 1.x over 0.5.x imports try: - from dlt.common.libs.sql_alchemy import ( # type: ignore[attr-defined] - Connection as ConnectionSqla, - Engine, - event, - Table, - MetaData, - ) + from dlt.common.libs.sql_alchemy import Engine, Table, MetaData # type: ignore[attr-defined] except ImportError: - from sqlalchemy import ( - Connection as ConnectionSqla, - Engine, - event, - Table, - MetaData, - ) + from sqlalchemy import Engine, Table, MetaData +from sqlalchemy import Connection as ConnectionSqla, event try: from dlt.sources.sql_database import ( # type: ignore[import-not-found] @@ -61,6 +50,7 @@ TTypeAdapter, ReflectionLevel, TableBackend, + arrow_helpers as arrow, ) except ImportError: from ..sql_database import ( # type: ignore[import-untyped] @@ -70,17 +60,17 @@ TTypeAdapter, ReflectionLevel, TableBackend, + arrow_helpers as arrow, ) from .pg_logicaldec_pb2 import Op, RowMessage, TypeInfo from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val -class ReplicationOptions(TypedDict, total=False): +class SqlTableOptions(TypedDict, total=False): + # Used by both sql_table and replication resources included_columns: Optional[Sequence[str]] - - -class SqlTableOptions(ReplicationOptions, total=False): + # Used only by sql_table resource metadata: Optional[MetaData] chunk_size: Optional[int] backend: Optional[TableBackend] @@ -337,7 +327,7 @@ def __init__( upto_lsn: int, table_qnames: Set[str], target_batch_size: int = 1000, - table_options: Optional[Dict[str, ReplicationOptions]] = None, + table_options: Optional[Dict[str, SqlTableOptions]] = None, ) -> None: self.upto_lsn = upto_lsn self.table_qnames = table_qnames @@ -353,7 +343,8 @@ def __init__( self.data_items: Dict[str, List[TDataItem]] = defaultdict(list) # maps table name to table schema self.last_table_schema: Dict[str, TTableSchema] = {} - self.last_seen_schemas: Dict[str, int] = {} + # maps table names to new_typeinfo hashes + self.last_table_hashes: Dict[str, int] = {} self.last_commit_ts: pendulum.DateTime self.last_commit_lsn: int @@ -410,37 +401,38 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: if msg.table not in self.table_qnames: return table_name = msg.table.split(".")[1] - if msg.op == Op.DELETE: - data_item = gen_data_item(msg) - else: - table_schema = self._get_table_schema(msg, table_name) - data_item = gen_data_item( - msg, self.included_columns.get(table_name), table_schema["columns"] - ) + table_schema = self.get_table_schema(msg, table_name) + data_item = gen_data_item( + msg, table_schema["columns"], self.included_columns.get(table_name) + ) data_item["lsn"] = lsn self.data_items[table_name].append(data_item) - def _get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: - current_hash = hash_typeinfo(msg.new_typeinfo) - cached_hash = self.last_seen_schemas.get(table_name) + def 
get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: + cached = self.last_table_schema.get(table_name) + included_columns = self.included_columns.get(table_name) + + # If DELETE try to fetch an already cached schema or infer a less precise one + if msg.op == Op.DELETE: + if cached: + return cached + return infer_table_schema(msg, included_columns) # Return cached schema if hash matches - if cached_hash == current_hash: + current_hash = hash_typeinfo(msg.new_typeinfo) + if current_hash == self.last_table_hashes.get(table_name): return self.last_table_schema[table_name] - # Infer the current schema - inferred_schema = infer_table_schema(msg, self.included_columns.get(table_name)) - cached_schema = self.last_table_schema.get(table_name) - - if cached_schema is None: + inferred = infer_table_schema(msg, included_columns) + if cached is None: # Cache the inferred schema and hash if it is not already cached - self.last_table_schema[table_name] = inferred_schema - self.last_seen_schemas[table_name] = current_hash - elif cached_schema != inferred_schema: + self.last_table_schema[table_name] = inferred + self.last_table_hashes[table_name] = current_hash + elif cached != inferred: # Raise an exception if there's a schema mismatch raise StopReplication("Table schema change detected") - return inferred_schema + return inferred def hash_typeinfo(new_typeinfo: Sequence[TypeInfo]) -> int: @@ -466,7 +458,7 @@ class ItemGenerator: upto_lsn: int start_lsn: int = 0 target_batch_size: int = 1000 - table_options: Optional[Dict[str, ReplicationOptions]] = None + table_options: Optional[Dict[str, SqlTableOptions]] = None last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -507,7 +499,9 @@ def __iter__(self) -> Iterator[TableItems]: def create_table_dispatch( - table: str, column_hints: Optional[TTableSchemaColumns] = None + table: str, + column_hints: Optional[TTableSchemaColumns] = None, + table_options: Optional[SqlTableOptions] = None, ) -> Callable[[TableItems], Iterable[DataItemWithMeta]]: """Creates a dispatch handler that processes data items based on a specified table and optional column hints.""" @@ -526,7 +520,23 @@ def handle(table_items: TableItems) -> Iterable[DataItemWithMeta]: dlt.mark.make_hints(table_name=table, columns=columns), create_table_variant=True, ) - yield dlt.mark.with_table_name(table_items.items, table) + if table_options: + backend = table_options.get("backend") + backend_kwargs = table_options.get("backend_kwargs", {}) + if backend == "pyarrow": + ordered_keys = list(columns.keys()) + rows = [ + tuple(data_item.get(column, None) for column in ordered_keys) + for data_item in table_items.items + ] + arrow_table = arrow.row_tuples_to_arrow( + rows, columns, tz=backend_kwargs.get("tz", "UTC") + ) + yield dlt.mark.with_table_name(arrow_table, table) + else: + yield dlt.mark.with_table_name(table_items.items, table) + else: + yield dlt.mark.with_table_name(table_items.items, table) return handle @@ -535,9 +545,16 @@ def infer_table_schema( msg: RowMessage, included_columns: Optional[Set[str]] = None ) -> TTableSchema: """Infers the table schema from the replication message and optional hints""" + # Choose the correct source based on operation type + is_change = msg.op != Op.DELETE + tuples = msg.new_tuple if is_change else msg.old_tuple + + # Filter and map columns, conditionally using `new_typeinfo` when available columns: TTableSchemaColumns = { - col.column_name: _to_dlt_column_schema(col, col_info) - for col, col_info 
in zip(msg.new_tuple, msg.new_typeinfo) + col.column_name: _to_dlt_column_schema( + col, msg.new_typeinfo[i] if is_change else None + ) + for i, col in enumerate(tuples) if not included_columns or col.column_name in included_columns } @@ -553,8 +570,8 @@ def infer_table_schema( def gen_data_item( msg: RowMessage, + column_schema: TTableSchemaColumns, included_columns: Optional[Set[str]] = None, - column_schema: Optional[TTableSchemaColumns] = None, ) -> TDataItem: """Generates data item from a row message and corresponding metadata.""" data_item: TDataItem = {} diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index c97556bd3..8796481e7 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -81,18 +81,21 @@ def _get_precision_and_scale( - type_id: int, modifier: str + type_id: int, modifier: Optional[str] ) -> Optional[Tuple[Optional[int], Optional[int]]]: """Get precision from postgres type attributes and modifiers.""" if type_id in _FIXED_PRECISION_TYPES: return _FIXED_PRECISION_TYPES[type_id] - if pattern := _VARYING_PRECISION_PATTERNS.get(type_id): - if match := re.search(pattern, modifier): - groups = match.groups() - precision = int(groups[0]) - scale = int(groups[1]) if len(groups) > 1 else None - return precision, scale + # If modifier or pattern is missing, return defaults + if not modifier or (pattern := _VARYING_PRECISION_PATTERNS.get(type_id)) is None: + return None, None + + if match := re.search(pattern, modifier): + groups = match.groups() + precision = int(groups[0]) + scale = int(groups[1]) if len(groups) > 1 else None + return precision, scale return None, None @@ -112,13 +115,7 @@ def _from_db_type() -> Callable[[str, Optional[int], Optional[int]], TColumnType return type_mapper.from_db_type # type: ignore[no-any-return] -def _from_destination_type( - db_type: str, precision: Optional[int] = None, scale: Optional[int] = None -) -> TColumnType: - return _from_db_type()(db_type, precision, scale) - - -def _to_dlt_column_type(type_id: int, modifier: str) -> TColumnType: +def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: """Converts postgres type OID to dlt column type. Type OIDs not in _PG_TYPES mapping default to "text" type. 
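The hunks above and below make the postgres type modifier optional. As an illustration of the precision/scale parsing that `_get_precision_and_scale` performs for a varying-precision type, here is a standalone sketch assuming a `numeric(p,s)` style modifier (the actual regexes are kept per type OID in `_VARYING_PRECISION_PATTERNS`):

    import re
    from typing import Optional, Tuple

    def parse_numeric_modifier(
        modifier: Optional[str],
    ) -> Tuple[Optional[int], Optional[int]]:
        # Mirrors the early return added above when no modifier is available.
        if not modifier:
            return None, None
        match = re.search(r"\((\d+),(\d+)\)", modifier)  # assumed pattern for numeric
        if not match:
            return None, None
        return int(match.group(1)), int(match.group(2))

    assert parse_numeric_modifier("numeric(6,2)") == (6, 2)
    assert parse_numeric_modifier(None) == (None, None)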
@@ -132,17 +129,26 @@ def _to_dlt_column_type(type_id: int, modifier: str) -> TColumnType: ) precision, scale = _get_precision_and_scale(type_id, modifier) - return _from_destination_type(pg_type, precision, scale) + return _from_db_type()(pg_type, precision, scale) -def _to_dlt_column_schema(datum: DatumMessage, type_info: TypeInfo) -> TColumnSchema: +def _to_dlt_column_schema( + datum: DatumMessage, type_info: Optional[TypeInfo] +) -> TColumnSchema: """Converts decoderbuf's datum value/typeinfo to dlt column schema.""" - return { + column_schema: TColumnSchema = { "name": datum.column_name, - "nullable": type_info.value_optional, - **_to_dlt_column_type(datum.column_type, type_info.modifier), + **_to_dlt_column_type( + datum.column_type, type_info.modifier if type_info else None + ), } + # Set nullable attribute if type_info is available + if type_info: + column_schema["nullable"] = type_info.value_optional + + return column_schema + def _epoch_micros_to_datetime(microseconds_since_1970: int) -> pendulum.DateTime: return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000) @@ -169,7 +175,7 @@ def _to_dlt_val( ) -> Any: """Converts decoderbuf's datum value into dlt-compatible data value.""" if isinstance(data_type, int): - col_type: TColumnType = _from_destination_type(_PG_TYPES[data_type]) + col_type: TColumnType = _from_db_type()(_PG_TYPES[data_type]) # type: ignore[call-arg] data_type = col_type["data_type"] datum = val.WhichOneof("datum") diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index eb5402e6c..e2d580f19 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -3,7 +3,7 @@ from dlt.destinations.impl.postgres.configuration import PostgresCredentials from pg_legacy_replication import replication_source -from pg_legacy_replication.helpers import init_replication, ReplicationOptions +from pg_legacy_replication.helpers import init_replication PG_CREDS = dlt.secrets.get("sources.pg_replication.credentials", PostgresCredentials) @@ -163,7 +163,7 @@ def replicate_with_column_selection() -> None: schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), table_options={ - "tbl_x": ReplicationOptions(included_columns=["c1", "c2"]) + "tbl_x": {"included_columns": ["c1", "c2"]} }, # columns not specified here are excluded from generated data items ) diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index b3073eb39..a6dee4e2b 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -284,6 +284,10 @@ { "name": "tbl_x", "columns": { + "id_x": {"data_type": "bigint", "name": "id_x", "precision": 64}, + "val_x": {"data_type": "text", "name": "val_x"}, + "_dlt_id": {"data_type": "text", "name": "_dlt_id"}, + "_dlt_load_id": {"data_type": "text", "name": "_dlt_load_id"}, "lsn": {"data_type": "bigint", "nullable": True}, "deleted_ts": {"data_type": "timestamp", "nullable": True}, }, diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 3482fc768..03f2aa89f 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -29,4 +29,4 @@ def test_gen_data_item(data, data_item: TDataItem): row_msg = RowMessage() parse_dict(data, row_msg) column_schema = infer_table_schema(row_msg)["columns"] - assert gen_data_item(row_msg, column_schema=column_schema) == data_item + assert gen_data_item(row_msg, column_schema) == 
data_item diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 0401fae37..7c0215216 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -6,11 +6,12 @@ from dlt.common.schema.typing import TTableSchemaColumns from dlt.destinations.job_client_impl import SqlJobClientBase -from sources.pg_legacy_replication import replication_source -from sources.pg_legacy_replication.helpers import ( +from sources.pg_legacy_replication import ( init_replication, cleanup_snapshot_resources, + replication_source, ) +from sources.pg_legacy_replication.helpers import SqlTableOptions, TableBackend from tests.utils import ( ALL_DESTINATIONS, assert_load_info, @@ -26,8 +27,9 @@ @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("backend", ["sqlalchemy"]) def test_core_functionality( - src_config: Tuple[dlt.Pipeline, str], destination_name: str + src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend ) -> None: @dlt.resource(write_disposition="merge", primary_key="id_x") def tbl_x(data): @@ -48,17 +50,21 @@ def tbl_y(data): add_pk(src_pl.sql_client, "tbl_x", "id_x") add_pk(src_pl.sql_client, "tbl_y", "id_y") + table_options = {"tbl_x": {"backend": backend}, "tbl_y": {"backend": backend}} + snapshots = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), take_snapshots=True, + table_options=table_options, ) changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), + table_options=table_options, ) changes.tbl_x.apply_hints( write_disposition="merge", primary_key="id_x", columns=merge_hints @@ -445,8 +451,8 @@ def tbl_z(data): # initialize replication and create resources options = { - "tbl_x": {"included_columns": ["id_x", "val_x"]}, - "tbl_y": {"included_columns": ["id_y", "val_y"]}, + "tbl_x": SqlTableOptions(included_columns=["id_x", "val_x"]), + "tbl_y": SqlTableOptions(included_columns=["id_y", "val_y"]), # tbl_z is not specified, hence all columns should be included } snapshots = init_replication( From 32063e2a3b07c0121807651befc2b8d21a628c15 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 30 Oct 2024 19:18:57 +0100 Subject: [PATCH 51/88] wip: first test with arrow passing --- sources/pg_legacy_replication/helpers.py | 81 +++++++++---------- sources/pg_legacy_replication/schema_types.py | 20 ++--- tests/pg_legacy_replication/cases.py | 24 ++++-- .../test_pg_replication.py | 2 +- 4 files changed, 62 insertions(+), 65 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index a5e3222c9..af1f62ecd 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -409,30 +409,24 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: self.data_items[table_name].append(data_item) def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: - cached = self.last_table_schema.get(table_name) + last_schema = self.last_table_schema.get(table_name) included_columns = self.included_columns.get(table_name) - # If DELETE try to fetch an already cached schema or infer a less precise one - if msg.op == Op.DELETE: - if cached: - return cached - return infer_table_schema(msg, included_columns) - # Return cached schema if hash matches current_hash = hash_typeinfo(msg.new_typeinfo) if current_hash == 
self.last_table_hashes.get(table_name): return self.last_table_schema[table_name] - inferred = infer_table_schema(msg, included_columns) - if cached is None: + new_schema = infer_table_schema(msg, included_columns) + if last_schema is None: # Cache the inferred schema and hash if it is not already cached - self.last_table_schema[table_name] = inferred + self.last_table_schema[table_name] = new_schema self.last_table_hashes[table_name] = current_hash - elif cached != inferred: + elif last_schema != new_schema: # Raise an exception if there's a schema mismatch raise StopReplication("Table schema change detected") - return inferred + return new_schema def hash_typeinfo(new_typeinfo: Sequence[TypeInfo]) -> int: @@ -446,7 +440,7 @@ def hash_typeinfo(new_typeinfo: Sequence[TypeInfo]) -> int: class TableItems(NamedTuple): table: str - schema: Optional[TTableSchema] + schema: TTableSchema items: List[TDataItem] @@ -508,35 +502,35 @@ def create_table_dispatch( def handle(table_items: TableItems) -> Iterable[DataItemWithMeta]: if table_items.table != table: return - if schema := table_items.schema: - columns = schema["columns"] - if column_hints: - for col_name, col_hint in column_hints.items(): - columns[col_name] = merge_column( - columns.get(col_name, {}), col_hint - ) + columns = table_items.schema["columns"] + if column_hints: + for col_name, col_hint in column_hints.items(): + columns[col_name] = merge_column(columns.get(col_name, {}), col_hint) + backend = ( + table_options.get("backend", "sqlalchemy") + if table_options + else "sqlalchemy" + ) + if backend == "sqlalchemy": yield dlt.mark.with_hints( [], dlt.mark.make_hints(table_name=table, columns=columns), create_table_variant=True, ) - if table_options: - backend = table_options.get("backend") + yield dlt.mark.with_table_name(table_items.items, table) + elif backend == "pyarrow": backend_kwargs = table_options.get("backend_kwargs", {}) - if backend == "pyarrow": - ordered_keys = list(columns.keys()) - rows = [ - tuple(data_item.get(column, None) for column in ordered_keys) - for data_item in table_items.items - ] - arrow_table = arrow.row_tuples_to_arrow( - rows, columns, tz=backend_kwargs.get("tz", "UTC") - ) - yield dlt.mark.with_table_name(arrow_table, table) - else: - yield dlt.mark.with_table_name(table_items.items, table) + ordered_keys = list(columns.keys()) + rows = [ + tuple(data_item.get(column, None) for column in ordered_keys) + for data_item in table_items.items + ] + arrow_table = arrow.row_tuples_to_arrow( + rows, columns, tz=backend_kwargs.get("tz", "UTC") + ) + yield dlt.mark.with_table_name(arrow_table, table) else: - yield dlt.mark.with_table_name(table_items.items, table) + raise NotImplementedError(f"Unsupported backend: {backend}") return handle @@ -559,8 +553,12 @@ def infer_table_schema( } # Add replication columns - columns["lsn"] = {"data_type": "bigint", "nullable": True} - columns["deleted_ts"] = {"data_type": "timestamp", "nullable": True} + columns["lsn"] = {"data_type": "bigint", "name": "lsn", "nullable": True} + columns["deleted_ts"] = { + "data_type": "timestamp", + "name": "deleted_ts", + "nullable": True, + } return { "name": (msg.table.split(".")[1]), @@ -585,13 +583,10 @@ def gen_data_item( col_name = data.column_name if included_columns and col_name not in included_columns: continue - data_type = ( - column_schema[col_name]["data_type"] - if column_schema and column_schema.get(col_name) - else data.column_type - ) data_item[col_name] = _to_dlt_val( - data, data_type, for_delete=msg.op == Op.DELETE + 
data, + column_schema[col_name]["data_type"], + for_delete=msg.op == Op.DELETE, ) return data_item diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 8796481e7..273b71957 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -146,6 +146,8 @@ def _to_dlt_column_schema( # Set nullable attribute if type_info is available if type_info: column_schema["nullable"] = type_info.value_optional + elif datum.WhichOneof("datum"): # Or simply guess as this is a very rare case + column_schema["nullable"] = False return column_schema @@ -171,13 +173,9 @@ def _epoch_days_to_date(epoch_days: int) -> pendulum.Date: def _to_dlt_val( - val: DatumMessage, data_type: Union[TDataType, int], *, for_delete: bool = False + val: DatumMessage, data_type: TDataType, *, for_delete: bool = False ) -> Any: """Converts decoderbuf's datum value into dlt-compatible data value.""" - if isinstance(data_type, int): - col_type: TColumnType = _from_db_type()(_PG_TYPES[data_type]) # type: ignore[call-arg] - data_type = col_type["data_type"] - datum = val.WhichOneof("datum") if datum is None: return _DUMMY_VALS[data_type] if for_delete else None @@ -187,17 +185,9 @@ def _to_dlt_val( return data_type_handlers[data_type](raw_value) try: - return coerce_value( - to_type=data_type, - from_type=_DATUM_RAW_TYPES[datum], - value=raw_value, - ) + return coerce_value(data_type, _DATUM_RAW_TYPES[datum], raw_value) except ValueError: # FIXME Hack to get it to work with 0.5.x and 1.x if data_type == "json": - return coerce_value( - "complex", - from_type=_DATUM_RAW_TYPES[datum], - value=raw_value, - ) + return coerce_value("complex", _DATUM_RAW_TYPES[datum], raw_value) raise diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index a6dee4e2b..4d7f66514 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -264,8 +264,12 @@ "name": "_dlt_load_id", "nullable": False, }, - "lsn": {"data_type": "bigint", "nullable": True}, - "deleted_ts": {"data_type": "timestamp", "nullable": True}, + "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, + "deleted_ts": { + "data_type": "timestamp", + "name": "deleted_ts", + "nullable": True, + }, }, }, { @@ -277,8 +281,12 @@ "col11": {"data_type": "time", "name": "col11", "nullable": False}, "col12": {"data_type": "timestamp", "name": "col12", "nullable": True}, "col13": {"data_type": "double", "name": "col13", "nullable": True}, - "lsn": {"data_type": "bigint", "nullable": True}, - "deleted_ts": {"data_type": "timestamp", "nullable": True}, + "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, + "deleted_ts": { + "data_type": "timestamp", + "name": "deleted_ts", + "nullable": True, + }, }, }, { @@ -288,8 +296,12 @@ "val_x": {"data_type": "text", "name": "val_x"}, "_dlt_id": {"data_type": "text", "name": "_dlt_id"}, "_dlt_load_id": {"data_type": "text", "name": "_dlt_load_id"}, - "lsn": {"data_type": "bigint", "nullable": True}, - "deleted_ts": {"data_type": "timestamp", "nullable": True}, + "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, + "deleted_ts": { + "data_type": "timestamp", + "name": "deleted_ts", + "nullable": True, + }, }, }, ] diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 7c0215216..71e549106 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ 
b/tests/pg_legacy_replication/test_pg_replication.py @@ -27,7 +27,7 @@ @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) -@pytest.mark.parametrize("backend", ["sqlalchemy"]) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) def test_core_functionality( src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend ) -> None: From 28f463db54562f007d87fe4250c653ac58bb3372 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 30 Oct 2024 21:14:12 +0100 Subject: [PATCH 52/88] wip: almost done passing all tests --- sources/pg_legacy_replication/helpers.py | 62 ++++++- tests/pg_legacy_replication/cases.py | 7 +- tests/pg_legacy_replication/test_helpers.py | 164 +++++++++++++++++- .../test_pg_replication.py | 23 ++- 4 files changed, 246 insertions(+), 10 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index af1f62ecd..14c653e19 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -21,7 +21,7 @@ import psycopg2 from dlt.common import logger from dlt.common.pendulum import pendulum -from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns +from dlt.common.schema.typing import TColumnSchema, TTableSchema, TTableSchemaColumns from dlt.common.schema.utils import merge_column from dlt.common.typing import TDataItem from dlt.extract import DltSource, DltResource @@ -412,6 +412,10 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: last_schema = self.last_table_schema.get(table_name) included_columns = self.included_columns.get(table_name) + # Used cached schema if the operation is a delete since the inferred one will always be less precise + if msg.op == Op.DELETE and last_schema: + return last_schema + # Return cached schema if hash matches current_hash = hash_typeinfo(msg.new_typeinfo) if current_hash == self.last_table_hashes.get(table_name): @@ -422,9 +426,13 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: # Cache the inferred schema and hash if it is not already cached self.last_table_schema[table_name] = new_schema self.last_table_hashes[table_name] = current_hash - elif last_schema != new_schema: - # Raise an exception if there's a schema mismatch - raise StopReplication("Table schema change detected") + else: + try: + retained_schema = compare_schemas(last_schema, new_schema) + self.last_table_schema[table_name] = retained_schema + except AssertionError as e: + logger.warning(str(e)) + raise StopReplication return new_schema @@ -590,3 +598,49 @@ def gen_data_item( ) return data_item + + +ALLOWED_COL_SCHEMA_FIELDS: Set[str] = { + "name", + "data_type", + "nullable", + "precision", + "scale", +} + + +def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: + """Compares the last schema with the new one and chooses the more + precise one if they are relatively equal or else raises a + AssertionError due to an incompatible schema change""" + table_name = last["name"] + assert table_name == new["name"], "Table names do not match" + + table_schema: TTableSchema = {"name": table_name, "columns": {}} + + for name, s1 in last["columns"].items(): + s2 = new["columns"].get(name) + assert ( + s2 is not None and s1["data_type"] == s2["data_type"] + ), f"Incompatible schema for column '{name}'" + + # Ensure new has no fields outside of allowed fields + extra_fields = set(s2.keys()) - ALLOWED_COL_SCHEMA_FIELDS + assert not extra_fields, f"Unexpected fields 
{extra_fields} in column '{name}'" + + # Select the more precise schema by comparing nullable, precision, and scale + col_schema: TColumnSchema = { + "name": name, + "data_type": s1["data_type"], + } + if "nullable" in s1 or "nullable" in s2: + col_schema["nullable"] = s1.get("nullable", s2.get("nullable")) + if "precision" in s1 or "precision" in s2: + col_schema["precision"] = s1.get("precision", s2.get("precision")) + if "scale" in s1 or "scale" in s2: + col_schema["scale"] = s1.get("scale", s2.get("scale")) + + # Update with the more detailed schema per column + table_schema["columns"][name] = col_schema + + return table_schema diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 4d7f66514..876698f3f 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -292,7 +292,12 @@ { "name": "tbl_x", "columns": { - "id_x": {"data_type": "bigint", "name": "id_x", "precision": 64}, + "id_x": { + "data_type": "bigint", + "name": "id_x", + "precision": 64, + "nullable": False, + }, "val_x": {"data_type": "text", "name": "val_x"}, "_dlt_id": {"data_type": "text", "name": "_dlt_id"}, "_dlt_load_id": {"data_type": "text", "name": "_dlt_load_id"}, diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 03f2aa89f..107960dca 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -1,14 +1,13 @@ -from typing import Optional - -import pendulum import pytest from dlt.common.schema.typing import TTableSchema from dlt.common.typing import TDataItem from google.protobuf.json_format import ParseDict as parse_dict +from psycopg2.extras import StopReplication from sources.pg_legacy_replication.helpers import ( infer_table_schema, gen_data_item, + compare_schemas, ) from sources.pg_legacy_replication.pg_logicaldec_pb2 import RowMessage from .cases import ROW_MESSAGES, DATA_ITEMS, TABLE_SCHEMAS @@ -30,3 +29,162 @@ def test_gen_data_item(data, data_item: TDataItem): parse_dict(data, row_msg) column_schema = infer_table_schema(row_msg)["columns"] assert gen_data_item(row_msg, column_schema) == data_item + + +def test_compare_schemas(): + s1: TTableSchema = { + "name": "items", + "columns": { + "col1": { + "name": "col1", + "data_type": "bigint", + "precision": 64, + "nullable": False, + }, + "col2": {"name": "col2", "data_type": "double", "nullable": False}, + "col3": {"name": "col3", "data_type": "bool", "nullable": False}, + "col4": {"name": "col4", "data_type": "timestamp", "nullable": False}, + "col5": {"name": "col5", "data_type": "text", "nullable": False}, + "col6": { + "name": "col6", + "data_type": "decimal", + "precision": 38, + "scale": 9, + "nullable": False, + }, + "col7": {"name": "col7", "data_type": "binary", "nullable": False}, + "col9": {"name": "col9", "data_type": "complex", "nullable": False}, + "col10": {"name": "col10", "data_type": "date", "nullable": False}, + "col11": {"name": "col11", "data_type": "time", "nullable": False}, + "col1_null": { + "name": "col1_null", + "data_type": "bigint", + "precision": 64, + "nullable": True, + }, + "col2_null": {"name": "col2_null", "data_type": "double", "nullable": True}, + "col3_null": {"name": "col3_null", "data_type": "bool", "nullable": True}, + "col4_null": { + "name": "col4_null", + "data_type": "timestamp", + "nullable": True, + }, + "col5_null": {"name": "col5_null", "data_type": "text", "nullable": True}, + "col6_null": { + "name": "col6_null", + "data_type": 
"decimal", + "precision": 38, + "scale": 9, + "nullable": True, + }, + "col7_null": {"name": "col7_null", "data_type": "binary", "nullable": True}, + "col9_null": { + "name": "col9_null", + "data_type": "complex", + "nullable": True, + }, + "col10_null": {"name": "col10_null", "data_type": "date", "nullable": True}, + "col11_null": {"name": "col11_null", "data_type": "time", "nullable": True}, + "col1_precision": { + "name": "col1_precision", + "data_type": "bigint", + "precision": 16, + "nullable": False, + }, + "col4_precision": { + "name": "col4_precision", + "data_type": "timestamp", + "precision": 3, + "nullable": False, + }, + "col5_precision": { + "name": "col5_precision", + "data_type": "text", + "precision": 25, + "nullable": False, + }, + "col6_precision": { + "name": "col6_precision", + "data_type": "decimal", + "precision": 6, + "scale": 2, + "nullable": False, + }, + "col7_precision": { + "name": "col7_precision", + "data_type": "binary", + "nullable": False, + }, + "col11_precision": { + "name": "col11_precision", + "data_type": "time", + "precision": 3, + "nullable": False, + }, + "_dlt_load_id": { + "name": "_dlt_load_id", + "data_type": "text", + "nullable": False, + }, + "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False}, + "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, + "deleted_ts": { + "data_type": "timestamp", + "name": "deleted_ts", + "nullable": True, + }, + }, + } + s2: TTableSchema = { + "name": "items", + "columns": { + "col1": { + "name": "col1", + "data_type": "bigint", + "precision": 64, + "nullable": False, + }, + "col2": {"name": "col2", "data_type": "double"}, + "col3": {"name": "col3", "data_type": "bool"}, + "col4": {"name": "col4", "data_type": "timestamp"}, + "col5": {"name": "col5", "data_type": "text"}, + "col6": {"name": "col6", "data_type": "decimal"}, + "col7": {"name": "col7", "data_type": "binary"}, + "col9": {"name": "col9", "data_type": "complex"}, + "col10": {"name": "col10", "data_type": "date"}, + "col11": {"name": "col11", "data_type": "time"}, + "col1_null": {"name": "col1_null", "data_type": "bigint", "precision": 64}, + "col2_null": {"name": "col2_null", "data_type": "double"}, + "col3_null": {"name": "col3_null", "data_type": "bool"}, + "col4_null": {"name": "col4_null", "data_type": "timestamp"}, + "col5_null": {"name": "col5_null", "data_type": "text"}, + "col6_null": {"name": "col6_null", "data_type": "decimal"}, + "col7_null": {"name": "col7_null", "data_type": "binary"}, + "col9_null": {"name": "col9_null", "data_type": "complex"}, + "col10_null": {"name": "col10_null", "data_type": "date"}, + "col11_null": {"name": "col11_null", "data_type": "time"}, + "col1_precision": { + "name": "col1_precision", + "data_type": "bigint", + "precision": 16, + }, + "col4_precision": {"name": "col4_precision", "data_type": "timestamp"}, + "col5_precision": {"name": "col5_precision", "data_type": "text"}, + "col6_precision": {"name": "col6_precision", "data_type": "decimal"}, + "col7_precision": {"name": "col7_precision", "data_type": "binary"}, + "col11_precision": {"name": "col11_precision", "data_type": "time"}, + "_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text"}, + "_dlt_id": {"name": "_dlt_id", "data_type": "text"}, + "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, + "deleted_ts": { + "data_type": "timestamp", + "name": "deleted_ts", + "nullable": True, + }, + }, + } + assert compare_schemas(s1, s2) == s1 + assert compare_schemas(s2, s1) == s1 + with 
pytest.raises(AssertionError): + s1["columns"]["col12"] = {"name": "col12", "data_type": "text"} + compare_schemas(s1, s2) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 71e549106..d44b4b5e2 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -50,7 +50,10 @@ def tbl_y(data): add_pk(src_pl.sql_client, "tbl_x", "id_x") add_pk(src_pl.sql_client, "tbl_y", "id_y") - table_options = {"tbl_x": {"backend": backend}, "tbl_y": {"backend": backend}} + table_options: Dict[str, SqlTableOptions] = { + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + } snapshots = init_replication( slot_name=slot_name, @@ -166,8 +169,9 @@ def tbl_y(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) def test_without_init_load( - src_config: Tuple[dlt.Pipeline, str], destination_name: str + src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend ) -> None: @dlt.resource(write_disposition="merge", primary_key="id_x") def tbl_x(data): @@ -190,6 +194,11 @@ def tbl_y(data): add_pk(src_pl.sql_client, "tbl_x", "id_x") add_pk(src_pl.sql_client, "tbl_y", "id_y") + table_options: Dict[str, SqlTableOptions] = { + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + } + # initialize replication and create resource for changes init_replication( slot_name=slot_name, @@ -201,6 +210,7 @@ def tbl_y(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), + table_options=table_options, ) changes.tbl_x.apply_hints( write_disposition="merge", primary_key="id_x", columns=merge_hints @@ -248,11 +258,13 @@ def tbl_y(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("give_hints", [True, False]) @pytest.mark.parametrize("init_load", [True, False]) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) def test_mapped_data_types( src_config: Tuple[dlt.Pipeline, str], destination_name: str, give_hints: bool, init_load: bool, + backend: TableBackend, ) -> None: """Assert common data types (the ones mapped in PostgresTypeMapper) are properly handled.""" @@ -275,12 +287,18 @@ def items(data): src_pl.run(items(data)) add_pk(src_pl.sql_client, "items", "col1") + table_options: Dict[str, SqlTableOptions] = { + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + } + # initialize replication and create resources snapshot = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", take_snapshots=init_load, + table_options=table_options, ) if init_load and give_hints: snapshot.items.apply_hints(columns=column_schema) @@ -289,6 +307,7 @@ def items(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", + table_options=table_options, ) changes.items.apply_hints( write_disposition="merge", primary_key="col1", columns=merge_hints From 385e8a692e077d0839b34f0f15bafa07b6655b7a Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 30 Oct 2024 23:15:11 +0100 Subject: [PATCH 53/88] wip: some arrow tests are still not passing --- sources/pg_legacy_replication/__init__.py | 29 +++-- sources/pg_legacy_replication/helpers.py | 111 ++++++++++-------- .../test_pg_replication.py | 88 ++++++++------ tests/utils.py | 3 - 4 files changed, 132 insertions(+), 99 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py 
b/sources/pg_legacy_replication/__init__.py index 2f682b2f6..236bfb922 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -1,21 +1,20 @@ """Replicates postgres tables in batch using logical decoding.""" -from typing import Dict, Sequence, Optional, Iterable, Union +from typing import Any, Callable, Dict, Sequence, Optional, Iterable, Union import dlt -from dlt.common.schema.typing import TTableSchemaColumns from dlt.extract import DltResource from dlt.extract.items import TDataItem from dlt.sources.credentials import ConnectionStringCredentials from .helpers import ( + BackendHandler, + ItemGenerator, + SqlTableOptions, advance_slot, + cleanup_snapshot_resources, get_max_lsn, - ItemGenerator, - create_table_dispatch, init_replication, - cleanup_snapshot_resources, - SqlTableOptions, ) @@ -26,7 +25,6 @@ def replication_source( table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, table_options: Optional[Dict[str, SqlTableOptions]] = None, - column_hints: Optional[Dict[str, TTableSchemaColumns]] = None, target_batch_size: int = 1000, flush_slot: bool = True, ) -> Iterable[DltResource]: @@ -111,15 +109,22 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: wal_reader = replication_resource(slot_name) for table in table_names: + table_opts = table_options.get(table) if table_options else {} yield dlt.transformer( - create_table_dispatch( - table=table, - column_hints=column_hints.get(table) if column_hints else None, - table_options=table_options.get(table) if table_options else None, - ), + _create_table_dispatch(table=table, table_options=table_opts), data_from=wal_reader, name=table, ) +def _create_table_dispatch( + table: str, table_options: SqlTableOptions +) -> Callable[[TDataItem], Any]: + """Creates a dispatch handler that processes data items based on a specified table and optional column hints.""" + handler = BackendHandler(table, table_options) + # FIXME Uhhh.. why do I have to do this? 
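A plausible answer to the FIXME above: dlt appears to derive a default name for the wrapped callable from attributes such as `__name__`/`__qualname__`, which ordinary functions carry but a callable class instance may not expose, so one is attached by hand before the handler is passed to `dlt.transformer`. This is an inference from the FIXME itself rather than documented dlt behaviour.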
+ handler.__qualname__ = "BackendHandler.__call__" # type: ignore[attr-defined] + return handler + + __all__ = ["cleanup_snapshot_resources", "init_replication", "replication_source"] diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 14c653e19..df2afedd7 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -70,17 +70,19 @@ class SqlTableOptions(TypedDict, total=False): # Used by both sql_table and replication resources included_columns: Optional[Sequence[str]] + backend: Optional[TableBackend] + backend_kwargs: Optional[Dict[str, Any]] # Used only by sql_table resource metadata: Optional[MetaData] chunk_size: Optional[int] - backend: Optional[TableBackend] detect_precision_hints: Optional[bool] reflection_level: Optional[ReflectionLevel] defer_table_reflect: Optional[bool] table_adapter_callback: Optional[Callable[[Table], None]] - backend_kwargs: Optional[Dict[str, Any]] type_adapter_callback: Optional[TTypeAdapter] query_adapter_callback: Optional[TQueryAdapter] + # Used only by replication resource + column_hints: Optional[TTableSchemaColumns] @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) @@ -173,9 +175,8 @@ def on_begin(conn: ConnectionSqla) -> None: table_names = [table_names] if isinstance(table_names, str) else table_names or [] for table in table_names: - table_args = ( - table_options[table] if table_options and table in table_options else {} - ) + table_args = (table_options or {}).get(table, {}).copy() + table_args.pop("column_hints", None) # Remove "column_hints" if present yield sql_table(credentials=engine, table=table, schema=schema, **table_args) @@ -447,7 +448,6 @@ def hash_typeinfo(new_typeinfo: Sequence[TypeInfo]) -> int: class TableItems(NamedTuple): - table: str schema: TTableSchema items: List[TDataItem] @@ -490,9 +490,7 @@ def __iter__(self) -> Iterator[TableItems]: last_commit_lsn = consumer.last_commit_lsn ack_lsn(write_lsn=last_commit_lsn) for table, data_items in consumer.data_items.items(): - yield TableItems( - table, consumer.last_table_schema.get(table), data_items - ) + yield TableItems(consumer.last_table_schema[table], data_items) # Update state after flush self.last_commit_lsn = last_commit_lsn self.generated_all = consumer.consumed_all @@ -500,47 +498,66 @@ def __iter__(self) -> Iterator[TableItems]: cur.connection.close() -def create_table_dispatch( - table: str, - column_hints: Optional[TTableSchemaColumns] = None, - table_options: Optional[SqlTableOptions] = None, -) -> Callable[[TableItems], Iterable[DataItemWithMeta]]: - """Creates a dispatch handler that processes data items based on a specified table and optional column hints.""" +class BackendHandler: + def __init__(self, table: str, table_options: SqlTableOptions): + self.table = table + self.column_hints = table_options.get("column_hints") + self.backend = table_options.get("backend", "sqlalchemy") + self.backend_kwargs = table_options.get("backend_kwargs", {}) + + def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: + """Yields replication messages from ItemGenerator. - def handle(table_items: TableItems) -> Iterable[DataItemWithMeta]: - if table_items.table != table: + Args: + table_items: An object containing schema and items for the table. + + Yields: + DataItemWithMeta: Processed data items based on the table and backend. 
+ """ + schema = table_items.schema + if schema["name"] != self.table: return - columns = table_items.schema["columns"] - if column_hints: - for col_name, col_hint in column_hints.items(): - columns[col_name] = merge_column(columns.get(col_name, {}), col_hint) - backend = ( - table_options.get("backend", "sqlalchemy") - if table_options - else "sqlalchemy" - ) - if backend == "sqlalchemy": - yield dlt.mark.with_hints( - [], - dlt.mark.make_hints(table_name=table, columns=columns), - create_table_variant=True, - ) - yield dlt.mark.with_table_name(table_items.items, table) - elif backend == "pyarrow": - backend_kwargs = table_options.get("backend_kwargs", {}) - ordered_keys = list(columns.keys()) - rows = [ - tuple(data_item.get(column, None) for column in ordered_keys) - for data_item in table_items.items - ] - arrow_table = arrow.row_tuples_to_arrow( - rows, columns, tz=backend_kwargs.get("tz", "UTC") - ) - yield dlt.mark.with_table_name(arrow_table, table) - else: - raise NotImplementedError(f"Unsupported backend: {backend}") - return handle + # Apply column hints if provided + if self.column_hints: + self.apply_column_hints(schema["columns"]) + + # Process based on backend + if self.backend == "sqlalchemy": + yield from self.emit_schema_and_items(schema["columns"], table_items.items) + elif self.backend == "pyarrow": + yield from self.emit_arrow_table(schema["columns"], table_items.items) + else: + raise NotImplementedError(f"Unsupported backend: {self.backend}") + + def apply_column_hints(self, columns: TTableSchemaColumns) -> None: + for col_name, col_hint in self.column_hints.items(): + columns[col_name] = merge_column(columns.get(col_name, {}), col_hint) + + def emit_schema_and_items( + self, columns: TTableSchemaColumns, items: List[TDataItem] + ) -> Iterator[DataItemWithMeta]: + yield dlt.mark.with_hints( + [], + dlt.mark.make_hints(table_name=self.table, columns=columns), + create_table_variant=True, + ) + yield dlt.mark.with_table_name(items, self.table) + + def emit_arrow_table( + self, columns: TTableSchemaColumns, items: List[TDataItem] + ) -> Iterator[DataItemWithMeta]: + # Create rows for pyarrow using ordered column keys + row_keys = list(columns.keys()) + rows = [tuple(item.get(col, None) for col in row_keys) for item in items] + yield dlt.mark.with_table_name( + arrow.row_tuples_to_arrow( + rows, + columns, + tz=self.backend_kwargs.get("tz", "UTC"), + ), + self.table, + ) def infer_table_schema( diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index d44b4b5e2..6b23186fa 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -50,24 +50,25 @@ def tbl_y(data): add_pk(src_pl.sql_client, "tbl_x", "id_x") add_pk(src_pl.sql_client, "tbl_y", "id_y") - table_options: Dict[str, SqlTableOptions] = { - "tbl_x": {"backend": backend}, - "tbl_y": {"backend": backend}, - } - snapshots = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), take_snapshots=True, - table_options=table_options, + table_options={ + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + }, ) changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - table_options=table_options, + table_options={ + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + }, ) changes.tbl_x.apply_hints( write_disposition="merge", primary_key="id_x", columns=merge_hints @@ 
-194,11 +195,6 @@ def tbl_y(data): add_pk(src_pl.sql_client, "tbl_x", "id_x") add_pk(src_pl.sql_client, "tbl_y", "id_y") - table_options: Dict[str, SqlTableOptions] = { - "tbl_x": {"backend": backend}, - "tbl_y": {"backend": backend}, - } - # initialize replication and create resource for changes init_replication( slot_name=slot_name, @@ -210,7 +206,10 @@ def tbl_y(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - table_options=table_options, + table_options={ + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + }, ) changes.tbl_x.apply_hints( write_disposition="merge", primary_key="id_x", columns=merge_hints @@ -287,18 +286,13 @@ def items(data): src_pl.run(items(data)) add_pk(src_pl.sql_client, "items", "col1") - table_options: Dict[str, SqlTableOptions] = { - "tbl_x": {"backend": backend}, - "tbl_y": {"backend": backend}, - } - # initialize replication and create resources snapshot = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", take_snapshots=init_load, - table_options=table_options, + table_options={"items": {"backend": backend}}, ) if init_load and give_hints: snapshot.items.apply_hints(columns=column_schema) @@ -307,7 +301,7 @@ def items(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", - table_options=table_options, + table_options={"items": {"backend": backend}}, ) changes.items.apply_hints( write_disposition="merge", primary_key="col1", columns=merge_hints @@ -388,8 +382,9 @@ def items(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) def test_unmapped_data_types( - src_config: Tuple[dlt.Pipeline, str], destination_name: str + src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend ) -> None: """Assert postgres data types that aren't explicitly mapped default to "text" type.""" src_pl, slot_name = src_config @@ -411,6 +406,7 @@ def test_unmapped_data_types( slot_name=slot_name, schema=src_pl.dataset_name, table_names="data_types", + table_options={"data_types": {"backend": backend}}, ) # insert record in source table to create replication item @@ -433,8 +429,12 @@ def test_unmapped_data_types( @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("init_load", [True, False]) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) def test_included_columns( - src_config: Tuple[dlt.Pipeline, str], destination_name: str, init_load: bool + src_config: Tuple[dlt.Pipeline, str], + destination_name: str, + init_load: bool, + backend: TableBackend, ) -> None: def get_cols(pipeline: dlt.Pipeline, table_name: str) -> set: with pipeline.destination_client(pipeline.default_schema_name) as client: @@ -469,9 +469,10 @@ def tbl_z(data): ) # initialize replication and create resources - options = { - "tbl_x": SqlTableOptions(included_columns=["id_x", "val_x"]), - "tbl_y": SqlTableOptions(included_columns=["id_y", "val_y"]), + table_options = { + "tbl_x": SqlTableOptions(included_columns=["id_x", "val_x"], backend=backend), + "tbl_y": SqlTableOptions(included_columns=["id_y", "val_y"], backend=backend), + "tbl_z": SqlTableOptions(backend=backend), # tbl_z is not specified, hence all columns should be included } snapshots = init_replication( @@ -479,13 +480,13 @@ def tbl_z(data): schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), take_snapshots=init_load, - table_options=options, + 
table_options=table_options, ) changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), - table_options=options, + table_options=table_options, ) # update three postgres tables @@ -522,8 +523,12 @@ def tbl_z(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("init_load", [True, False]) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) def test_column_hints( - src_config: Tuple[dlt.Pipeline, str], destination_name: str, init_load: bool + src_config: Tuple[dlt.Pipeline, str], + destination_name: str, + init_load: bool, + backend: TableBackend, ) -> None: @dlt.resource def tbl_x(data): @@ -549,9 +554,16 @@ def tbl_z(data): ) # initialize replication and create resources - column_hints: Dict[str, TTableSchemaColumns] = { - "tbl_x": {"another_col_x": {"data_type": "double"}}, - "tbl_y": {"another_col_y": {"precision": 32}}, + table_options: Dict[str, SqlTableOptions] = { + "tbl_x": { + "backend": backend, + "column_hints": {"another_col_x": {"data_type": "double"}}, + }, + "tbl_y": { + "backend": backend, + "column_hints": {"another_col_y": {"precision": 32}}, + }, + "tbl_z": {"backend": backend}, # tbl_z is not specified, hence all columns should be included } @@ -560,6 +572,7 @@ def tbl_z(data): schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), take_snapshots=init_load, + table_options=table_options, ) if init_load: snapshots.tbl_x.apply_hints(columns={"another_col_x": {"data_type": "double"}}) @@ -569,7 +582,7 @@ def tbl_z(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), - column_hints=column_hints, + table_options=table_options, ) # update three postgres tables @@ -635,8 +648,9 @@ def tbl_z(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) def test_table_schema_change( - src_config: Tuple[dlt.Pipeline, str], destination_name: str + src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend ) -> None: src_pl, slot_name = src_config @@ -655,6 +669,7 @@ def test_table_schema_change( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", + table_options={"items": {"backend": backend}}, ) dest_pl = dlt.pipeline( pipeline_name="dest_pl", destination=destination_name, dev_mode=True @@ -683,7 +698,8 @@ def test_table_schema_change( ) -def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_batching(src_config: Tuple[dlt.Pipeline, str], backend: TableBackend) -> None: # this test asserts the number of data items yielded by the replication resource # is not affected by `target_batch_size` and the number of replication messages per transaction src_pl, slot_name = src_config @@ -703,6 +719,7 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: schema=src_pl.dataset_name, table_names="items", target_batch_size=50, + table_options={"items": {"backend": backend}}, ) # create destination pipeline and resource @@ -712,9 +729,6 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str]) -> None: batch = [{**r, **{"id": key}} for r in [data] for key in range(1, 101)] src_pl.run(batch, table_name="items") extract_info = dest_pl.extract(changes) - from devtools import debug - - debug(extract_info) assert extract_info.asdict()["job_metrics"][0]["items_count"] == 100 # insert 100 records into source table in 5 
transactions diff --git a/tests/utils.py b/tests/utils.py index 9ba13f974..cb795fd10 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -235,9 +235,6 @@ def assert_load_info(info: LoadInfo, expected_load_packages: int = 1) -> None: # no failed jobs in any of the packages info.raise_on_failed_jobs() except AssertionError: - from devtools import debug - - debug(info) raise From a291b698195dba0531a0e59d5775e75580c3f83c Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 31 Oct 2024 01:08:10 +0100 Subject: [PATCH 54/88] fix: done with pyarrow; too many issues with duckdb atm --- sources/pg_legacy_replication/helpers.py | 10 ++-- tests/pg_legacy_replication/test_helpers.py | 46 ++++++++++++++++++- .../test_pg_replication.py | 4 +- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index df2afedd7..d032e9011 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -432,7 +432,7 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: retained_schema = compare_schemas(last_schema, new_schema) self.last_table_schema[table_name] = retained_schema except AssertionError as e: - logger.warning(str(e)) + logger.debug(str(e)) raise StopReplication return new_schema @@ -634,9 +634,13 @@ def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: assert table_name == new["name"], "Table names do not match" table_schema: TTableSchema = {"name": table_name, "columns": {}} + last_cols, new_cols = last["columns"], new["columns"] + assert len(last_cols) == len( + new_cols + ), f"Columns mismatch last:{last['columns']} new:{new['columns']}" - for name, s1 in last["columns"].items(): - s2 = new["columns"].get(name) + for name, s1 in last_cols.items(): + s2 = new_cols.get(name) assert ( s2 is not None and s1["data_type"] == s2["data_type"] ), f"Incompatible schema for column '{name}'" diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 107960dca..60e0fac67 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -185,6 +185,50 @@ def test_compare_schemas(): } assert compare_schemas(s1, s2) == s1 assert compare_schemas(s2, s1) == s1 + + s1 = { + "columns": { + "_dlt_id": {"data_type": "text", "name": "_dlt_id", "nullable": False}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "c1": { + "data_type": "bigint", + "name": "c1", + "nullable": True, + "precision": 64, + }, + "c2": { + "data_type": "bigint", + "name": "c2", + "nullable": True, + "precision": 64, + }, + "c3": { + "data_type": "bigint", + "name": "c3", + "nullable": True, + "precision": 64, + }, + "deleted_ts": { + "data_type": "timestamp", + "name": "deleted_ts", + "nullable": True, + }, + "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, + }, + "name": "items", + } + from copy import deepcopy + + s2 = deepcopy(s1) + s2["columns"]["c4"] = { + "data_type": "bigint", + "name": "c4", + "nullable": True, + "precision": 64, + } with pytest.raises(AssertionError): - s1["columns"]["col12"] = {"name": "col12", "data_type": "text"} compare_schemas(s1, s2) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 6b23186fa..535a2424b 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ 
b/tests/pg_legacy_replication/test_pg_replication.py @@ -257,7 +257,9 @@ def tbl_y(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("give_hints", [True, False]) @pytest.mark.parametrize("init_load", [True, False]) -@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +@pytest.mark.parametrize( + "backend", ["sqlalchemy"] +) # FIXME Too many issues with duckdb and timestamps atm def test_mapped_data_types( src_config: Tuple[dlt.Pipeline, str], destination_name: str, From ba235052d3f634302fc7cb3875eebb6b096900d8 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 31 Oct 2024 18:35:28 +0100 Subject: [PATCH 55/88] wip: some bug fixes --- sources/pg_legacy_replication/__init__.py | 2 +- sources/pg_legacy_replication/helpers.py | 42 ++++++---- sources/pg_legacy_replication/schema_types.py | 17 ++-- tests/pg_legacy_replication/cases.py | 82 ++++++++++++++++--- 4 files changed, 109 insertions(+), 34 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 236bfb922..39f86893e 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -68,7 +68,7 @@ def replication_source( slot. Recommended value is True. Be careful when setting False—not flushing can eventually lead to a “disk full” condition on the server, because the server retains all the WAL segments that might be needed to stream - the changes via all of the currently open replication slots. + the changes via all the currently open replication slots. Yields: Data items for changes published in the publication. diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index d032e9011..43ca38714 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -1,7 +1,6 @@ import hashlib from collections import defaultdict from dataclasses import dataclass, field -from functools import partial from typing import ( Optional, Dict, @@ -472,7 +471,6 @@ def __iter__(self) -> Iterator[TableItems]: Does not advance the slot. 
""" cur = _get_rep_conn(self.credentials).cursor() - ack_lsn = partial(cur.send_feedback, reply=True, force=True) cur.start_replication( slot_name=self.slot_name, start_lsn=self.start_lsn, decode=False ) @@ -487,15 +485,20 @@ def __iter__(self) -> Iterator[TableItems]: except StopReplication: # completed batch or reached `upto_lsn` pass finally: - last_commit_lsn = consumer.last_commit_lsn - ack_lsn(write_lsn=last_commit_lsn) for table, data_items in consumer.data_items.items(): yield TableItems(consumer.last_table_schema[table], data_items) # Update state after flush - self.last_commit_lsn = last_commit_lsn + self.last_commit_lsn = consumer.last_commit_lsn self.generated_all = consumer.consumed_all - ack_lsn(flush_lsn=last_commit_lsn) - cur.connection.close() + self.ack_and_close(cur) + + def ack_and_close(self, cur: ReplicationCursor) -> None: + if self.generated_all: + commit_lsn = self.last_commit_lsn + cur.send_feedback( + write_lsn=commit_lsn, flush_lsn=commit_lsn, reply=True, force=True + ) + cur.connection.close() class BackendHandler: @@ -519,16 +522,27 @@ def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: return # Apply column hints if provided + columns = schema["columns"] + data = table_items.items if self.column_hints: - self.apply_column_hints(schema["columns"]) + self.apply_column_hints(columns) # Process based on backend - if self.backend == "sqlalchemy": - yield from self.emit_schema_and_items(schema["columns"], table_items.items) - elif self.backend == "pyarrow": - yield from self.emit_arrow_table(schema["columns"], table_items.items) - else: - raise NotImplementedError(f"Unsupported backend: {self.backend}") + try: + if self.backend == "sqlalchemy": + yield from self.emit_schema_and_items(columns, data) + elif self.backend == "pyarrow": + yield from self.emit_arrow_table(columns, data) + else: + raise NotImplementedError(f"Unsupported backend: {self.backend}") + except Exception: + logger.error( + "A fatal error occurred while processing batch for '%s' (columns=%s, data=%s)", + self.table, + columns, + data, + ) + raise def apply_column_hints(self, columns: TTableSchemaColumns) -> None: for col_name, col_hint in self.column_hints.items(): diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 273b71957..40298fda6 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -1,14 +1,14 @@ import re from functools import lru_cache -from typing import Optional, Any, Dict, Callable, Union, Tuple +from typing import Optional, Any, Dict, Callable, Tuple import pendulum from dlt.common import Decimal +from dlt.common import logger from dlt.common.data_types.type_helpers import coerce_value from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import TColumnSchema, TColumnType from dlt.destinations import postgres -from dlt.common import logger from .pg_logicaldec_pb2 import DatumMessage, TypeInfo @@ -18,12 +18,12 @@ "bool": True, "complex": [0], "json": [0], # type: ignore[dict-item] - "date": "2000-01-01", + "date": pendulum.Date(1970, 1, 1), "decimal": Decimal(0), "double": 0.0, "text": "", - "time": "00:00:00", - "timestamp": "2000-01-01T00:00:00", + "time": pendulum.Time(0, 0, 0), + "timestamp": pendulum.from_timestamp(0), "wei": 0, } """Dummy values used to replace NULLs in NOT NULL columns in key-only delete records.""" @@ -44,7 +44,7 @@ 1700: "numeric", 3802: "jsonb", } -"""Maps postgres type OID to type string. 
Only includes types present in PostgresTypeMapper.""" +"""Maps postgres type OID to type string.""" _MISSING_TYPES: Dict[str, TDataType] = { "real": "double", @@ -184,10 +184,11 @@ def _to_dlt_val( if data_type in data_type_handlers: return data_type_handlers[data_type](raw_value) + raw_type = _DATUM_RAW_TYPES[datum] try: - return coerce_value(data_type, _DATUM_RAW_TYPES[datum], raw_value) + return coerce_value(data_type, raw_type, raw_value) except ValueError: # FIXME Hack to get it to work with 0.5.x and 1.x if data_type == "json": - return coerce_value("complex", _DATUM_RAW_TYPES[datum], raw_value) + return coerce_value("complex", raw_type, raw_value) raise diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 876698f3f..b4071c785 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -198,20 +198,60 @@ "oldTuple": [ { "columnName": "id_x", - "columnType": "20", - "datumInt64": "1", + "columnType": 20, + "datumInt64": 1, }, { "columnName": "val_x", - "columnType": "1043", + "columnType": 1043, }, { - "columnName": "_dlt_load_id", - "columnType": "1043", + "columnName": "col_bool", + "columnType": 16, }, { - "columnName": "_dlt_id", - "columnType": "1043", + "columnName": "col_bytea", + "columnType": 17, + }, + { + "columnName": "col_int4", + "columnType": 21, + }, + { + "columnName": "col_int", + "columnType": 23, + }, + { + "columnName": "col_real", + "columnType": 700, + }, + { + "columnName": "col_double", + "columnType": 701, + }, + { + "columnName": "col_date", + "columnType": 1082, + }, + { + "columnName": "col_time", + "columnType": 1083, + }, + { + "columnName": "col_ts", + "columnType": 1114, + }, + { + "columnName": "col_tstz", + "columnType": 1184, + }, + { + "columnName": "col_num", + "columnType": 1700, + }, + { + "columnName": "col_json", + "columnType": 3802, }, ], }, @@ -241,8 +281,18 @@ { "id_x": 1, "val_x": "", - "_dlt_load_id": "", - "_dlt_id": "", + "col_bool": True, + "col_bytea": b" ", + "col_int4": 0, + "col_int": 0, + "col_real": 0.0, + "col_double": 0.0, + "col_time": pendulum.parse("00:00:00", strict=False).time(), + "col_date": pendulum.parse("1970-01-01", strict=False).date(), + "col_ts": pendulum.parse("1970-01-01T00:00:00+00:00"), + "col_tstz": pendulum.parse("1970-01-01T00:00:00+00:00"), + "col_num": Decimal(0), + "col_json": [0], "deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), }, ] @@ -299,8 +349,18 @@ "nullable": False, }, "val_x": {"data_type": "text", "name": "val_x"}, - "_dlt_id": {"data_type": "text", "name": "_dlt_id"}, - "_dlt_load_id": {"data_type": "text", "name": "_dlt_load_id"}, + "col_bool": {"data_type": "bool", "name": "col_bool"}, + "col_bytea": {"data_type": "binary", "name": "col_bytea"}, + "col_int4": {"data_type": "bigint", "name": "col_int4", "precision": 16}, + "col_int": {"data_type": "bigint", "name": "col_int", "precision": 32}, + "col_real": {"data_type": "double", "name": "col_real"}, + "col_double": {"data_type": "double", "name": "col_double"}, + "col_date": {"data_type": "date", "name": "col_date"}, + "col_time": {"data_type": "time", "name": "col_time"}, + "col_ts": {"data_type": "timestamp", "name": "col_ts"}, + "col_tstz": {"data_type": "timestamp", "name": "col_tstz"}, + "col_num": {"data_type": "decimal", "name": "col_num"}, + "col_json": {"data_type": "complex", "name": "col_json"}, "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, "deleted_ts": { "data_type": "timestamp", From 
5993fb4bb3fa8a550d36223707c06741698d038b Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sun, 3 Nov 2024 17:06:42 +0100 Subject: [PATCH 56/88] wip: small refactoring --- sources/pg_legacy_replication/helpers.py | 27 ++++++++++++------------ 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 43ca38714..5024fae76 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -485,19 +485,20 @@ def __iter__(self) -> Iterator[TableItems]: except StopReplication: # completed batch or reached `upto_lsn` pass finally: - for table, data_items in consumer.data_items.items(): - yield TableItems(consumer.last_table_schema[table], data_items) - # Update state after flush - self.last_commit_lsn = consumer.last_commit_lsn - self.generated_all = consumer.consumed_all - self.ack_and_close(cur) - - def ack_and_close(self, cur: ReplicationCursor) -> None: - if self.generated_all: - commit_lsn = self.last_commit_lsn - cur.send_feedback( - write_lsn=commit_lsn, flush_lsn=commit_lsn, reply=True, force=True - ) + yield from self.flush_batch(cur, consumer) + + def flush_batch( + self, cur: ReplicationCursor, consumer: MessageConsumer + ) -> Iterator[TableItems]: + last_commit_lsn = consumer.last_commit_lsn + consumed_all = consumer.consumed_all + for table, data_items in consumer.data_items.items(): + yield TableItems(consumer.last_table_schema[table], data_items) + cur.send_feedback(write_lsn=last_commit_lsn, reply=True, force=True) + if consumed_all: + cur.send_feedback(flush_lsn=last_commit_lsn, reply=True, force=True) + self.last_commit_lsn = last_commit_lsn + self.generated_all = consumed_all cur.connection.close() From 6db693adc096e5492468c2adff80d0eb0b7bb1ee Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sun, 3 Nov 2024 22:37:31 +0100 Subject: [PATCH 57/88] wip: duckdb needs patching, trying out new max_lsn --- sources/pg_legacy_replication/helpers.py | 30 +++++++++++++------ tests/pg_legacy_replication/cases.py | 4 +-- .../test_pg_replication.py | 4 +-- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 5024fae76..a5b60f449 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -252,11 +252,18 @@ def get_max_lsn( Raises error if the replication slot or publication does not exist. 
""" cur = _get_conn(credentials).cursor() - lsn_field = "location" if get_pg_version(cur) < 100000 else "lsn" - cur.execute( - f"SELECT MAX({lsn_field} - '0/0') AS max_lsn " # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) - f"FROM pg_logical_slot_peek_binary_changes('{slot_name}', NULL, NULL);" + loc_fn = ( + "pg_current_xlog_location" + if get_pg_version(cur) < 100000 + else "pg_current_wal_lsn" ) + # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) + cur.execute(f"SELECT {loc_fn}() - '0/0' as max_lsn;") + # lsn_field = "location" if get_pg_version(cur) < 100000 else "lsn" + # cur.execute( + # f"SELECT MAX({lsn_field} - '0/0') AS max_lsn " # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) + # f"FROM pg_logical_slot_peek_binary_changes('{slot_name}', NULL, NULL);" + # ) lsn: int = cur.fetchone()[0] cur.connection.close() return lsn @@ -483,9 +490,9 @@ def __iter__(self) -> Iterator[TableItems]: try: cur.consume_stream(consumer) except StopReplication: # completed batch or reached `upto_lsn` - pass - finally: yield from self.flush_batch(cur, consumer) + finally: + cur.connection.close() def flush_batch( self, cur: ReplicationCursor, consumer: MessageConsumer @@ -494,12 +501,17 @@ def flush_batch( consumed_all = consumer.consumed_all for table, data_items in consumer.data_items.items(): yield TableItems(consumer.last_table_schema[table], data_items) - cur.send_feedback(write_lsn=last_commit_lsn, reply=True, force=True) if consumed_all: - cur.send_feedback(flush_lsn=last_commit_lsn, reply=True, force=True) + cur.send_feedback( + write_lsn=last_commit_lsn, + flush_lsn=last_commit_lsn, + reply=True, + force=True, + ) + else: + cur.send_feedback(write_lsn=last_commit_lsn, reply=True, force=True) self.last_commit_lsn = last_commit_lsn self.generated_all = consumed_all - cur.connection.close() class BackendHandler: diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index b4071c785..1fbef56ab 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -74,7 +74,7 @@ { "name": "col4_precision", "data_type": "timestamp", - "precision": 3, + "precision": 6, "nullable": False, }, {"name": "col5_precision", "data_type": "text", "precision": 25, "nullable": False}, @@ -91,7 +91,7 @@ "precision": 19, "nullable": False, }, - {"name": "col11_precision", "data_type": "time", "precision": 3, "nullable": False}, + {"name": "col11_precision", "data_type": "time", "precision": 6, "nullable": False}, ] if "complex" in DATA_TYPES: diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 535a2424b..6b23186fa 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -257,9 +257,7 @@ def tbl_y(data): @pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) @pytest.mark.parametrize("give_hints", [True, False]) @pytest.mark.parametrize("init_load", [True, False]) -@pytest.mark.parametrize( - "backend", ["sqlalchemy"] -) # FIXME Too many issues with duckdb and timestamps atm +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) def test_mapped_data_types( src_config: Tuple[dlt.Pipeline, str], destination_name: str, From c53c9f9ea21272be344f42f6a916d835ff79b71d Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 14 Nov 2024 15:56:26 +0100 Subject: [PATCH 58/88] wip: some refactoring of 
options to make certain features togglable --- sources/pg_legacy_replication/__init__.py | 13 +++++-- sources/pg_legacy_replication/helpers.py | 38 ++++++++++--------- sources/pg_legacy_replication_pipeline.py | 2 +- tests/pg_legacy_replication/cases.py | 20 +++++----- tests/pg_legacy_replication/test_helpers.py | 19 +++++----- .../test_pg_replication.py | 28 ++++++++------ 6 files changed, 67 insertions(+), 53 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 39f86893e..e744fba5b 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -10,7 +10,7 @@ from .helpers import ( BackendHandler, ItemGenerator, - SqlTableOptions, + ReplicationOptions, advance_slot, cleanup_snapshot_resources, get_max_lsn, @@ -24,7 +24,7 @@ def replication_source( schema: str, table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, - table_options: Optional[Dict[str, SqlTableOptions]] = None, + table_options: Optional[Dict[str, ReplicationOptions]] = None, target_batch_size: int = 1000, flush_slot: bool = True, ) -> Iterable[DltResource]: @@ -118,7 +118,7 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: def _create_table_dispatch( - table: str, table_options: SqlTableOptions + table: str, table_options: ReplicationOptions ) -> Callable[[TDataItem], Any]: """Creates a dispatch handler that processes data items based on a specified table and optional column hints.""" handler = BackendHandler(table, table_options) @@ -127,4 +127,9 @@ def _create_table_dispatch( return handler -__all__ = ["cleanup_snapshot_resources", "init_replication", "replication_source"] +__all__ = [ + "ReplicationOptions", + "cleanup_snapshot_resources", + "init_replication", + "replication_source", +] diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index a5b60f449..53d67b0fe 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -66,22 +66,27 @@ from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val +class ReplicationOptions(TypedDict, total=False): + backend: Optional[TableBackend] + backend_kwargs: Optional[Dict[str, Any]] + column_hints: Optional[TTableSchemaColumns] + include_deleted_timestamp: Optional[bool] # Default is true + include_lsn: Optional[bool] # Default is true + included_columns: Optional[Set[str]] + + class SqlTableOptions(TypedDict, total=False): - # Used by both sql_table and replication resources - included_columns: Optional[Sequence[str]] backend: Optional[TableBackend] backend_kwargs: Optional[Dict[str, Any]] - # Used only by sql_table resource - metadata: Optional[MetaData] chunk_size: Optional[int] + defer_table_reflect: Optional[bool] detect_precision_hints: Optional[bool] + included_columns: Optional[Sequence[str]] + metadata: Optional[MetaData] + query_adapter_callback: Optional[TQueryAdapter] reflection_level: Optional[ReflectionLevel] - defer_table_reflect: Optional[bool] table_adapter_callback: Optional[Callable[[Table], None]] type_adapter_callback: Optional[TTypeAdapter] - query_adapter_callback: Optional[TQueryAdapter] - # Used only by replication resource - column_hints: Optional[TTableSchemaColumns] @dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) @@ -175,7 +180,6 @@ def on_begin(conn: ConnectionSqla) -> None: for table in table_names: table_args = (table_options or 
{}).get(table, {}).copy() - table_args.pop("column_hints", None) # Remove "column_hints" if present yield sql_table(credentials=engine, table=table, schema=schema, **table_args) @@ -334,7 +338,7 @@ def __init__( upto_lsn: int, table_qnames: Set[str], target_batch_size: int = 1000, - table_options: Optional[Dict[str, SqlTableOptions]] = None, + table_options: Optional[Dict[str, ReplicationOptions]] = None, ) -> None: self.upto_lsn = upto_lsn self.table_qnames = table_qnames @@ -412,7 +416,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: data_item = gen_data_item( msg, table_schema["columns"], self.included_columns.get(table_name) ) - data_item["lsn"] = lsn + data_item["_pg_lsn"] = lsn self.data_items[table_name].append(data_item) def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: @@ -466,7 +470,7 @@ class ItemGenerator: upto_lsn: int start_lsn: int = 0 target_batch_size: int = 1000 - table_options: Optional[Dict[str, SqlTableOptions]] = None + table_options: Optional[Dict[str, ReplicationOptions]] = None last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -515,7 +519,7 @@ def flush_batch( class BackendHandler: - def __init__(self, table: str, table_options: SqlTableOptions): + def __init__(self, table: str, table_options: ReplicationOptions): self.table = table self.column_hints = table_options.get("column_hints") self.backend = table_options.get("backend", "sqlalchemy") @@ -605,10 +609,10 @@ def infer_table_schema( } # Add replication columns - columns["lsn"] = {"data_type": "bigint", "name": "lsn", "nullable": True} - columns["deleted_ts"] = { + columns["_pg_lsn"] = {"data_type": "bigint", "name": "_pg_lsn", "nullable": True} + columns["_pg_deleted_ts"] = { "data_type": "timestamp", - "name": "deleted_ts", + "name": "_pg_deleted_ts", "nullable": True, } @@ -629,7 +633,7 @@ def gen_data_item( row = msg.new_tuple else: row = msg.old_tuple - data_item["deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) + data_item["_pg_deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) for data in row: col_name = data.column_name diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index e2d580f19..c8275d426 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -163,7 +163,7 @@ def replicate_with_column_selection() -> None: schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), table_options={ - "tbl_x": {"included_columns": ["c1", "c2"]} + "tbl_x": {"included_columns": {"c1", "c2"}} }, # columns not specified here are excluded from generated data items ) diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 1fbef56ab..e22ddc3fc 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -293,7 +293,7 @@ "col_tstz": pendulum.parse("1970-01-01T00:00:00+00:00"), "col_num": Decimal(0), "col_json": [0], - "deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), + "_pg_deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), }, ] @@ -314,10 +314,10 @@ "name": "_dlt_load_id", "nullable": False, }, - "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, - "deleted_ts": { + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { "data_type": "timestamp", - "name": "deleted_ts", + "name": "_pg_deleted_ts", "nullable": True, }, }, @@ -331,10 +331,10 @@ "col11": {"data_type": 
"time", "name": "col11", "nullable": False}, "col12": {"data_type": "timestamp", "name": "col12", "nullable": True}, "col13": {"data_type": "double", "name": "col13", "nullable": True}, - "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, - "deleted_ts": { + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { "data_type": "timestamp", - "name": "deleted_ts", + "name": "_pg_deleted_ts", "nullable": True, }, }, @@ -361,10 +361,10 @@ "col_tstz": {"data_type": "timestamp", "name": "col_tstz"}, "col_num": {"data_type": "decimal", "name": "col_num"}, "col_json": {"data_type": "complex", "name": "col_json"}, - "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, - "deleted_ts": { + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { "data_type": "timestamp", - "name": "deleted_ts", + "name": "_pg_deleted_ts", "nullable": True, }, }, diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 60e0fac67..dbe5107de 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -2,7 +2,6 @@ from dlt.common.schema.typing import TTableSchema from dlt.common.typing import TDataItem from google.protobuf.json_format import ParseDict as parse_dict -from psycopg2.extras import StopReplication from sources.pg_legacy_replication.helpers import ( infer_table_schema, @@ -127,10 +126,10 @@ def test_compare_schemas(): "nullable": False, }, "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False}, - "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, - "deleted_ts": { + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { "data_type": "timestamp", - "name": "deleted_ts", + "name": "_pg_deleted_ts", "nullable": True, }, }, @@ -175,10 +174,10 @@ def test_compare_schemas(): "col11_precision": {"name": "col11_precision", "data_type": "time"}, "_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text"}, "_dlt_id": {"name": "_dlt_id", "data_type": "text"}, - "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, - "deleted_ts": { + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { "data_type": "timestamp", - "name": "deleted_ts", + "name": "_pg_deleted_ts", "nullable": True, }, }, @@ -212,12 +211,12 @@ def test_compare_schemas(): "nullable": True, "precision": 64, }, - "deleted_ts": { + "_pg_deleted_ts": { "data_type": "timestamp", - "name": "deleted_ts", + "name": "_pg_deleted_ts", "nullable": True, }, - "lsn": {"data_type": "bigint", "name": "lsn", "nullable": True}, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, }, "name": "items", } diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 6b23186fa..c333b7b56 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -10,6 +10,7 @@ init_replication, cleanup_snapshot_resources, replication_source, + ReplicationOptions, ) from sources.pg_legacy_replication.helpers import SqlTableOptions, TableBackend from tests.utils import ( @@ -21,8 +22,8 @@ from .utils import add_pk, assert_loaded_data merge_hints: TTableSchemaColumns = { - "deleted_ts": {"hard_delete": True}, - "lsn": {"dedup_sort": "desc"}, + "_pg_deleted_ts": {"hard_delete": True}, + "_pg_lsn": {"dedup_sort": "desc"}, } @@ 
-470,9 +471,9 @@ def tbl_z(data): # initialize replication and create resources table_options = { - "tbl_x": SqlTableOptions(included_columns=["id_x", "val_x"], backend=backend), - "tbl_y": SqlTableOptions(included_columns=["id_y", "val_y"], backend=backend), - "tbl_z": SqlTableOptions(backend=backend), + "tbl_x": {"backend": backend, "included_columns": {"id_x", "val_x"}}, + "tbl_y": {"backend": backend, "included_columns": {"id_y", "val_y"}}, + "tbl_z": {"backend": backend}, # tbl_z is not specified, hence all columns should be included } snapshots = init_replication( @@ -510,14 +511,14 @@ def tbl_z(data): assert get_cols(dest_pl, "tbl_z") == {"id_z", "val_z", "another_col_z"} dest_pl.run(changes) - assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x", "lsn", "deleted_ts"} - assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y", "lsn", "deleted_ts"} + assert get_cols(dest_pl, "tbl_x") == {"id_x", "val_x", "_pg_lsn", "_pg_deleted_ts"} + assert get_cols(dest_pl, "tbl_y") == {"id_y", "val_y", "_pg_lsn", "_pg_deleted_ts"} assert get_cols(dest_pl, "tbl_z") == { "id_z", "val_z", "another_col_z", - "lsn", - "deleted_ts", + "_pg_lsn", + "_pg_deleted_ts", } @@ -554,7 +555,7 @@ def tbl_z(data): ) # initialize replication and create resources - table_options: Dict[str, SqlTableOptions] = { + table_options: Dict[str, ReplicationOptions] = { "tbl_x": { "backend": backend, "column_hints": {"another_col_x": {"data_type": "double"}}, @@ -572,7 +573,12 @@ def tbl_z(data): schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), take_snapshots=init_load, - table_options=table_options, + table_options={ + "tbl_x": {"backend": backend}, + "tbl_y": {"backend": backend}, + "tbl_z": {"backend": backend}, + # tbl_z is not specified, hence all columns should be included + }, ) if init_load: snapshots.tbl_x.apply_hints(columns={"another_col_x": {"data_type": "double"}}) From ba1c3fcec611231394ee2504e12364033b0952a2 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 15 Nov 2024 10:46:56 +0100 Subject: [PATCH 59/88] wip: lsn and deleted ts are optional --- sources/pg_legacy_replication/helpers.py | 104 ++++++++++---------- sources/pg_legacy_replication_pipeline.py | 4 +- tests/pg_legacy_replication/cases.py | 7 +- tests/pg_legacy_replication/test_helpers.py | 2 +- 4 files changed, 61 insertions(+), 56 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 53d67b0fe..557bd9690 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -70,7 +70,7 @@ class ReplicationOptions(TypedDict, total=False): backend: Optional[TableBackend] backend_kwargs: Optional[Dict[str, Any]] column_hints: Optional[TTableSchemaColumns] - include_deleted_timestamp: Optional[bool] # Default is true + include_deleted_ts: Optional[bool] # Default is true include_lsn: Optional[bool] # Default is true included_columns: Optional[Set[str]] @@ -343,11 +343,7 @@ def __init__( self.upto_lsn = upto_lsn self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.included_columns = { - table: set(options["included_columns"]) - for table, options in (table_options or {}).items() - if options.get("included_columns") - } + self.table_options = table_options or {} self.consumed_all: bool = False # maps table names to list of data items @@ -414,14 +410,12 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: table_name = msg.table.split(".")[1] table_schema = self.get_table_schema(msg, table_name) 
data_item = gen_data_item( - msg, table_schema["columns"], self.included_columns.get(table_name) + msg, table_schema["columns"], lsn, **self.table_options.get(table_name) ) - data_item["_pg_lsn"] = lsn self.data_items[table_name].append(data_item) def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: last_schema = self.last_table_schema.get(table_name) - included_columns = self.included_columns.get(table_name) # Used cached schema if the operation is a delete since the inferred one will always be less precise if msg.op == Op.DELETE and last_schema: @@ -432,7 +426,7 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: if current_hash == self.last_table_hashes.get(table_name): return self.last_table_schema[table_name] - new_schema = infer_table_schema(msg, included_columns) + new_schema = infer_table_schema(msg, **self.table_options.get(table_name)) if last_schema is None: # Cache the inferred schema and hash if it is not already cached self.last_table_schema[table_name] = new_schema @@ -521,9 +515,7 @@ def flush_batch( class BackendHandler: def __init__(self, table: str, table_options: ReplicationOptions): self.table = table - self.column_hints = table_options.get("column_hints") - self.backend = table_options.get("backend", "sqlalchemy") - self.backend_kwargs = table_options.get("backend_kwargs", {}) + self.table_options = table_options def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: """Yields replication messages from ItemGenerator. @@ -540,18 +532,20 @@ def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: # Apply column hints if provided columns = schema["columns"] - data = table_items.items - if self.column_hints: - self.apply_column_hints(columns) + if column_hints := self.table_options.get("column_hints"): + for col_name, col_hint in column_hints.items(): + columns[col_name] = merge_column(columns.get(col_name, {}), col_hint) # Process based on backend + data = table_items.items + backend = self.table_options.get("backend", "sqlalchemy") try: - if self.backend == "sqlalchemy": + if backend == "sqlalchemy": yield from self.emit_schema_and_items(columns, data) - elif self.backend == "pyarrow": + elif backend == "pyarrow": yield from self.emit_arrow_table(columns, data) else: - raise NotImplementedError(f"Unsupported backend: {self.backend}") + raise NotImplementedError(f"Unsupported backend: {backend}") except Exception: logger.error( "A fatal error occurred while processing batch for '%s' (columns=%s, data=%s)", @@ -561,10 +555,6 @@ def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: ) raise - def apply_column_hints(self, columns: TTableSchemaColumns) -> None: - for col_name, col_hint in self.column_hints.items(): - columns[col_name] = merge_column(columns.get(col_name, {}), col_hint) - def emit_schema_and_items( self, columns: TTableSchemaColumns, items: List[TDataItem] ) -> Iterator[DataItemWithMeta]: @@ -579,20 +569,23 @@ def emit_arrow_table( self, columns: TTableSchemaColumns, items: List[TDataItem] ) -> Iterator[DataItemWithMeta]: # Create rows for pyarrow using ordered column keys - row_keys = list(columns.keys()) - rows = [tuple(item.get(col, None) for col in row_keys) for item in items] + rows = [ + tuple(item.get(column, None) for column in list(columns.keys())) + for item in items + ] + tz = self.table_options.get("backend_kwargs", {}).get("tz", "UTC") yield dlt.mark.with_table_name( - arrow.row_tuples_to_arrow( - rows, - columns, - 
tz=self.backend_kwargs.get("tz", "UTC"), - ), + arrow.row_tuples_to_arrow(rows, columns, tz=tz), self.table, ) def infer_table_schema( - msg: RowMessage, included_columns: Optional[Set[str]] = None + msg: RowMessage, + include_deleted_ts: bool = True, + include_lsn: bool = True, + included_columns: Optional[Set[str]] = None, + **kwargs: Any, ) -> TTableSchema: """Infers the table schema from the replication message and optional hints""" # Choose the correct source based on operation type @@ -602,22 +595,28 @@ def infer_table_schema( # Filter and map columns, conditionally using `new_typeinfo` when available columns: TTableSchemaColumns = { col.column_name: _to_dlt_column_schema( - col, msg.new_typeinfo[i] if is_change else None + col, msg.new_typeinfo[i] if is_change and msg.new_typeinfo else None ) for i, col in enumerate(tuples) if not included_columns or col.column_name in included_columns } # Add replication columns - columns["_pg_lsn"] = {"data_type": "bigint", "name": "_pg_lsn", "nullable": True} - columns["_pg_deleted_ts"] = { - "data_type": "timestamp", - "name": "_pg_deleted_ts", - "nullable": True, - } + if include_lsn: + columns["_pg_lsn"] = { + "data_type": "bigint", + "name": "_pg_lsn", + "nullable": True, + } + if include_deleted_ts: + columns["_pg_deleted_ts"] = { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + } return { - "name": (msg.table.split(".")[1]), + "name": msg.table.split(".")[1], "columns": columns, } @@ -625,25 +624,28 @@ def infer_table_schema( def gen_data_item( msg: RowMessage, column_schema: TTableSchemaColumns, + lsn: int, + include_deleted_ts: bool = True, + include_lsn: bool = True, included_columns: Optional[Set[str]] = None, + **kwargs: Any, ) -> TDataItem: """Generates data item from a row message and corresponding metadata.""" - data_item: TDataItem = {} - if msg.op != Op.DELETE: - row = msg.new_tuple - else: - row = msg.old_tuple + data_item: TDataItem = {"_pg_lsn": lsn} if include_lsn else {} + + # Select the relevant row tuple based on operation type + row = msg.new_tuple if msg.op != Op.DELETE else msg.old_tuple + if msg.op == Op.DELETE and include_deleted_ts: data_item["_pg_deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) for data in row: col_name = data.column_name - if included_columns and col_name not in included_columns: - continue - data_item[col_name] = _to_dlt_val( - data, - column_schema[col_name]["data_type"], - for_delete=msg.op == Op.DELETE, - ) + if not included_columns or col_name in included_columns: + data_item[col_name] = _to_dlt_val( + data, + column_schema[col_name]["data_type"], + for_delete=msg.op == Op.DELETE, + ) return data_item diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index c8275d426..957fb41a4 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -49,8 +49,8 @@ def replicate_single_table() -> None: write_disposition="merge", primary_key="id", columns={ - "deleted_ts": {"hard_delete": True}, - "lsn": {"dedup_sort": "desc"}, + "_pg_deleted_ts": {"hard_delete": True}, + "_pg_lsn": {"dedup_sort": "desc"}, }, ) diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index e22ddc3fc..02bf306d9 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -259,10 +259,11 @@ DATA_ITEMS: List[TDataItem] = [ { - "_dlt_id": "gGjifTMTAUs5ag", - "_dlt_load_id": "1728662646.2657657", "id_y": 2, "val_y": False, + "_dlt_id": 
"gGjifTMTAUs5ag", + "_dlt_load_id": "1728662646.2657657", + "_pg_lsn": 1, }, { "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), @@ -277,6 +278,7 @@ "col11": pendulum.parse("13:26:45.176451", strict=False).time(), "col12": None, "col13": None, + "_pg_lsn": 1, }, { "id_x": 1, @@ -294,6 +296,7 @@ "col_num": Decimal(0), "col_json": [0], "_pg_deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), + "_pg_lsn": 1, }, ] diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index dbe5107de..5b379ca59 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -27,7 +27,7 @@ def test_gen_data_item(data, data_item: TDataItem): row_msg = RowMessage() parse_dict(data, row_msg) column_schema = infer_table_schema(row_msg)["columns"] - assert gen_data_item(row_msg, column_schema) == data_item + assert gen_data_item(row_msg, column_schema, lsn=1) == data_item def test_compare_schemas(): From 6b960df6c1a8c971d9c61c8d4ed5de22dca16d1f Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sat, 16 Nov 2024 17:32:09 +0100 Subject: [PATCH 60/88] feat: added optional transaction id --- sources/pg_legacy_replication/helpers.py | 15 ++++++++++++++- tests/pg_legacy_replication/cases.py | 21 +++++++++++++++++++++ tests/pg_legacy_replication/test_helpers.py | 4 ++-- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 557bd9690..bbd97730a 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -584,6 +584,7 @@ def infer_table_schema( msg: RowMessage, include_deleted_ts: bool = True, include_lsn: bool = True, + include_tx_id: bool = False, included_columns: Optional[Set[str]] = None, **kwargs: Any, ) -> TTableSchema: @@ -614,6 +615,13 @@ def infer_table_schema( "name": "_pg_deleted_ts", "nullable": True, } + if include_tx_id: + columns["_pg_tx_id"] = { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": True, + "precision": 32, + } return { "name": msg.table.split(".")[1], @@ -627,11 +635,16 @@ def gen_data_item( lsn: int, include_deleted_ts: bool = True, include_lsn: bool = True, + include_tx_id: bool = False, included_columns: Optional[Set[str]] = None, **kwargs: Any, ) -> TDataItem: """Generates data item from a row message and corresponding metadata.""" - data_item: TDataItem = {"_pg_lsn": lsn} if include_lsn else {} + data_item: TDataItem = {} + if include_lsn: + data_item["_pg_lsn"] = lsn + if include_tx_id: + data_item["_pg_tx_id"] = msg.transaction_id # Select the relevant row tuple based on operation type row = msg.new_tuple if msg.op != Op.DELETE else msg.old_tuple diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 02bf306d9..90ca4c518 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -264,6 +264,7 @@ "_dlt_id": "gGjifTMTAUs5ag", "_dlt_load_id": "1728662646.2657657", "_pg_lsn": 1, + "_pg_tx_id": 969, }, { "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), @@ -279,6 +280,7 @@ "col12": None, "col13": None, "_pg_lsn": 1, + "_pg_tx_id": 2018, }, { "id_x": 1, @@ -297,6 +299,7 @@ "col_json": [0], "_pg_deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), "_pg_lsn": 1, + "_pg_tx_id": 932, }, ] @@ -323,6 +326,12 @@ "name": "_pg_deleted_ts", "nullable": True, }, + "_pg_tx_id": { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": 
True, + "precision": 32, + }, }, }, { @@ -340,6 +349,12 @@ "name": "_pg_deleted_ts", "nullable": True, }, + "_pg_tx_id": { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": True, + "precision": 32, + }, }, }, { @@ -370,6 +385,12 @@ "name": "_pg_deleted_ts", "nullable": True, }, + "_pg_tx_id": { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": True, + "precision": 32, + }, }, }, ] diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 5b379ca59..06802fcc0 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -19,7 +19,7 @@ def test_infer_table_schema( ): row_msg = RowMessage() parse_dict(data, row_msg) - assert infer_table_schema(row_msg) == expected_schema + assert infer_table_schema(row_msg, include_tx_id=True) == expected_schema @pytest.mark.parametrize("data, data_item", zip(ROW_MESSAGES, DATA_ITEMS)) @@ -27,7 +27,7 @@ def test_gen_data_item(data, data_item: TDataItem): row_msg = RowMessage() parse_dict(data, row_msg) column_schema = infer_table_schema(row_msg)["columns"] - assert gen_data_item(row_msg, column_schema, lsn=1) == data_item + assert gen_data_item(row_msg, column_schema, lsn=1, include_tx_id=True) == data_item def test_compare_schemas(): From 9fa9d98bca9bb0efd50652dcf4754ef871682625 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sun, 17 Nov 2024 14:57:00 +0100 Subject: [PATCH 61/88] feat: added optional commit timestamp --- sources/pg_legacy_replication/helpers.py | 18 +++++++++++++++--- tests/pg_legacy_replication/cases.py | 20 +++++++++++++++++++- tests/pg_legacy_replication/test_helpers.py | 20 ++++++++++++++++++-- 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index bbd97730a..66bdf53c4 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -70,8 +70,10 @@ class ReplicationOptions(TypedDict, total=False): backend: Optional[TableBackend] backend_kwargs: Optional[Dict[str, Any]] column_hints: Optional[TTableSchemaColumns] - include_deleted_ts: Optional[bool] # Default is true include_lsn: Optional[bool] # Default is true + include_deleted_ts: Optional[bool] # Default is true + include_commit_ts: Optional[bool] + include_tx_id: Optional[bool] included_columns: Optional[Set[str]] @@ -582,8 +584,9 @@ def emit_arrow_table( def infer_table_schema( msg: RowMessage, - include_deleted_ts: bool = True, include_lsn: bool = True, + include_deleted_ts: bool = True, + include_commit_ts: bool = False, include_tx_id: bool = False, included_columns: Optional[Set[str]] = None, **kwargs: Any, @@ -615,6 +618,12 @@ def infer_table_schema( "name": "_pg_deleted_ts", "nullable": True, } + if include_commit_ts: + columns["_pg_commit_ts"] = { + "data_type": "timestamp", + "name": "_pg_commit_ts", + "nullable": True, + } if include_tx_id: columns["_pg_tx_id"] = { "data_type": "bigint", @@ -633,8 +642,9 @@ def gen_data_item( msg: RowMessage, column_schema: TTableSchemaColumns, lsn: int, - include_deleted_ts: bool = True, include_lsn: bool = True, + include_deleted_ts: bool = True, + include_commit_ts: bool = False, include_tx_id: bool = False, included_columns: Optional[Set[str]] = None, **kwargs: Any, @@ -643,6 +653,8 @@ def gen_data_item( data_item: TDataItem = {} if include_lsn: data_item["_pg_lsn"] = lsn + if include_commit_ts: + data_item["_pg_commit_ts"] = _epoch_micros_to_datetime(msg.commit_time) if 
include_tx_id: data_item["_pg_tx_id"] = msg.transaction_id diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 90ca4c518..7a1e436e5 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -264,6 +264,7 @@ "_dlt_id": "gGjifTMTAUs5ag", "_dlt_load_id": "1728662646.2657657", "_pg_lsn": 1, + "_pg_commit_ts": pendulum.parse("2024-10-11T16:04:06.949062+00:00"), "_pg_tx_id": 969, }, { @@ -280,6 +281,7 @@ "col12": None, "col13": None, "_pg_lsn": 1, + "_pg_commit_ts": pendulum.parse("2024-10-21T09:37:03.666542+00:00"), "_pg_tx_id": 2018, }, { @@ -297,8 +299,9 @@ "col_tstz": pendulum.parse("1970-01-01T00:00:00+00:00"), "col_num": Decimal(0), "col_json": [0], - "_pg_deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), "_pg_lsn": 1, + "_pg_deleted_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), + "_pg_commit_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), "_pg_tx_id": 932, }, ] @@ -326,6 +329,11 @@ "name": "_pg_deleted_ts", "nullable": True, }, + "_pg_commit_ts": { + "data_type": "timestamp", + "name": "_pg_commit_ts", + "nullable": True, + }, "_pg_tx_id": { "data_type": "bigint", "name": "_pg_tx_id", @@ -349,6 +357,11 @@ "name": "_pg_deleted_ts", "nullable": True, }, + "_pg_commit_ts": { + "data_type": "timestamp", + "name": "_pg_commit_ts", + "nullable": True, + }, "_pg_tx_id": { "data_type": "bigint", "name": "_pg_tx_id", @@ -385,6 +398,11 @@ "name": "_pg_deleted_ts", "nullable": True, }, + "_pg_commit_ts": { + "data_type": "timestamp", + "name": "_pg_commit_ts", + "nullable": True, + }, "_pg_tx_id": { "data_type": "bigint", "name": "_pg_tx_id", diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 06802fcc0..43fe3547c 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -19,7 +19,14 @@ def test_infer_table_schema( ): row_msg = RowMessage() parse_dict(data, row_msg) - assert infer_table_schema(row_msg, include_tx_id=True) == expected_schema + assert ( + infer_table_schema( + row_msg, + include_commit_ts=True, + include_tx_id=True, + ) + == expected_schema + ) @pytest.mark.parametrize("data, data_item", zip(ROW_MESSAGES, DATA_ITEMS)) @@ -27,7 +34,16 @@ def test_gen_data_item(data, data_item: TDataItem): row_msg = RowMessage() parse_dict(data, row_msg) column_schema = infer_table_schema(row_msg)["columns"] - assert gen_data_item(row_msg, column_schema, lsn=1, include_tx_id=True) == data_item + assert ( + gen_data_item( + row_msg, + column_schema, + lsn=1, + include_commit_ts=True, + include_tx_id=True, + ) + == data_item + ) def test_compare_schemas(): From 1947029ef24fd869b693e26d848d4684141cd8bd Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 18 Nov 2024 11:14:16 +0100 Subject: [PATCH 62/88] fix: never handled missing type and added text oid mapping --- sources/pg_legacy_replication/schema_types.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 40298fda6..f7a4ef4f4 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -48,6 +48,7 @@ _MISSING_TYPES: Dict[str, TDataType] = { "real": "double", + "text": "text", "timestamp without time zone": "timestamp", } # FIXME Missing types for old postgres versions @@ -179,6 +180,8 @@ def _to_dlt_val( datum = val.WhichOneof("datum") if datum is None: return 
_DUMMY_VALS[data_type] if for_delete else None + if datum == "datum_missing": + return None raw_value = getattr(val, datum) if data_type in data_type_handlers: From 7a7ba30a1b97dfe705ec592c1db0c08dd8c412d3 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 20 Nov 2024 15:06:33 +0100 Subject: [PATCH 63/88] fix: added some logging and bug fixes --- sources/pg_legacy_replication/helpers.py | 5 ++++- sources/pg_legacy_replication/schema_types.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 66bdf53c4..c1e7a1be0 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -490,6 +490,7 @@ def __iter__(self) -> Iterator[TableItems]: try: cur.consume_stream(consumer) except StopReplication: # completed batch or reached `upto_lsn` + logger.info("Flushing batch of %s events", self.target_batch_size) yield from self.flush_batch(cur, consumer) finally: cur.connection.close() @@ -500,6 +501,7 @@ def flush_batch( last_commit_lsn = consumer.last_commit_lsn consumed_all = consumer.consumed_all for table, data_items in consumer.data_items.items(): + logger.info("Flushing %s events for table '%s'", len(data_items), table) yield TableItems(consumer.last_table_schema[table], data_items) if consumed_all: cur.send_feedback( @@ -536,7 +538,8 @@ def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: columns = schema["columns"] if column_hints := self.table_options.get("column_hints"): for col_name, col_hint in column_hints.items(): - columns[col_name] = merge_column(columns.get(col_name, {}), col_hint) + if col_name in columns: + columns[col_name] = merge_column(columns[col_name], col_hint) # Process based on backend data = table_items.items diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index f7a4ef4f4..692adb0e9 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -34,6 +34,7 @@ 20: "bigint", 21: "smallint", 23: "integer", + 25: "text", 700: "real", 701: "double precision", 1043: "character varying", From a7525810dcbe210e980e68134c318e31bcb9d028 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 27 Nov 2024 16:34:33 +0100 Subject: [PATCH 64/88] chore: basic refactoring --- sources/pg_legacy_replication/__init__.py | 4 +-- sources/pg_legacy_replication/helpers.py | 35 ++++++++++--------- sources/pg_legacy_replication/schema_types.py | 5 ++- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index e744fba5b..949f4b76d 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -1,6 +1,6 @@ """Replicates postgres tables in batch using logical decoding.""" -from typing import Any, Callable, Dict, Sequence, Optional, Iterable, Union +from typing import Any, Callable, Dict, Iterable, Optional, Sequence, Union import dlt from dlt.extract import DltResource @@ -109,7 +109,7 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: wal_reader = replication_resource(slot_name) for table in table_names: - table_opts = table_options.get(table) if table_options else {} + table_opts = table_options.get(table, {}) if table_options else {} yield dlt.transformer( _create_table_dispatch(table=table, table_options=table_opts), data_from=wal_reader, diff --git 
a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index c1e7a1be0..9f951b4d6 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -2,18 +2,18 @@ from collections import defaultdict from dataclasses import dataclass, field from typing import ( - Optional, + Any, + Callable, Dict, - Set, + Iterable, Iterator, - Union, List, - Sequence, - Any, - Iterable, - Callable, NamedTuple, + Optional, + Sequence, + Set, TypedDict, + Union, ) import dlt @@ -23,10 +23,10 @@ from dlt.common.schema.typing import TColumnSchema, TTableSchema, TTableSchemaColumns from dlt.common.schema.utils import merge_column from dlt.common.typing import TDataItem -from dlt.extract import DltSource, DltResource +from dlt.extract import DltResource, DltSource from dlt.extract.items import DataItemWithMeta from dlt.sources.credentials import ConnectionStringCredentials -from psycopg2.extensions import cursor, connection as ConnectionExt +from psycopg2.extensions import connection as ConnectionExt, cursor from psycopg2.extras import ( LogicalReplicationConnection, ReplicationCursor, @@ -36,20 +36,21 @@ # Favoring 1.x over 0.5.x imports try: - from dlt.common.libs.sql_alchemy import Engine, Table, MetaData # type: ignore[attr-defined] + from dlt.common.libs.sql_alchemy import Engine, MetaData, Table # type: ignore[attr-defined] except ImportError: from sqlalchemy import Engine, Table, MetaData + from sqlalchemy import Connection as ConnectionSqla, event try: from dlt.sources.sql_database import ( # type: ignore[import-not-found] - sql_table, - engine_from_credentials, - TQueryAdapter, - TTypeAdapter, ReflectionLevel, TableBackend, + TQueryAdapter, + TTypeAdapter, arrow_helpers as arrow, + engine_from_credentials, + sql_table, ) except ImportError: from ..sql_database import ( # type: ignore[import-untyped] @@ -516,10 +517,10 @@ def flush_batch( self.generated_all = consumed_all +@dataclass class BackendHandler: - def __init__(self, table: str, table_options: ReplicationOptions): - self.table = table - self.table_options = table_options + table: str + table_options: ReplicationOptions def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: """Yields replication messages from ItemGenerator. 
diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 692adb0e9..d4d82474c 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -1,10 +1,9 @@ import re from functools import lru_cache -from typing import Optional, Any, Dict, Callable, Tuple +from typing import Any, Callable, Dict, Optional, Tuple import pendulum -from dlt.common import Decimal -from dlt.common import logger +from dlt.common import Decimal, logger from dlt.common.data_types.type_helpers import coerce_value from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import TColumnSchema, TColumnType From 4184ca92e42bea16bccc823fe7753edff6ea5809 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 16 Dec 2024 14:21:27 +0100 Subject: [PATCH 65/88] fix: minor corrections --- sources/pg_legacy_replication/helpers.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 9f951b4d6..95db8d3f1 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -375,8 +375,7 @@ def process_msg(self, msg: ReplicationMessage) -> None: row_msg = RowMessage() try: row_msg.ParseFromString(msg.payload) - if row_msg.op == Op.UNKNOWN: - raise AssertionError(f"Unsupported operation : {row_msg}") + assert row_msg.op != Op.UNKNOWN, f"Unsupported operation : {row_msg}" if row_msg.op == Op.BEGIN: self.last_commit_ts = _epoch_micros_to_datetime(row_msg.commit_time) @@ -581,7 +580,7 @@ def emit_arrow_table( ] tz = self.table_options.get("backend_kwargs", {}).get("tz", "UTC") yield dlt.mark.with_table_name( - arrow.row_tuples_to_arrow(rows, columns, tz=tz), + arrow.row_tuples_to_arrow(rows, columns=columns, tz=tz), self.table, ) From 3c7232f0fad3b7550b61525893297aec2c492a8b Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 16 Dec 2024 15:22:52 +0100 Subject: [PATCH 66/88] chore: reverting back to prev state --- tests/utils.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/utils.py b/tests/utils.py index cb795fd10..35287898c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -228,14 +228,11 @@ def assert_query_data( def assert_load_info(info: LoadInfo, expected_load_packages: int = 1) -> None: """Asserts that expected number of packages was loaded and there are no failed jobs""" - try: - assert len(info.loads_ids) == expected_load_packages - # all packages loaded - assert all(package.state == "loaded" for package in info.load_packages) is True - # no failed jobs in any of the packages - info.raise_on_failed_jobs() - except AssertionError: - raise + assert len(info.loads_ids) == expected_load_packages + # all packages loaded + assert all(package.state == "loaded" for package in info.load_packages) is True + # no failed jobs in any of the packages + info.raise_on_failed_jobs() def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: @@ -380,4 +377,4 @@ def data_item_length(data: TDataItem) -> int: elif isinstance(data, pa.Table) or isinstance(data, pa.RecordBatch): return data.num_rows else: - raise TypeError("Unsupported data type.") + raise TypeError("Unsupported data type.") \ No newline at end of file From c8f1ad22efe1fc11c893abe0664c6420243db9ad Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 16 Dec 2024 17:49:00 +0100 Subject: [PATCH 67/88] chore: rebasing 1.x branch onto my own --- 
poetry.lock | 163 ++++++++++++++++-- pyproject.toml | 1 - sources/pg_legacy_replication/helpers.py | 38 ++-- sources/pg_legacy_replication/schema_types.py | 26 +-- tests/pg_legacy_replication/cases.py | 19 +- tests/pg_legacy_replication/test_helpers.py | 8 +- tests/utils.py | 2 +- 7 files changed, 175 insertions(+), 82 deletions(-) diff --git a/poetry.lock b/poetry.lock index 6c1979521..b10a9c72b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "adlfs" @@ -962,6 +962,93 @@ files = [ {file = "constantly-23.10.4.tar.gz", hash = "sha256:aa92b70a33e2ac0bb33cd745eb61776594dc48764b06c35e0efd050b7f1c7cbd"}, ] +[[package]] +name = "coverage" +version = "7.6.1" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "coverage-7.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b06079abebbc0e89e6163b8e8f0e16270124c154dc6e4a47b413dd538859af16"}, + {file = "coverage-7.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf4b19715bccd7ee27b6b120e7e9dd56037b9c0681dcc1adc9ba9db3d417fa36"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61c0abb4c85b095a784ef23fdd4aede7a2628478e7baba7c5e3deba61070a02"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd21f6ae3f08b41004dfb433fa895d858f3f5979e7762d052b12aef444e29afc"}, + {file = "coverage-7.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f59d57baca39b32db42b83b2a7ba6f47ad9c394ec2076b084c3f029b7afca23"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a1ac0ae2b8bd743b88ed0502544847c3053d7171a3cff9228af618a068ed9c34"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e6a08c0be454c3b3beb105c0596ebdc2371fab6bb90c0c0297f4e58fd7e1012c"}, + {file = "coverage-7.6.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f5796e664fe802da4f57a168c85359a8fbf3eab5e55cd4e4569fbacecc903959"}, + {file = "coverage-7.6.1-cp310-cp310-win32.whl", hash = "sha256:7bb65125fcbef8d989fa1dd0e8a060999497629ca5b0efbca209588a73356232"}, + {file = "coverage-7.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:3115a95daa9bdba70aea750db7b96b37259a81a709223c8448fa97727d546fe0"}, + {file = "coverage-7.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7dea0889685db8550f839fa202744652e87c60015029ce3f60e006f8c4462c93"}, + {file = "coverage-7.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed37bd3c3b063412f7620464a9ac1314d33100329f39799255fb8d3027da50d3"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85f5e9a5f8b73e2350097c3756ef7e785f55bd71205defa0bfdaf96c31616ff"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bc572be474cafb617672c43fe989d6e48d3c83af02ce8de73fff1c6bb3c198d"}, + {file = "coverage-7.6.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0420b573964c760df9e9e86d1a9a622d0d27f417e1a949a8a66dd7bcee7bc6"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:1f4aa8219db826ce6be7099d559f8ec311549bfc4046f7f9fe9b5cea5c581c56"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fc5a77d0c516700ebad189b587de289a20a78324bc54baee03dd486f0855d234"}, + {file = "coverage-7.6.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b48f312cca9621272ae49008c7f613337c53fadca647d6384cc129d2996d1133"}, + {file = "coverage-7.6.1-cp311-cp311-win32.whl", hash = "sha256:1125ca0e5fd475cbbba3bb67ae20bd2c23a98fac4e32412883f9bcbaa81c314c"}, + {file = "coverage-7.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:8ae539519c4c040c5ffd0632784e21b2f03fc1340752af711f33e5be83a9d6c6"}, + {file = "coverage-7.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:95cae0efeb032af8458fc27d191f85d1717b1d4e49f7cb226cf526ff28179778"}, + {file = "coverage-7.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5621a9175cf9d0b0c84c2ef2b12e9f5f5071357c4d2ea6ca1cf01814f45d2391"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:260933720fdcd75340e7dbe9060655aff3af1f0c5d20f46b57f262ab6c86a5e8"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07e2ca0ad381b91350c0ed49d52699b625aab2b44b65e1b4e02fa9df0e92ad2d"}, + {file = "coverage-7.6.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44fee9975f04b33331cb8eb272827111efc8930cfd582e0320613263ca849ca"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877abb17e6339d96bf08e7a622d05095e72b71f8afd8a9fefc82cf30ed944163"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e0cadcf6733c09154b461f1ca72d5416635e5e4ec4e536192180d34ec160f8a"}, + {file = "coverage-7.6.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3c02d12f837d9683e5ab2f3d9844dc57655b92c74e286c262e0fc54213c216d"}, + {file = "coverage-7.6.1-cp312-cp312-win32.whl", hash = "sha256:e05882b70b87a18d937ca6768ff33cc3f72847cbc4de4491c8e73880766718e5"}, + {file = "coverage-7.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:b5d7b556859dd85f3a541db6a4e0167b86e7273e1cdc973e5b175166bb634fdb"}, + {file = "coverage-7.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a4acd025ecc06185ba2b801f2de85546e0b8ac787cf9d3b06e7e2a69f925b106"}, + {file = "coverage-7.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a6d3adcf24b624a7b778533480e32434a39ad8fa30c315208f6d3e5542aeb6e9"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0c212c49b6c10e6951362f7c6df3329f04c2b1c28499563d4035d964ab8e08c"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e81d7a3e58882450ec4186ca59a3f20a5d4440f25b1cff6f0902ad890e6748a"}, + {file = "coverage-7.6.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78b260de9790fd81e69401c2dc8b17da47c8038176a79092a89cb2b7d945d060"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a78d169acd38300060b28d600344a803628c3fd585c912cacc9ea8790fe96862"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2c09f4ce52cb99dd7505cd0fc8e0e37c77b87f46bc9c1eb03fe3bc9991085388"}, + {file = "coverage-7.6.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6878ef48d4227aace338d88c48738a4258213cd7b74fd9a3d4d7582bb1d8a155"}, + {file = 
"coverage-7.6.1-cp313-cp313-win32.whl", hash = "sha256:44df346d5215a8c0e360307d46ffaabe0f5d3502c8a1cefd700b34baf31d411a"}, + {file = "coverage-7.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:8284cf8c0dd272a247bc154eb6c95548722dce90d098c17a883ed36e67cdb129"}, + {file = "coverage-7.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d3296782ca4eab572a1a4eca686d8bfb00226300dcefdf43faa25b5242ab8a3e"}, + {file = "coverage-7.6.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:502753043567491d3ff6d08629270127e0c31d4184c4c8d98f92c26f65019962"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a89ecca80709d4076b95f89f308544ec8f7b4727e8a547913a35f16717856cb"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a318d68e92e80af8b00fa99609796fdbcdfef3629c77c6283566c6f02c6d6704"}, + {file = "coverage-7.6.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13b0a73a0896988f053e4fbb7de6d93388e6dd292b0d87ee51d106f2c11b465b"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4421712dbfc5562150f7554f13dde997a2e932a6b5f352edcce948a815efee6f"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:166811d20dfea725e2e4baa71fffd6c968a958577848d2131f39b60043400223"}, + {file = "coverage-7.6.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:225667980479a17db1048cb2bf8bfb39b8e5be8f164b8f6628b64f78a72cf9d3"}, + {file = "coverage-7.6.1-cp313-cp313t-win32.whl", hash = "sha256:170d444ab405852903b7d04ea9ae9b98f98ab6d7e63e1115e82620807519797f"}, + {file = "coverage-7.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9f222de8cded79c49bf184bdbc06630d4c58eec9459b939b4a690c82ed05657"}, + {file = "coverage-7.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6db04803b6c7291985a761004e9060b2bca08da6d04f26a7f2294b8623a0c1a0"}, + {file = "coverage-7.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f1adfc8ac319e1a348af294106bc6a8458a0f1633cc62a1446aebc30c5fa186a"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a95324a9de9650a729239daea117df21f4b9868ce32e63f8b650ebe6cef5595b"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b43c03669dc4618ec25270b06ecd3ee4fa94c7f9b3c14bae6571ca00ef98b0d3"}, + {file = "coverage-7.6.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8929543a7192c13d177b770008bc4e8119f2e1f881d563fc6b6305d2d0ebe9de"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:a09ece4a69cf399510c8ab25e0950d9cf2b42f7b3cb0374f95d2e2ff594478a6"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9054a0754de38d9dbd01a46621636689124d666bad1936d76c0341f7d71bf569"}, + {file = "coverage-7.6.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0dbde0f4aa9a16fa4d754356a8f2e36296ff4d83994b2c9d8398aa32f222f989"}, + {file = "coverage-7.6.1-cp38-cp38-win32.whl", hash = "sha256:da511e6ad4f7323ee5702e6633085fb76c2f893aaf8ce4c51a0ba4fc07580ea7"}, + {file = "coverage-7.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:3f1156e3e8f2872197af3840d8ad307a9dd18e615dc64d9ee41696f287c57ad8"}, + {file = "coverage-7.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:abd5fd0db5f4dc9289408aaf34908072f805ff7792632250dcb36dc591d24255"}, + {file = "coverage-7.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:547f45fa1a93154bd82050a7f3cddbc1a7a4dd2a9bf5cb7d06f4ae29fe94eaf8"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645786266c8f18a931b65bfcefdbf6952dd0dea98feee39bd188607a9d307ed2"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e0b2df163b8ed01d515807af24f63de04bebcecbd6c3bfeff88385789fdf75a"}, + {file = "coverage-7.6.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:609b06f178fe8e9f89ef676532760ec0b4deea15e9969bf754b37f7c40326dbc"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:702855feff378050ae4f741045e19a32d57d19f3e0676d589df0575008ea5004"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2bdb062ea438f22d99cba0d7829c2ef0af1d768d1e4a4f528087224c90b132cb"}, + {file = "coverage-7.6.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9c56863d44bd1c4fe2abb8a4d6f5371d197f1ac0ebdee542f07f35895fc07f36"}, + {file = "coverage-7.6.1-cp39-cp39-win32.whl", hash = "sha256:6e2cd258d7d927d09493c8df1ce9174ad01b381d4729a9d8d4e38670ca24774c"}, + {file = "coverage-7.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:06a737c882bd26d0d6ee7269b20b12f14a8704807a01056c80bb881a4b2ce6ca"}, + {file = "coverage-7.6.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:e9a6e0eb86070e8ccaedfbd9d38fec54864f3125ab95419970575b42af7541df"}, + {file = "coverage-7.6.1.tar.gz", hash = "sha256:953510dfb7b12ab69d20135a0662397f077c59b1e6379a768e97c59d852ee51d"}, +] + +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + +[package.extras] +toml = ["tomli"] + [[package]] name = "cryptography" version = "41.0.4" @@ -3031,6 +3118,21 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "mypy-protobuf" +version = "3.6.0" +description = "Generate mypy stub files from protobuf specs" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mypy-protobuf-3.6.0.tar.gz", hash = "sha256:02f242eb3409f66889f2b1a3aa58356ec4d909cdd0f93115622e9e70366eca3c"}, + {file = "mypy_protobuf-3.6.0-py3-none-any.whl", hash = "sha256:56176e4d569070e7350ea620262478b49b7efceba4103d468448f1d21492fd6c"}, +] + +[package.dependencies] +protobuf = ">=4.25.3" +types-protobuf = ">=4.24" + [[package]] name = "natsort" version = "8.4.0" @@ -3812,24 +3914,22 @@ testing = ["google-api-core[grpc] (>=1.31.5)"] [[package]] name = "protobuf" -version = "4.24.4" +version = "4.25.5" description = "" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "protobuf-4.24.4-cp310-abi3-win32.whl", hash = "sha256:ec9912d5cb6714a5710e28e592ee1093d68c5ebfeda61983b3f40331da0b1ebb"}, - {file = "protobuf-4.24.4-cp310-abi3-win_amd64.whl", hash = "sha256:1badab72aa8a3a2b812eacfede5020472e16c6b2212d737cefd685884c191085"}, - {file = "protobuf-4.24.4-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8e61a27f362369c2f33248a0ff6896c20dcd47b5d48239cb9720134bef6082e4"}, - {file = "protobuf-4.24.4-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:bffa46ad9612e6779d0e51ae586fde768339b791a50610d85eb162daeb23661e"}, - {file = 
"protobuf-4.24.4-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:b493cb590960ff863743b9ff1452c413c2ee12b782f48beca77c8da3e2ffe9d9"}, - {file = "protobuf-4.24.4-cp37-cp37m-win32.whl", hash = "sha256:dbbed8a56e56cee8d9d522ce844a1379a72a70f453bde6243e3c86c30c2a3d46"}, - {file = "protobuf-4.24.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6b7d2e1c753715dcfe9d284a25a52d67818dd43c4932574307daf836f0071e37"}, - {file = "protobuf-4.24.4-cp38-cp38-win32.whl", hash = "sha256:02212557a76cd99574775a81fefeba8738d0f668d6abd0c6b1d3adcc75503dbe"}, - {file = "protobuf-4.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:2fa3886dfaae6b4c5ed2730d3bf47c7a38a72b3a1f0acb4d4caf68e6874b947b"}, - {file = "protobuf-4.24.4-cp39-cp39-win32.whl", hash = "sha256:b77272f3e28bb416e2071186cb39efd4abbf696d682cbb5dc731308ad37fa6dd"}, - {file = "protobuf-4.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:9fee5e8aa20ef1b84123bb9232b3f4a5114d9897ed89b4b8142d81924e05d79b"}, - {file = "protobuf-4.24.4-py3-none-any.whl", hash = "sha256:80797ce7424f8c8d2f2547e2d42bfbb6c08230ce5832d6c099a37335c9c90a92"}, - {file = "protobuf-4.24.4.tar.gz", hash = "sha256:5a70731910cd9104762161719c3d883c960151eea077134458503723b60e3667"}, + {file = "protobuf-4.25.5-cp310-abi3-win32.whl", hash = "sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8"}, + {file = "protobuf-4.25.5-cp310-abi3-win_amd64.whl", hash = "sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea"}, + {file = "protobuf-4.25.5-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173"}, + {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d"}, + {file = "protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331"}, + {file = "protobuf-4.25.5-cp38-cp38-win32.whl", hash = "sha256:98d8d8aa50de6a2747efd9cceba361c9034050ecce3e09136f90de37ddba66e1"}, + {file = "protobuf-4.25.5-cp38-cp38-win_amd64.whl", hash = "sha256:b0234dd5a03049e4ddd94b93400b67803c823cfc405689688f59b34e0742381a"}, + {file = "protobuf-4.25.5-cp39-cp39-win32.whl", hash = "sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f"}, + {file = "protobuf-4.25.5-cp39-cp39-win_amd64.whl", hash = "sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45"}, + {file = "protobuf-4.25.5-py3-none-any.whl", hash = "sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41"}, + {file = "protobuf-4.25.5.tar.gz", hash = "sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584"}, ] [[package]] @@ -4489,6 +4589,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-cov" +version = "5.0.0" +description = "Pytest plugin for measuring coverage." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-cov-5.0.0.tar.gz", hash = "sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857"}, + {file = "pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652"}, +] + +[package.dependencies] +coverage = {version = ">=5.2.1", extras = ["toml"]} +pytest = ">=4.6" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] + [[package]] name = "pytest-forked" version = "1.6.0" @@ -5650,6 +5768,17 @@ files = [ {file = "twisted_iocpsupport-1.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:300437af17396a945a58dcfffd77863303a8b6d9e65c6e81f1d2eed55b50d444"}, ] +[[package]] +name = "types-protobuf" +version = "5.29.1.20241207" +description = "Typing stubs for protobuf" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types_protobuf-5.29.1.20241207-py3-none-any.whl", hash = "sha256:92893c42083e9b718c678badc0af7a9a1307b92afe1599e5cba5f3d35b668b2f"}, + {file = "types_protobuf-5.29.1.20241207.tar.gz", hash = "sha256:2ebcadb8ab3ef2e3e2f067e0882906d64ba0dc65fc5b0fd7a8b692315b4a0be9"}, +] + [[package]] name = "types-psycopg2" version = "2.9.21.20240218" @@ -6472,4 +6601,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "e216234bd35e71ef0c8e5a498c2cc616df417c5b14658b00aed9d935ba5a782e" +content-hash = "38baefccadc2b1ebc8c7b4f8702035dd2c60d88c5cd582b148a1cd01c52860ca" diff --git a/pyproject.toml b/pyproject.toml index 5ebf64bd4..75beddcc8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,6 @@ twisted = "22.10.0" pytest-forked = "^1.6.0" pendulum = "^3.0.0" types-protobuf = "^5.27.0.20240907" -devtools = "^0.12.2" pytest-cov = "^5.0.0" mypy-protobuf = "^3.6.0" diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 95db8d3f1..9a9223d07 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -34,34 +34,18 @@ StopReplication, ) -# Favoring 1.x over 0.5.x imports -try: - from dlt.common.libs.sql_alchemy import Engine, MetaData, Table # type: ignore[attr-defined] -except ImportError: - from sqlalchemy import Engine, Table, MetaData +from dlt.common.libs.sql_alchemy import Engine, MetaData, Table from sqlalchemy import Connection as ConnectionSqla, event - -try: - from dlt.sources.sql_database import ( # type: ignore[import-not-found] - ReflectionLevel, - TableBackend, - TQueryAdapter, - TTypeAdapter, - arrow_helpers as arrow, - engine_from_credentials, - sql_table, - ) -except ImportError: - from ..sql_database import ( # type: ignore[import-untyped] - sql_table, - engine_from_credentials, - TQueryAdapter, - TTypeAdapter, - ReflectionLevel, - TableBackend, - arrow_helpers as arrow, - ) +from dlt.sources.sql_database import ( + ReflectionLevel, + TableBackend, + TQueryAdapter, + TTypeAdapter, + arrow_helpers as arrow, + engine_from_credentials, + sql_table, +) from .pg_logicaldec_pb2 import Op, RowMessage, TypeInfo from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val @@ -84,7 +68,7 @@ class SqlTableOptions(TypedDict, total=False): chunk_size: Optional[int] defer_table_reflect: Optional[bool] detect_precision_hints: Optional[bool] - included_columns: Optional[Sequence[str]] + included_columns: Optional[List[str]] metadata: Optional[MetaData] query_adapter_callback: Optional[TQueryAdapter] reflection_level: 
Optional[ReflectionLevel] diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index d4d82474c..8d0af0fe0 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -8,6 +8,7 @@ from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import TColumnSchema, TColumnType from dlt.destinations import postgres +from dlt.destinations.impl.postgres.factory import PostgresTypeMapper from .pg_logicaldec_pb2 import DatumMessage, TypeInfo @@ -15,8 +16,7 @@ "bigint": 0, "binary": b" ", "bool": True, - "complex": [0], - "json": [0], # type: ignore[dict-item] + "json": [0], "date": pendulum.Date(1970, 1, 1), "decimal": Decimal(0), "double": 0.0, @@ -101,19 +101,11 @@ def _get_precision_and_scale( return None, None -# FIXME Hack to get it to work with 0.5.x and 1.x @lru_cache(maxsize=None) def _from_db_type() -> Callable[[str, Optional[int], Optional[int]], TColumnType]: - try: - from dlt.destinations.impl.postgres.factory import PostgresTypeMapper # type: ignore - - type_mapper = PostgresTypeMapper(postgres().capabilities()) - return type_mapper.from_destination_type # type: ignore[no-any-return] - except ImportError: - from dlt.destinations.impl.postgres.postgres import PostgresTypeMapper - - type_mapper = PostgresTypeMapper(postgres().capabilities()) - return type_mapper.from_db_type # type: ignore[no-any-return] + """Gets column type from db type""" + type_mapper = PostgresTypeMapper(postgres().capabilities()) + return type_mapper.from_destination_type def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: @@ -188,10 +180,4 @@ def _to_dlt_val( return data_type_handlers[data_type](raw_value) raw_type = _DATUM_RAW_TYPES[datum] - try: - return coerce_value(data_type, raw_type, raw_value) - except ValueError: - # FIXME Hack to get it to work with 0.5.x and 1.x - if data_type == "json": - return coerce_value("complex", raw_type, raw_value) - raise + return coerce_value(data_type, raw_type, raw_value) diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 7a1e436e5..161a0dd12 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -16,7 +16,7 @@ "col7": b"binary data \n \r \x8e", # "col8": 2**56 + 92093890840, # TODO: uncommment and make it work "col9": { - "complex": [1, 2, 3, "a"], + "json": [1, 2, 3, "a"], "link": ( "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6" " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" @@ -51,7 +51,7 @@ {"name": "col6", "data_type": "decimal", "nullable": False}, {"name": "col7", "data_type": "binary", "nullable": False}, # {"name": "col8", "data_type": "wei", "nullable": False}, - {"name": "col9", "data_type": "json", "nullable": False, "variant": True}, # type: ignore[typeddict-item] + {"name": "col9", "data_type": "json", "nullable": False, "variant": True}, {"name": "col10", "data_type": "date", "nullable": False}, {"name": "col11", "data_type": "time", "nullable": False}, {"name": "col1_null", "data_type": "bigint", "nullable": True}, @@ -62,7 +62,7 @@ {"name": "col6_null", "data_type": "decimal", "nullable": True}, {"name": "col7_null", "data_type": "binary", "nullable": True}, # {"name": "col8_null", "data_type": "wei", "nullable": True}, - {"name": "col9_null", "data_type": "json", "nullable": True, "variant": True}, # type: ignore[typeddict-item] + {"name": "col9_null", "data_type": "json", "nullable": True, "variant": True}, 
{"name": "col10_null", "data_type": "date", "nullable": True}, {"name": "col11_null", "data_type": "time", "nullable": True}, { @@ -94,11 +94,6 @@ {"name": "col11_precision", "data_type": "time", "precision": 6, "nullable": False}, ] -if "complex" in DATA_TYPES: - for col_schema in TABLE_UPDATE: - if col_schema["data_type"] == "json": - col_schema["data_type"] = "complex" - TABLE_UPDATE_COLUMNS_SCHEMA: TTableSchemaColumns = {t["name"]: t for t in TABLE_UPDATE} ROW_MESSAGES: List[dict] = [ @@ -165,7 +160,7 @@ "columnType": 3802, "datumString": ( '{"link": "?commen\\ntU\\nrn=urn%3Ali%3Acomment%3A%28acti\\n \\u0006 \\\\vity%3A69\'08444473\\n\\n551163392' - '%2C6n \\r \x8e9085", "complex": [1, 2, 3, "a"]}' + '%2C6n \\r \x8e9085", "json": [1, 2, 3, "a"]}' ), }, { @@ -270,7 +265,7 @@ { "col4": pendulum.parse("2022-05-23T13:26:45.176451+00:00"), "col9": { - "complex": [1, 2, 3, "a"], + "json": [1, 2, 3, "a"], "link": ( "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6" " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" @@ -346,7 +341,7 @@ "name": "items", "columns": { "col4": {"data_type": "timestamp", "name": "col4", "nullable": False}, - "col9": {"data_type": "complex", "name": "col9", "nullable": False}, + "col9": {"data_type": "json", "name": "col9", "nullable": False}, "col10": {"data_type": "date", "name": "col10", "nullable": False}, "col11": {"data_type": "time", "name": "col11", "nullable": False}, "col12": {"data_type": "timestamp", "name": "col12", "nullable": True}, @@ -391,7 +386,7 @@ "col_ts": {"data_type": "timestamp", "name": "col_ts"}, "col_tstz": {"data_type": "timestamp", "name": "col_tstz"}, "col_num": {"data_type": "decimal", "name": "col_num"}, - "col_json": {"data_type": "complex", "name": "col_json"}, + "col_json": {"data_type": "json", "name": "col_json"}, "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, "_pg_deleted_ts": { "data_type": "timestamp", diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index 43fe3547c..cfd813966 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -68,7 +68,7 @@ def test_compare_schemas(): "nullable": False, }, "col7": {"name": "col7", "data_type": "binary", "nullable": False}, - "col9": {"name": "col9", "data_type": "complex", "nullable": False}, + "col9": {"name": "col9", "data_type": "json", "nullable": False}, "col10": {"name": "col10", "data_type": "date", "nullable": False}, "col11": {"name": "col11", "data_type": "time", "nullable": False}, "col1_null": { @@ -95,7 +95,7 @@ def test_compare_schemas(): "col7_null": {"name": "col7_null", "data_type": "binary", "nullable": True}, "col9_null": { "name": "col9_null", - "data_type": "complex", + "data_type": "json", "nullable": True, }, "col10_null": {"name": "col10_null", "data_type": "date", "nullable": True}, @@ -165,7 +165,7 @@ def test_compare_schemas(): "col5": {"name": "col5", "data_type": "text"}, "col6": {"name": "col6", "data_type": "decimal"}, "col7": {"name": "col7", "data_type": "binary"}, - "col9": {"name": "col9", "data_type": "complex"}, + "col9": {"name": "col9", "data_type": "json"}, "col10": {"name": "col10", "data_type": "date"}, "col11": {"name": "col11", "data_type": "time"}, "col1_null": {"name": "col1_null", "data_type": "bigint", "precision": 64}, @@ -175,7 +175,7 @@ def test_compare_schemas(): "col5_null": {"name": "col5_null", "data_type": "text"}, "col6_null": {"name": "col6_null", "data_type": "decimal"}, "col7_null": {"name": 
"col7_null", "data_type": "binary"}, - "col9_null": {"name": "col9_null", "data_type": "complex"}, + "col9_null": {"name": "col9_null", "data_type": "json"}, "col10_null": {"name": "col10_null", "data_type": "date"}, "col11_null": {"name": "col11_null", "data_type": "time"}, "col1_precision": { diff --git a/tests/utils.py b/tests/utils.py index 35287898c..be845765b 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -377,4 +377,4 @@ def data_item_length(data: TDataItem) -> int: elif isinstance(data, pa.Table) or isinstance(data, pa.RecordBatch): return data.num_rows else: - raise TypeError("Unsupported data type.") \ No newline at end of file + raise TypeError("Unsupported data type.") From 7024ce773536d7280089808a2534846976868dfc Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 19 Dec 2024 17:07:28 +0100 Subject: [PATCH 68/88] fix: corrected bug regarding column names --- sources/pg_legacy_replication/helpers.py | 47 +++++++++++-------- sources/pg_legacy_replication/schema_types.py | 15 +++--- tests/pg_legacy_replication/cases.py | 21 +++++++-- 3 files changed, 50 insertions(+), 33 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 9a9223d07..0c2cca983 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -19,6 +19,7 @@ import dlt import psycopg2 from dlt.common import logger +from dlt.common.libs.sql_alchemy import Engine, MetaData, Table, sa from dlt.common.pendulum import pendulum from dlt.common.schema.typing import TColumnSchema, TTableSchema, TTableSchemaColumns from dlt.common.schema.utils import merge_column @@ -26,17 +27,6 @@ from dlt.extract import DltResource, DltSource from dlt.extract.items import DataItemWithMeta from dlt.sources.credentials import ConnectionStringCredentials -from psycopg2.extensions import connection as ConnectionExt, cursor -from psycopg2.extras import ( - LogicalReplicationConnection, - ReplicationCursor, - ReplicationMessage, - StopReplication, -) - - -from dlt.common.libs.sql_alchemy import Engine, MetaData, Table -from sqlalchemy import Connection as ConnectionSqla, event from dlt.sources.sql_database import ( ReflectionLevel, TableBackend, @@ -46,8 +36,15 @@ engine_from_credentials, sql_table, ) +from psycopg2.extensions import connection as ConnectionExt, cursor +from psycopg2.extras import ( + LogicalReplicationConnection, + ReplicationCursor, + ReplicationMessage, + StopReplication, +) -from .pg_logicaldec_pb2 import Op, RowMessage, TypeInfo +from .pg_logicaldec_pb2 import DatumMessage, Op, RowMessage, TypeInfo from .schema_types import _epoch_micros_to_datetime, _to_dlt_column_schema, _to_dlt_val @@ -151,8 +148,8 @@ def init_replication( engine = _configure_engine(credentials, rep_conn) - @event.listens_for(engine, "begin") - def on_begin(conn: ConnectionSqla) -> None: + @sa.event.listens_for(engine, "begin") + def on_begin(conn: sa.Connection) -> None: cur = conn.connection.cursor() if slot is None: # Using the same isolation level that pg_backup uses @@ -181,7 +178,7 @@ def _configure_engine( engine.execution_options(stream_results=True, max_row_buffer=2 * 50000) setattr(engine, "rep_conn", rep_conn) # noqa - @event.listens_for(engine, "engine_disposed") + @sa.event.listens_for(engine, "engine_disposed") def on_engine_disposed(engine: Engine) -> None: delattr(engine, "rep_conn") @@ -584,12 +581,13 @@ def infer_table_schema( tuples = msg.new_tuple if is_change else msg.old_tuple # Filter and map columns, conditionally using 
`new_typeinfo` when available - columns: TTableSchemaColumns = { - col.column_name: _to_dlt_column_schema( - col, msg.new_typeinfo[i] if is_change and msg.new_typeinfo else None + columns = { + col_name: _to_dlt_column_schema( + col_name, datum=col, type_info=msg.new_typeinfo[i] if is_change else None ) for i, col in enumerate(tuples) - if not included_columns or col.column_name in included_columns + if (col_name := _actual_column_name(col)) + and (not included_columns or col_name in included_columns) } # Add replication columns @@ -651,7 +649,7 @@ def gen_data_item( data_item["_pg_deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) for data in row: - col_name = data.column_name + col_name = _actual_column_name(data) if not included_columns or col_name in included_columns: data_item[col_name] = _to_dlt_val( data, @@ -662,6 +660,15 @@ def gen_data_item( return data_item +def _actual_column_name(column: DatumMessage) -> str: + """Certain column names are quoted since they are reserved keywords, + however let the destination decide on how to normalize them""" + col_name = column.column_name + if col_name.startswith('"') and col_name.endswith('"'): + col_name = col_name[1:-1] + return col_name + + ALLOWED_COL_SCHEMA_FIELDS: Set[str] = { "name", "data_type", diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 8d0af0fe0..2e21fbc4d 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -7,7 +7,6 @@ from dlt.common.data_types.type_helpers import coerce_value from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import TColumnSchema, TColumnType -from dlt.destinations import postgres from dlt.destinations.impl.postgres.factory import PostgresTypeMapper from .pg_logicaldec_pb2 import DatumMessage, TypeInfo @@ -102,10 +101,10 @@ def _get_precision_and_scale( @lru_cache(maxsize=None) -def _from_db_type() -> Callable[[str, Optional[int], Optional[int]], TColumnType]: - """Gets column type from db type""" - type_mapper = PostgresTypeMapper(postgres().capabilities()) - return type_mapper.from_destination_type +def _type_mapper() -> PostgresTypeMapper: + from dlt.destinations import postgres + + return PostgresTypeMapper(postgres().capabilities()) def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: @@ -122,15 +121,15 @@ def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: ) precision, scale = _get_precision_and_scale(type_id, modifier) - return _from_db_type()(pg_type, precision, scale) + return _type_mapper().from_destination_type(pg_type, precision, scale) def _to_dlt_column_schema( - datum: DatumMessage, type_info: Optional[TypeInfo] + col_name: str, datum: DatumMessage, type_info: Optional[TypeInfo] ) -> TColumnSchema: """Converts decoderbuf's datum value/typeinfo to dlt column schema.""" column_schema: TColumnSchema = { - "name": datum.column_name, + "name": col_name, **_to_dlt_column_type( datum.column_type, type_info.modifier if type_info else None ), diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 161a0dd12..5bcf23298 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -105,22 +105,27 @@ "newTuple": [ { "columnName": "id_y", - "columnType": "20", - "datumInt64": "2", + "columnType": 20, + "datumInt64": 2, }, { "columnName": "val_y", - "columnType": "16", + "columnType": 16, "datumBool": False, }, + { + 
"columnName": '"primary"', + "columnType": 16, + "datumBool": True, + }, { "columnName": "_dlt_load_id", - "columnType": "1043", + "columnType": 1043, "datumString": "1728662646.2657657", }, { "columnName": "_dlt_id", - "columnType": "1043", + "columnType": 1043, "datumString": "gGjifTMTAUs5ag", }, ], @@ -133,6 +138,10 @@ "modifier": "boolean", "valueOptional": True, }, + { + "modifier": "boolean", + "valueOptional": True, + }, { "modifier": "character varying", "valueOptional": False, @@ -256,6 +265,7 @@ { "id_y": 2, "val_y": False, + "primary": True, "_dlt_id": "gGjifTMTAUs5ag", "_dlt_load_id": "1728662646.2657657", "_pg_lsn": 1, @@ -312,6 +322,7 @@ "precision": 64, }, "val_y": {"data_type": "bool", "name": "val_y", "nullable": True}, + "primary": {"data_type": "bool", "name": "primary", "nullable": True}, "_dlt_id": {"data_type": "text", "name": "_dlt_id", "nullable": False}, "_dlt_load_id": { "data_type": "text", From 63b1de0f4896ebc13c342a9b5358741c93be7886 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 19 Dec 2024 19:07:38 +0100 Subject: [PATCH 69/88] chore: minor fixes --- sources/pg_legacy_replication/helpers.py | 12 ++++++------ sources/pg_legacy_replication/schema_types.py | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 0c2cca983..a23e8e823 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -60,9 +60,9 @@ class ReplicationOptions(TypedDict, total=False): class SqlTableOptions(TypedDict, total=False): - backend: Optional[TableBackend] + backend: TableBackend backend_kwargs: Optional[Dict[str, Any]] - chunk_size: Optional[int] + chunk_size: int defer_table_reflect: Optional[bool] detect_precision_hints: Optional[bool] included_columns: Optional[List[str]] @@ -393,7 +393,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: table_name = msg.table.split(".")[1] table_schema = self.get_table_schema(msg, table_name) data_item = gen_data_item( - msg, table_schema["columns"], lsn, **self.table_options.get(table_name) + msg, table_schema["columns"], lsn, **self.table_options.get(table_name, {}) ) self.data_items[table_name].append(data_item) @@ -409,7 +409,7 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: if current_hash == self.last_table_hashes.get(table_name): return self.last_table_schema[table_name] - new_schema = infer_table_schema(msg, **self.table_options.get(table_name)) + new_schema = infer_table_schema(msg, **self.table_options.get(table_name, {})) if last_schema is None: # Cache the inferred schema and hash if it is not already cached self.last_table_schema[table_name] = new_schema @@ -573,7 +573,7 @@ def infer_table_schema( include_commit_ts: bool = False, include_tx_id: bool = False, included_columns: Optional[Set[str]] = None, - **kwargs: Any, + **_: Any, ) -> TTableSchema: """Infers the table schema from the replication message and optional hints""" # Choose the correct source based on operation type @@ -632,7 +632,7 @@ def gen_data_item( include_commit_ts: bool = False, include_tx_id: bool = False, included_columns: Optional[Set[str]] = None, - **kwargs: Any, + **_: Any, ) -> TDataItem: """Generates data item from a row message and corresponding metadata.""" data_item: TDataItem = {} diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 2e21fbc4d..6c8a5d05f 100644 --- 
a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -82,7 +82,7 @@ def _get_precision_and_scale( type_id: int, modifier: Optional[str] -) -> Optional[Tuple[Optional[int], Optional[int]]]: +) -> Tuple[Optional[int], Optional[int]]: """Get precision from postgres type attributes and modifiers.""" if type_id in _FIXED_PRECISION_TYPES: return _FIXED_PRECISION_TYPES[type_id] @@ -119,6 +119,7 @@ def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: logger.warning( "No type found for type_id '%s' and modifier '%s'", type_id, modifier ) + pg_type = "character varying" precision, scale = _get_precision_and_scale(type_id, modifier) return _type_mapper().from_destination_type(pg_type, precision, scale) From e8b2a0cbf38bde5ec59ff34d6626ba60e94c67c9 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 20 Dec 2024 17:23:01 +0100 Subject: [PATCH 70/88] chore: small perf fixes and aligning with more adt --- sources/pg_legacy_replication/__init__.py | 13 +++++++------ sources/pg_legacy_replication/helpers.py | 19 ++++++++++--------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 949f4b76d..5c3f23430 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -1,11 +1,12 @@ """Replicates postgres tables in batch using logical decoding.""" -from typing import Any, Callable, Dict, Iterable, Optional, Sequence, Union +from typing import Any, Callable, Dict, Iterable, Mapping, Optional, Sequence, Union import dlt from dlt.extract import DltResource from dlt.extract.items import TDataItem from dlt.sources.credentials import ConnectionStringCredentials +from collections import defaultdict from .helpers import ( BackendHandler, @@ -24,7 +25,7 @@ def replication_source( schema: str, table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, - table_options: Optional[Dict[str, ReplicationOptions]] = None, + table_options: Optional[Mapping[str, ReplicationOptions]] = None, target_batch_size: int = 1000, flush_slot: bool = True, ) -> Iterable[DltResource]: @@ -74,6 +75,7 @@ def replication_source( Data items for changes published in the publication. 
""" table_names = [table_names] if isinstance(table_names, str) else table_names or [] + table_options = defaultdict(lambda: ReplicationOptions(), table_options or {}) @dlt.resource(name=lambda args: args["slot_name"], standalone=True) def replication_resource(slot_name: str) -> Iterable[TDataItem]: @@ -97,8 +99,8 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: table_qnames=table_qnames, upto_lsn=upto_lsn, start_lsn=start_lsn, - target_batch_size=target_batch_size, table_options=table_options, + target_batch_size=target_batch_size, ) yield from gen if gen.generated_all: @@ -109,9 +111,8 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: wal_reader = replication_resource(slot_name) for table in table_names: - table_opts = table_options.get(table, {}) if table_options else {} yield dlt.transformer( - _create_table_dispatch(table=table, table_options=table_opts), + _create_table_dispatch(table, table_options=table_options.get(table)), data_from=wal_reader, name=table, ) @@ -132,4 +133,4 @@ def _create_table_dispatch( "cleanup_snapshot_resources", "init_replication", "replication_source", -] +] \ No newline at end of file diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index a23e8e823..962b880a0 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -8,6 +8,7 @@ Iterable, Iterator, List, + Mapping, NamedTuple, Optional, Sequence, @@ -81,7 +82,7 @@ def init_replication( table_names: Optional[Union[str, Sequence[str]]] = None, credentials: ConnectionStringCredentials = dlt.secrets.value, take_snapshots: bool = False, - table_options: Optional[Dict[str, SqlTableOptions]] = None, + table_options: Optional[Mapping[str, SqlTableOptions]] = None, reset: bool = False, ) -> Iterable[DltResource]: """Initializes replication for one, several, or all tables within a schema. @@ -174,7 +175,7 @@ def _configure_engine( Configures the SQLAlchemy engine. Also attaches the replication connection in order to prevent it being garbage collected and closed. 
""" - engine: Engine = engine_from_credentials(credentials, may_dispose_after_use=False) + engine: Engine = engine_from_credentials(credentials) engine.execution_options(stream_results=True, max_row_buffer=2 * 50000) setattr(engine, "rep_conn", rep_conn) # noqa @@ -321,13 +322,13 @@ def __init__( self, upto_lsn: int, table_qnames: Set[str], + table_options: Mapping[str, ReplicationOptions], target_batch_size: int = 1000, - table_options: Optional[Dict[str, ReplicationOptions]] = None, ) -> None: self.upto_lsn = upto_lsn self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.table_options = table_options or {} + self.table_options = table_options self.consumed_all: bool = False # maps table names to list of data items @@ -393,7 +394,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: table_name = msg.table.split(".")[1] table_schema = self.get_table_schema(msg, table_name) data_item = gen_data_item( - msg, table_schema["columns"], lsn, **self.table_options.get(table_name, {}) + msg, table_schema["columns"], lsn, **self.table_options.get(table_name) ) self.data_items[table_name].append(data_item) @@ -409,7 +410,7 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: if current_hash == self.last_table_hashes.get(table_name): return self.last_table_schema[table_name] - new_schema = infer_table_schema(msg, **self.table_options.get(table_name, {})) + new_schema = infer_table_schema(msg, **self.table_options.get(table_name)) if last_schema is None: # Cache the inferred schema and hash if it is not already cached self.last_table_schema[table_name] = new_schema @@ -445,9 +446,9 @@ class ItemGenerator: slot_name: str table_qnames: Set[str] upto_lsn: int - start_lsn: int = 0 + start_lsn: int + table_options: Mapping[str, ReplicationOptions] target_batch_size: int = 1000 - table_options: Optional[Dict[str, ReplicationOptions]] = None last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -465,8 +466,8 @@ def __iter__(self) -> Iterator[TableItems]: consumer = MessageConsumer( upto_lsn=self.upto_lsn, table_qnames=self.table_qnames, - target_batch_size=self.target_batch_size, table_options=self.table_options, + target_batch_size=self.target_batch_size, ) try: cur.consume_stream(consumer) From 4c3312974b13ce657575fedbd2bef5cb1a913e54 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 20 Dec 2024 19:28:44 +0100 Subject: [PATCH 71/88] chore: refactoring and cleaning --- sources/pg_legacy_replication/__init__.py | 14 +++---- sources/pg_legacy_replication/helpers.py | 28 ++++++------- sources/pg_legacy_replication_pipeline.py | 2 +- .../test_pg_replication.py | 42 ++++++++----------- 4 files changed, 39 insertions(+), 47 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 5c3f23430..5a3d335e8 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -25,7 +25,7 @@ def replication_source( schema: str, table_names: Union[str, Sequence[str]], credentials: ConnectionStringCredentials = dlt.secrets.value, - table_options: Optional[Mapping[str, ReplicationOptions]] = None, + repl_options: Optional[Mapping[str, ReplicationOptions]] = None, target_batch_size: int = 1000, flush_slot: bool = True, ) -> Iterable[DltResource]: @@ -75,7 +75,7 @@ def replication_source( Data items for changes published in the publication. 
""" table_names = [table_names] if isinstance(table_names, str) else table_names or [] - table_options = defaultdict(lambda: ReplicationOptions(), table_options or {}) + repl_options = defaultdict(lambda: ReplicationOptions(), repl_options or {}) @dlt.resource(name=lambda args: args["slot_name"], standalone=True) def replication_resource(slot_name: str) -> Iterable[TDataItem]: @@ -99,7 +99,7 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: table_qnames=table_qnames, upto_lsn=upto_lsn, start_lsn=start_lsn, - table_options=table_options, + repl_options=repl_options, target_batch_size=target_batch_size, ) yield from gen @@ -112,17 +112,17 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: for table in table_names: yield dlt.transformer( - _create_table_dispatch(table, table_options=table_options.get(table)), + _create_table_dispatch(table, repl_options=repl_options.get(table)), data_from=wal_reader, name=table, ) def _create_table_dispatch( - table: str, table_options: ReplicationOptions + table: str, repl_options: ReplicationOptions ) -> Callable[[TDataItem], Any]: """Creates a dispatch handler that processes data items based on a specified table and optional column hints.""" - handler = BackendHandler(table, table_options) + handler = BackendHandler(table, repl_options) # FIXME Uhhh.. why do I have to do this? handler.__qualname__ = "BackendHandler.__call__" # type: ignore[attr-defined] return handler @@ -133,4 +133,4 @@ def _create_table_dispatch( "cleanup_snapshot_resources", "init_replication", "replication_source", -] \ No newline at end of file +] diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 962b880a0..0d8d0ba11 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -51,7 +51,7 @@ class ReplicationOptions(TypedDict, total=False): backend: Optional[TableBackend] - backend_kwargs: Optional[Dict[str, Any]] + backend_kwargs: Optional[Mapping[str, Any]] column_hints: Optional[TTableSchemaColumns] include_lsn: Optional[bool] # Default is true include_deleted_ts: Optional[bool] # Default is true @@ -322,13 +322,13 @@ def __init__( self, upto_lsn: int, table_qnames: Set[str], - table_options: Mapping[str, ReplicationOptions], + repl_options: Mapping[str, ReplicationOptions], target_batch_size: int = 1000, ) -> None: self.upto_lsn = upto_lsn self.table_qnames = table_qnames self.target_batch_size = target_batch_size - self.table_options = table_options + self.repl_options = repl_options self.consumed_all: bool = False # maps table names to list of data items @@ -394,14 +394,14 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: table_name = msg.table.split(".")[1] table_schema = self.get_table_schema(msg, table_name) data_item = gen_data_item( - msg, table_schema["columns"], lsn, **self.table_options.get(table_name) + msg, table_schema["columns"], lsn, **self.repl_options.get(table_name) ) self.data_items[table_name].append(data_item) def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: last_schema = self.last_table_schema.get(table_name) - # Used cached schema if the operation is a delete since the inferred one will always be less precise + # Used cached schema if the operation is a DELETE since the inferred one will always be less precise if msg.op == Op.DELETE and last_schema: return last_schema @@ -410,7 +410,7 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: if current_hash == 
self.last_table_hashes.get(table_name): return self.last_table_schema[table_name] - new_schema = infer_table_schema(msg, **self.table_options.get(table_name)) + new_schema = infer_table_schema(msg, **self.repl_options.get(table_name)) if last_schema is None: # Cache the inferred schema and hash if it is not already cached self.last_table_schema[table_name] = new_schema @@ -447,7 +447,7 @@ class ItemGenerator: table_qnames: Set[str] upto_lsn: int start_lsn: int - table_options: Mapping[str, ReplicationOptions] + repl_options: Mapping[str, ReplicationOptions] target_batch_size: int = 1000 last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -460,13 +460,11 @@ def __iter__(self) -> Iterator[TableItems]: Does not advance the slot. """ cur = _get_rep_conn(self.credentials).cursor() - cur.start_replication( - slot_name=self.slot_name, start_lsn=self.start_lsn, decode=False - ) + cur.start_replication(slot_name=self.slot_name, start_lsn=self.start_lsn) consumer = MessageConsumer( upto_lsn=self.upto_lsn, table_qnames=self.table_qnames, - table_options=self.table_options, + repl_options=self.repl_options, target_batch_size=self.target_batch_size, ) try: @@ -501,7 +499,7 @@ def flush_batch( @dataclass class BackendHandler: table: str - table_options: ReplicationOptions + repl_options: ReplicationOptions def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: """Yields replication messages from ItemGenerator. @@ -518,14 +516,14 @@ def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: # Apply column hints if provided columns = schema["columns"] - if column_hints := self.table_options.get("column_hints"): + if column_hints := self.repl_options.get("column_hints"): for col_name, col_hint in column_hints.items(): if col_name in columns: columns[col_name] = merge_column(columns[col_name], col_hint) # Process based on backend data = table_items.items - backend = self.table_options.get("backend", "sqlalchemy") + backend = self.repl_options.get("backend", "sqlalchemy") try: if backend == "sqlalchemy": yield from self.emit_schema_and_items(columns, data) @@ -560,7 +558,7 @@ def emit_arrow_table( tuple(item.get(column, None) for column in list(columns.keys())) for item in items ] - tz = self.table_options.get("backend_kwargs", {}).get("tz", "UTC") + tz = self.repl_options.get("backend_kwargs", {}).get("tz", "UTC") yield dlt.mark.with_table_name( arrow.row_tuples_to_arrow(rows, columns=columns, tz=tz), self.table, diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index 957fb41a4..b77f65188 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -162,7 +162,7 @@ def replicate_with_column_selection() -> None: slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - table_options={ + repl_options={ "tbl_x": {"included_columns": {"c1", "c2"}} }, # columns not specified here are excluded from generated data items ) diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index c333b7b56..ca103dd1c 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -66,7 +66,7 @@ def tbl_y(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - table_options={ + repl_options={ "tbl_x": {"backend": backend}, "tbl_y": {"backend": backend}, }, @@ -207,7 +207,7 @@ def 
tbl_y(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y"), - table_options={ + repl_options={ "tbl_x": {"backend": backend}, "tbl_y": {"backend": backend}, }, @@ -302,7 +302,7 @@ def items(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", - table_options={"items": {"backend": backend}}, + repl_options={"items": {"backend": backend}}, ) changes.items.apply_hints( write_disposition="merge", primary_key="col1", columns=merge_hints @@ -407,7 +407,7 @@ def test_unmapped_data_types( slot_name=slot_name, schema=src_pl.dataset_name, table_names="data_types", - table_options={"data_types": {"backend": backend}}, + repl_options={"data_types": {"backend": backend}}, ) # insert record in source table to create replication item @@ -487,7 +487,7 @@ def tbl_z(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), - table_options=table_options, + repl_options=table_options, ) # update three postgres tables @@ -555,19 +555,6 @@ def tbl_z(data): ) # initialize replication and create resources - table_options: Dict[str, ReplicationOptions] = { - "tbl_x": { - "backend": backend, - "column_hints": {"another_col_x": {"data_type": "double"}}, - }, - "tbl_y": { - "backend": backend, - "column_hints": {"another_col_y": {"precision": 32}}, - }, - "tbl_z": {"backend": backend}, - # tbl_z is not specified, hence all columns should be included - } - snapshots = init_replication( slot_name=slot_name, schema=src_pl.dataset_name, @@ -577,7 +564,6 @@ def tbl_z(data): "tbl_x": {"backend": backend}, "tbl_y": {"backend": backend}, "tbl_z": {"backend": backend}, - # tbl_z is not specified, hence all columns should be included }, ) if init_load: @@ -588,7 +574,17 @@ def tbl_z(data): slot_name=slot_name, schema=src_pl.dataset_name, table_names=("tbl_x", "tbl_y", "tbl_z"), - table_options=table_options, + repl_options={ + "tbl_x": { + "backend": backend, + "column_hints": {"another_col_x": {"data_type": "double"}}, + }, + "tbl_y": { + "backend": backend, + "column_hints": {"another_col_y": {"precision": 32}}, + }, + "tbl_z": {"backend": backend}, + }, ) # update three postgres tables @@ -604,7 +600,6 @@ def tbl_z(data): dest_pl = dlt.pipeline( pipeline_name="dest_pl", destination=destination_name, dev_mode=True ) - if init_load: dest_pl.run(snapshots) cleanup_snapshot_resources(snapshots) @@ -626,7 +621,6 @@ def tbl_z(data): ] == "bigint" ) - dest_pl.run(changes) assert ( dest_pl.default_schema.get_table_columns("tbl_x")["another_col_x"]["data_type"] @@ -675,7 +669,7 @@ def test_table_schema_change( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", - table_options={"items": {"backend": backend}}, + repl_options={"items": {"backend": backend}}, ) dest_pl = dlt.pipeline( pipeline_name="dest_pl", destination=destination_name, dev_mode=True @@ -725,7 +719,7 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str], backend: TableBackend) - schema=src_pl.dataset_name, table_names="items", target_batch_size=50, - table_options={"items": {"backend": backend}}, + repl_options={"items": {"backend": backend}}, ) # create destination pipeline and resource From 0b7c1518fc66a193b971f5531f67f73f4f5e0d28 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sun, 22 Dec 2024 20:24:12 +0100 Subject: [PATCH 72/88] chore: finished docstrings --- sources/pg_legacy_replication/__init__.py | 84 ++++---- sources/pg_legacy_replication/helpers.py | 191 ++++++++---------- sources/pg_legacy_replication/schema_types.py | 7 +- 3 files 
changed, 133 insertions(+), 149 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 5a3d335e8..d91909dfb 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -29,50 +29,54 @@ def replication_source( target_batch_size: int = 1000, flush_slot: bool = True, ) -> Iterable[DltResource]: - """Source yielding data items for changes in one or more postgres tables. + """ + Defines a dlt source for replicating Postgres tables using logical replication. + This source reads from a replication slot and pipes the changes using transformers. - Relies on a replication slot that publishes DML operations (i.e. `insert`, `update`, and `delete`). - Maintains LSN of last consumed message in state to track progress. - - At start of the run, advances the slot upto last consumed message in previous run. + - At start of the run, advances the slot upto last consumed message in previous run (for pg>10 only) - Processes in batches to limit memory usage. Args: - slot_name (str): Name of the replication slot to consume replication messages from. - credentials (ConnectionStringCredentials): Postgres database credentials. - included_columns (Optional[Dict[str, TColumnNames]]): Maps table name(s) to - sequence of names of columns to include in the generated data items. - Any column not in the sequence is excluded. If not provided, all columns - are included. For example: - ``` - include_columns={ - "table_x": ["col_a", "col_c"], - "table_y": ["col_x", "col_y", "col_z"], - } - ``` - columns (Optional[Dict[str, TTableSchemaColumns]]): Maps - table name(s) to column hints to apply on the replicated table(s). For example: - ``` - columns={ - "table_x": {"col_a": {"data_type": "complex"}}, - "table_y": {"col_y": {"precision": 32}}, - } - ``` - target_batch_size (int): Desired number of data items yielded in a batch. - Can be used to limit the data items in memory. Note that the number of - data items yielded can be (far) greater than `target_batch_size`, because - all messages belonging to the same transaction are always processed in - the same batch, regardless of the number of messages in the transaction - and regardless of the value of `target_batch_size`. The number of data - items can also be smaller than `target_batch_size` when the replication - slot is exhausted before a batch is full. - flush_slot (bool): Whether processed messages are discarded from the replication - slot. Recommended value is True. Be careful when setting False—not flushing - can eventually lead to a “disk full” condition on the server, because - the server retains all the WAL segments that might be needed to stream - the changes via all the currently open replication slots. - - Yields: - Data items for changes published in the publication. + slot_name (str): + The name of the logical replication slot used to fetch WAL changes. + schema (str): + Name of the schema to replicate tables from. + table_names (Union[str, Sequence[str]]): + The name(s) of the tables to replicate. Can be a single table name or a list of table names. + credentials (ConnectionStringCredentials): + Database credentials for connecting to the Postgres instance. + repl_options (Optional[Mapping[str, ReplicationOptions]], optional): + A mapping of table names to `ReplicationOptions`, allowing for fine-grained control over + replication behavior for each table. 
+ + Each `ReplicationOptions` dictionary can include the following keys: + - `backend` (Optional[TableBackend]): Specifies the backend to use for table replication. + - `backend_kwargs` (Optional[Mapping[str, Any]]): Additional configuration options for the backend. + - `column_hints` (Optional[TTableSchemaColumns]): A dictionary of hints for column types or properties. + - `include_lsn` (Optional[bool]): Whether to include the LSN (Log Sequence Number) + in the replicated data. Defaults to `True`. + - `include_deleted_ts` (Optional[bool]): Whether to include a timestamp for deleted rows. + Defaults to `True`. + - `include_commit_ts` (Optional[bool]): Whether to include the commit timestamp of each change. + - `include_tx_id` (Optional[bool]): Whether to include the transaction ID of each change. + - `included_columns` (Optional[Set[str]]): A set of specific columns to include in the replication. + If not specified, all columns are included. + target_batch_size (int, optional): + The target size of each batch of replicated data items. Defaults to `1000`. + flush_slot (bool, optional): + If `True`, advances the replication slot to the last processed LSN + to prevent replaying already replicated changes. Defaults to `True`. + + Yields: + Iterable[DltResource]: + A collection of `DltResource` objects, each corresponding to a table being replicated. + + Notes: + - The `repl_options` parameter allows fine-tuning of replication behavior, such as column filtering + or write disposition configuration, per table. + - The replication process is incremental, ensuring only new changes are processed after the last commit LSN. """ table_names = [table_names] if isinstance(table_names, str) else table_names or [] repl_options = defaultdict(lambda: ReplicationOptions(), repl_options or {}) @@ -85,7 +89,7 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: advance_slot(start_lsn, slot_name, credentials) # continue until last message in replication slot - upto_lsn = get_max_lsn(slot_name, credentials) + upto_lsn = get_max_lsn(credentials) if upto_lsn is None: return @@ -133,4 +137,4 @@ def _create_table_dispatch( "cleanup_snapshot_resources", "init_replication", "replication_source", -] +] \ No newline at end of file diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 0d8d0ba11..99a241c75 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -85,54 +85,39 @@ def init_replication( table_options: Optional[Mapping[str, SqlTableOptions]] = None, reset: bool = False, ) -> Iterable[DltResource]: - """Initializes replication for one, several, or all tables within a schema. - - Can be called repeatedly with the same `slot_name`: - - creates a replication slot and publication with provided names if they do not exist yet - - skips creation of slot and publication if they already exist (unless`reset` is set to `False`) - - supports addition of new tables by extending `table_names` - - removing tables is not supported, i.e. exluding a table from `table_names` - will not remove it from the publication - - switching from a table selection to an entire schema is possible by omitting - the `table_names` argument - - changing `publish` has no effect (altering the published DML operations is not supported) - - table snapshots can only be persisted on the first call (because the snapshot - is exported when the slot is created) + """ + Initializes a replication session for Postgres using logical replication. 
+ Optionally, snapshots of specified tables can be taken during initialization. Args: - slot_name (str): Name of the replication slot to create if it does not exist yet. - schema (str): Name of the schema to replicate tables from. - table_names (Optional[Union[str, Sequence[str]]]): Name(s) of the table(s) - to include in the publication. If not provided, all tables in the schema - are included. - credentials (ConnectionStringCredentials): Postgres database credentials. - take_snapshots (bool): Whether the table states in the snapshot exported - during replication slot creation are persisted to tables. If true, a - snapshot table is created in Postgres for all included tables, and corresponding - resources (`DltResource` objects) for these tables are created and returned. - The resources can be used to perform an initial load of all data present - in the tables at the moment the replication slot got created. - included_columns (Optional[Dict[str, Sequence[str]]]): Maps table name(s) to - sequence of names of columns to include in the snapshot table(s). - Any column not in the sequence is excluded. If not provided, all columns - are included. For example: - ``` - included_columns={ - "table_x": ["col_a", "col_c"], - "table_y": ["col_x", "col_y", "col_z"], - } - ``` - Argument is only used if `take_snapshots` is `True`. - ``` - Argument is only used if `take_snapshots` is `True`. - reset (bool): If set to True, the existing slot and publication are dropped - and recreated. Has no effect if a slot and publication with the provided - names do not yet exist. + slot_name (str): + The name of the logical replication slot to be used or created. + schema (str): + Name of the schema to replicate tables from. + table_names (Optional[Union[str, Sequence[str]]]): + The name(s) of the table(s) to replicate. Can be a single table name or a list of table names. + If not provided, no tables will be replicated unless `take_snapshots` is `True`. + credentials (ConnectionStringCredentials): + Database credentials for connecting to the Postgres instance. + take_snapshots (bool): + Whether to take initial snapshots of the specified tables. + Defaults to `False`. + table_options (Optional[Mapping[str, SqlTableOptions]]): + Additional options for configuring replication for specific tables. + These are the exact same parameters for the `dlt.sources.sql_database.sql_table` function. + Argument is only used if `take_snapshots` is `True`. + reset (bool, optional): + If `True`, drops the existing replication slot before creating a new one. + Use with caution, as this will clear existing replication state. + Defaults to `False`. Returns: - None if `take_snapshots` is `False` - - a `DltResource` object or a list of `DltResource` objects for the snapshot - table(s) if `take_snapshots` is `True` and the replication slot did not yet exist + - a list of `DltResource` objects for the snapshot table(s) if `take_snapshots` is `True`. + + Notes: + - If `reset` is `True`, the existing replication slot will be dropped before creating a new one. + - When `take_snapshots` is `True`, the function configures a snapshot isolation level for consistent table snapshots. 
""" rep_conn = _get_rep_conn(credentials) rep_cur = rep_conn.cursor() @@ -180,8 +165,8 @@ def _configure_engine( setattr(engine, "rep_conn", rep_conn) # noqa @sa.event.listens_for(engine, "engine_disposed") - def on_engine_disposed(engine: Engine) -> None: - delattr(engine, "rep_conn") + def on_engine_disposed(e: Engine) -> None: + delattr(e, "rep_conn") return engine @@ -230,15 +215,12 @@ def drop_replication_slot(name: str, cur: ReplicationCursor) -> None: ) -def get_max_lsn( - slot_name: str, - credentials: ConnectionStringCredentials, -) -> Optional[int]: - """Returns maximum Log Sequence Number (LSN) in replication slot. +def get_max_lsn(credentials: ConnectionStringCredentials) -> Optional[int]: + """ + Returns maximum Log Sequence Number (LSN). Returns None if the replication slot is empty. Does not consume the slot, i.e. messages are not flushed. - Raises error if the replication slot or publication does not exist. """ cur = _get_conn(credentials).cursor() loc_fn = ( @@ -248,18 +230,13 @@ def get_max_lsn( ) # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) cur.execute(f"SELECT {loc_fn}() - '0/0' as max_lsn;") - # lsn_field = "location" if get_pg_version(cur) < 100000 else "lsn" - # cur.execute( - # f"SELECT MAX({lsn_field} - '0/0') AS max_lsn " # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) - # f"FROM pg_logical_slot_peek_binary_changes('{slot_name}', NULL, NULL);" - # ) lsn: int = cur.fetchone()[0] cur.connection.close() return lsn def lsn_int_to_hex(lsn: int) -> str: - """Convert integer LSN to postgres' hexadecimal representation.""" + """Convert integer LSN to postgres hexadecimal representation.""" # https://stackoverflow.com/questions/66797767/lsn-external-representation. return f"{lsn >> 32 & 4294967295:X}/{lsn & 4294967295:08X}" @@ -269,7 +246,8 @@ def advance_slot( slot_name: str, credentials: ConnectionStringCredentials, ) -> None: - """Advances position in the replication slot. + """ + Advances position in the replication slot. Flushes all messages upto (and including) the message with LSN = `upto_lsn`. This function is used as alternative to psycopg2's `send_feedback` method, because @@ -303,7 +281,8 @@ def _get_conn( def _get_rep_conn( credentials: ConnectionStringCredentials, ) -> LogicalReplicationConnection: - """Returns a psycopg2 LogicalReplicationConnection to interact with postgres replication functionality. + """ + Returns a psycopg2 LogicalReplicationConnection to interact with postgres replication functionality. Raises error if the user does not have the REPLICATION attribute assigned. """ @@ -311,7 +290,8 @@ def _get_rep_conn( class MessageConsumer: - """Consumes messages from a ReplicationCursor sequentially. + """ + Consumes messages from a ReplicationCursor sequentially. Generates data item for each `insert`, `update`, and `delete` message. Processes in batches to limit memory usage. 
@@ -360,11 +340,12 @@ def process_msg(self, msg: ReplicationMessage) -> None: assert row_msg.op != Op.UNKNOWN, f"Unsupported operation : {row_msg}" if row_msg.op == Op.BEGIN: - self.last_commit_ts = _epoch_micros_to_datetime(row_msg.commit_time) + # self.last_commit_ts = _epoch_micros_to_datetime(row_msg.commit_time) + pass elif row_msg.op == Op.COMMIT: - self.process_commit(msg.data_start) + self.process_commit(lsn=msg.data_start) else: # INSERT, UPDATE or DELETE - self.process_change(row_msg, msg.data_start) + self.process_change(row_msg, lsn=msg.data_start) except StopReplication: raise except Exception: @@ -374,7 +355,8 @@ def process_msg(self, msg: ReplicationMessage) -> None: raise def process_commit(self, lsn: int) -> None: - """Updates object state when Commit message is observed. + """ + Updates object state when Commit message is observed. Raises StopReplication when `upto_lsn` or `target_batch_size` is reached. """ @@ -399,6 +381,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: self.data_items[table_name].append(data_item) def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: + """Given a row message, calculates or fetches a table schema.""" last_schema = self.last_table_schema.get(table_name) # Used cached schema if the operation is a DELETE since the inferred one will always be less precise if msg.op == Op.DELETE and last_schema: return last_schema @@ -453,11 +436,12 @@ class ItemGenerator: generated_all: bool = False def __iter__(self) -> Iterator[TableItems]: - """Yields replication messages from MessageConsumer. + """ + Yields data items/schema from MessageConsumer. - Starts replication of messages published by the publication from the replication slot. - Maintains LSN of last consumed Commit message in object state. - Does not advance the slot. + Starts replication of messages from the replication slot. + Maintains LSN of last consumed commit message in object state. + Advances the slot only when all messages have been consumed. """ cur = _get_rep_conn(self.credentials).cursor() cur.start_replication(slot_name=self.slot_name, start_lsn=self.start_lsn) @@ -470,7 +454,6 @@ def __iter__(self) -> Iterator[TableItems]: try: cur.consume_stream(consumer) except StopReplication: # completed batch or reached `upto_lsn` - logger.info("Flushing batch of %s events", self.target_batch_size) yield from self.flush_batch(cur, consumer) finally: cur.connection.close() @@ -498,24 +481,22 @@ class ItemGenerator: @dataclass class BackendHandler: + """ + Consumes messages from ItemGenerator once a batch is ready for emitting. + + It is mainly responsible for emitting the table schema and dict data items, or transforming + them into arrow tables. + """ + table: str repl_options: ReplicationOptions def __call__(self, table_items: TableItems) -> Iterable[DataItemWithMeta]: - """Yields replication messages from ItemGenerator. - - Args: - table_items: An object containing schema and items for the table. - - Yields: - DataItemWithMeta: Processed data items based on the table and backend.
- """ - schema = table_items.schema - if schema["name"] != self.table: + if table_items.schema["name"] != self.table: return # Apply column hints if provided - columns = schema["columns"] + columns = table_items.schema["columns"] if column_hints := self.repl_options.get("column_hints"): for col_name, col_hint in column_hints.items(): if col_name in columns: @@ -574,13 +555,14 @@ def infer_table_schema( included_columns: Optional[Set[str]] = None, **_: Any, ) -> TTableSchema: - """Infers the table schema from the replication message and optional hints""" + """Infers the table schema from the replication message and optional hints.""" # Choose the correct source based on operation type is_change = msg.op != Op.DELETE tuples = msg.new_tuple if is_change else msg.old_tuple + schema = TTableSchema(name=msg.table.split(".")[1]) # Filter and map columns, conditionally using `new_typeinfo` when available - columns = { + schema["columns"] = { col_name: _to_dlt_column_schema( col_name, datum=col, type_info=msg.new_typeinfo[i] if is_change else None ) @@ -591,35 +573,32 @@ def infer_table_schema( # Add replication columns if include_lsn: - columns["_pg_lsn"] = { + schema["columns"]["_pg_lsn"] = { "data_type": "bigint", "name": "_pg_lsn", "nullable": True, } if include_deleted_ts: - columns["_pg_deleted_ts"] = { + schema["columns"]["_pg_deleted_ts"] = { "data_type": "timestamp", "name": "_pg_deleted_ts", "nullable": True, } if include_commit_ts: - columns["_pg_commit_ts"] = { + schema["columns"]["_pg_commit_ts"] = { "data_type": "timestamp", "name": "_pg_commit_ts", "nullable": True, } if include_tx_id: - columns["_pg_tx_id"] = { + schema["columns"]["_pg_tx_id"] = { "data_type": "bigint", "name": "_pg_tx_id", "nullable": True, "precision": 32, } - return { - "name": msg.table.split(".")[1], - "columns": columns, - } + return schema def gen_data_item( @@ -643,25 +622,26 @@ def gen_data_item( data_item["_pg_tx_id"] = msg.transaction_id # Select the relevant row tuple based on operation type - row = msg.new_tuple if msg.op != Op.DELETE else msg.old_tuple - if msg.op == Op.DELETE and include_deleted_ts: + is_delete = msg.op == Op.DELETE + row = msg.old_tuple if is_delete else msg.new_tuple + if is_delete and include_deleted_ts: data_item["_pg_deleted_ts"] = _epoch_micros_to_datetime(msg.commit_time) for data in row: col_name = _actual_column_name(data) if not included_columns or col_name in included_columns: data_item[col_name] = _to_dlt_val( - data, - column_schema[col_name]["data_type"], - for_delete=msg.op == Op.DELETE, + data, column_schema[col_name]["data_type"], for_delete=is_delete ) return data_item def _actual_column_name(column: DatumMessage) -> str: - """Certain column names are quoted since they are reserved keywords, - however let the destination decide on how to normalize them""" + """ + Certain column names are quoted since they are reserved keywords, + however let the destination decide on how to normalize them + """ col_name = column.column_name if col_name.startswith('"') and col_name.endswith('"'): col_name = col_name[1:-1] @@ -678,13 +658,15 @@ def _actual_column_name(column: DatumMessage) -> str: def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: - """Compares the last schema with the new one and chooses the more + """ + Compares the last schema with the new one and chooses the more precise one if they are relatively equal or else raises a - AssertionError due to an incompatible schema change""" + AssertionError due to an incompatible schema change + """ 
table_name = last["name"] assert table_name == new["name"], "Table names do not match" - table_schema: TTableSchema = {"name": table_name, "columns": {}} + table_schema = TTableSchema(name=table_name, columns={}) last_cols, new_cols = last["columns"], new["columns"] assert len(last_cols) == len( new_cols @@ -696,15 +678,12 @@ def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: s2 is not None and s1["data_type"] == s2["data_type"] ), f"Incompatible schema for column '{name}'" - # Ensure new has no fields outside of allowed fields + # Ensure new has no fields outside allowed fields extra_fields = set(s2.keys()) - ALLOWED_COL_SCHEMA_FIELDS assert not extra_fields, f"Unexpected fields {extra_fields} in column '{name}'" # Select the more precise schema by comparing nullable, precision, and scale - col_schema: TColumnSchema = { - "name": name, - "data_type": s1["data_type"], - } + col_schema = TColumnSchema(name=name, data_type=s1["data_type"]) if "nullable" in s1 or "nullable" in s2: col_schema["nullable"] = s1.get("nullable", s2.get("nullable")) if "precision" in s1 or "precision" in s2: @@ -715,4 +694,4 @@ def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: # Update with the more detailed schema per column table_schema["columns"][name] = col_schema - return table_schema + return table_schema \ No newline at end of file diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 6c8a5d05f..d2cec4ba0 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -20,7 +20,7 @@ "decimal": Decimal(0), "double": 0.0, "text": "", - "time": pendulum.Time(0, 0, 0), + "time": pendulum.Time(), "timestamp": pendulum.from_timestamp(0), "wei": 0, } @@ -108,7 +108,8 @@ def _type_mapper() -> PostgresTypeMapper: def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: - """Converts postgres type OID to dlt column type. + """ + Converts postgres type OID to dlt column type. Type OIDs not in _PG_TYPES mapping default to "text" type. 
""" @@ -150,7 +151,7 @@ def _epoch_micros_to_datetime(microseconds_since_1970: int) -> pendulum.DateTime def _microseconds_to_time(microseconds: int) -> pendulum.Time: - return pendulum.Time(0).add(microseconds=microseconds) + return pendulum.Time().add(microseconds=microseconds) def _epoch_days_to_date(epoch_days: int) -> pendulum.Date: From ec72e3620611752e5ed2875ff4b7bab8234aa631 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Sun, 22 Dec 2024 23:39:30 +0100 Subject: [PATCH 73/88] bugfix: misuse of defaultdict --- sources/pg_legacy_replication/__init__.py | 4 ++-- sources/pg_legacy_replication/helpers.py | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index d91909dfb..8c8778ba3 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -116,7 +116,7 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: for table in table_names: yield dlt.transformer( - _create_table_dispatch(table, repl_options=repl_options.get(table)), + _create_table_dispatch(table, repl_options=repl_options[table]), data_from=wal_reader, name=table, ) @@ -137,4 +137,4 @@ def _create_table_dispatch( "cleanup_snapshot_resources", "init_replication", "replication_source", -] \ No newline at end of file +] diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 99a241c75..d9a076332 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -4,6 +4,7 @@ from typing import ( Any, Callable, + DefaultDict, Dict, Iterable, Iterator, @@ -302,7 +303,7 @@ def __init__( self, upto_lsn: int, table_qnames: Set[str], - repl_options: Mapping[str, ReplicationOptions], + repl_options: DefaultDict[str, ReplicationOptions], target_batch_size: int = 1000, ) -> None: self.upto_lsn = upto_lsn @@ -376,7 +377,7 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: table_name = msg.table.split(".")[1] table_schema = self.get_table_schema(msg, table_name) data_item = gen_data_item( - msg, table_schema["columns"], lsn, **self.repl_options.get(table_name) + msg, table_schema["columns"], lsn, **self.repl_options[table_name] ) self.data_items[table_name].append(data_item) @@ -393,7 +394,7 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: if current_hash == self.last_table_hashes.get(table_name): return self.last_table_schema[table_name] - new_schema = infer_table_schema(msg, **self.repl_options.get(table_name)) + new_schema = infer_table_schema(msg, **self.repl_options[table_name]) if last_schema is None: # Cache the inferred schema and hash if it is not already cached self.last_table_schema[table_name] = new_schema @@ -430,7 +431,7 @@ class ItemGenerator: table_qnames: Set[str] upto_lsn: int start_lsn: int - repl_options: Mapping[str, ReplicationOptions] + repl_options: DefaultDict[str, ReplicationOptions] target_batch_size: int = 1000 last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -694,4 +695,4 @@ def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: # Update with the more detailed schema per column table_schema["columns"][name] = col_schema - return table_schema \ No newline at end of file + return table_schema From ecc6089fb459dc25d9555ef4b5b7ce5098a67bea Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 23 Dec 2024 04:31:31 +0100 Subject: [PATCH 74/88] 
Finally done with docs --- sources/pg_legacy_replication/README.md | 115 +++++++++++++----- .../pg_legacy_replication/requirements.txt | 4 +- 2 files changed, 85 insertions(+), 34 deletions(-) diff --git a/sources/pg_legacy_replication/README.md b/sources/pg_legacy_replication/README.md index d661854ef..f6c9de239 100644 --- a/sources/pg_legacy_replication/README.md +++ b/sources/pg_legacy_replication/README.md @@ -1,27 +1,49 @@ -# Postgres replication -[Postgres](https://www.postgresql.org/) is one of the most popular relational database management systems. This verified source uses Postgres' replication functionality to efficiently process changes in tables (a process often referred to as _Change Data Capture_ or CDC). It uses [logical decoding](https://www.postgresql.org/docs/current/logicaldecoding.html) and the standard built-in `pgoutput` [output plugin](https://www.postgresql.org/docs/current/logicaldecoding-output-plugin.html). - -Resources that can be loaded using this verified source are: - -| Name | Description | -|----------------------|-------------------------------------------------| -| replication_resource | Load published messages from a replication slot | +# Postgres legacy replication +[Postgres](https://www.postgresql.org/) is one of the most popular relational database management systems. This verified source uses Postgres' replication functionality to efficiently process changes +in tables (a process often referred to as _Change Data Capture_ or CDC). It uses [logical decoding](https://www.postgresql.org/docs/current/logicaldecoding.html) and the optional `decoderbufs` +[output plugin](https://github.com/debezium/postgres-decoderbufs), which is a shared library which must be built or enabled. + +| Source | Description | +|---------------------|-------------------------------------------------| +| replication_source | Load published messages from a replication slot | + +## Install decoderbufs + +Instructions can be found [here](https://github.com/debezium/postgres-decoderbufs?tab=readme-ov-file#building) + +Below is an example installation in a docker image: +```Dockerfile +FROM postgres:14 + +# Install dependencies required to build decoderbufs +RUN apt-get update +RUN apt-get install -f -y \ + software-properties-common \ + build-essential \ + pkg-config \ + git + +RUN apt-get install -f -y \ + postgresql-server-dev-14 \ + libprotobuf-c-dev && \ + rm -rf /var/lib/apt/lists/* + +ARG decoderbufs_version=v1.7.0.Final +RUN git clone https://github.com/debezium/postgres-decoderbufs -b $decoderbufs_version --single-branch && \ + cd postgres-decoderbufs && \ + make && make install && \ + cd .. && \ + rm -rf postgres-decoderbufs +``` ## Initialize the pipeline ```bash -dlt init pg_replication duckdb +$ dlt init pg_legacy_replication duckdb ``` This uses `duckdb` as destination, but you can choose any of the supported [destinations](https://dlthub.com/docs/dlt-ecosystem/destinations/). -## Add `sql_database` source - -```bash -dlt init sql_database duckdb -``` - -This source depends on the [sql_database](../sql_database/README.md) verified source internally to perform initial loads. This step can be skipped if you don't do initial loads. 
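Once the replication user and credentials described in the next sections are set up, the pieces introduced in this patch series fit together roughly like this — a minimal sketch based on the signatures added here, not a snippet from the repository; the slot name, table names, backend choice, and `duckdb` destination are placeholders:

```python
import dlt

# Import path assumes the sources/ layout of this repository.
from pg_legacy_replication import (
    cleanup_snapshot_resources,
    init_replication,
    replication_source,
)

# One-time setup: create the slot and, optionally, snapshot resources for an initial load.
snapshots = init_replication(
    slot_name="my_slot",
    schema="public",
    table_names=("tbl_x", "tbl_y"),
    take_snapshots=True,
)

pipeline = dlt.pipeline(
    pipeline_name="pg_legacy_replication_pipeline",
    destination="duckdb",
    dataset_name="replicated",
)
pipeline.run(snapshots)  # initial load from the exported snapshot
cleanup_snapshot_resources(snapshots)

# Subsequent runs: stream WAL changes accumulated since the last consumed commit LSN.
changes = replication_source(
    slot_name="my_slot",
    schema="public",
    table_names=("tbl_x", "tbl_y"),
    repl_options={"tbl_x": {"backend": "pyarrow"}},
)
pipeline.run(changes)
```

See `pg_legacy_replication_pipeline.py` for complete, runnable examples.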
## Set up user The Postgres user needs to have the `LOGIN` and `REPLICATION` attributes assigned: @@ -30,30 +52,21 @@ The Postgres user needs to have the `LOGIN` and `REPLICATION` attributes assigne CREATE ROLE replication_user WITH LOGIN REPLICATION; ``` -It also needs `CREATE` privilege on the database: +It also needs various read-only privileges on the database (connect to the database first): ```sql -GRANT CREATE ON DATABASE dlt_data TO replication_user; -``` - -### Set up RDS -1. You must enable replication for RDS Postgres instance via **Parameter Group**: https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_PostgreSQL.Replication.ReadReplicas.html -2. `WITH LOGIN REPLICATION;` does not work on RDS, instead do: -```sql -GRANT rds_replication TO replication_user; -``` -3. Do not fallback to non SSL connection by setting connection parameters: -```toml -sources.pg_replication.credentials="postgresql://loader:password@host.rds.amazonaws.com:5432/dlt_data?sslmode=require&connect_timeout=300" +\connect dlt_data +GRANT USAGE ON SCHEMA schema_name TO replication_user; +GRANT SELECT ON ALL TABLES IN SCHEMA public TO replication_user; +ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO replication_user; ``` - ## Add credentials 1. Open `.dlt/secrets.toml`. 2. Enter your Postgres credentials: ```toml - [sources.pg_replication] + [sources.pg_legacy_replication] credentials="postgresql://replication_user:<>@localhost:5432/dlt_data" ``` 3. Enter credentials for your chosen destination as per the [docs](https://dlthub.com/docs/dlt-ecosystem/destinations/). @@ -69,7 +82,7 @@ sources.pg_replication.credentials="postgresql://loader:password@host.rds.amazon 1. Now the pipeline can be run by using the command: ```bash - python pg_replication_pipeline.py + python pg_legacy_replication_pipeline.py ``` 1. To make sure that everything is loaded as expected, use the command: ```bash dlt pipeline pg_replication_pipeline show ``` + +# Differences between `pg_legacy_replication` and `pg_replication` + +## Overview + +`pg_legacy_replication` is a fork of the verified `pg_replication` source. The primary goal of this fork is to provide logical replication capabilities for Postgres instances running versions +earlier than 10, when the `pgoutput` plugin was not yet available. This fork draws inspiration from the original `pg_replication` source and the `decoderbufs` library, +which is actively maintained by Debezium. + +## Key Differences from `pg_replication` + +### Replication User Ownership Requirements +One of the limitations of native Postgres replication is that the replication user must **own** the tables in order to add them to a **publication**. +Additionally, once a table is added to a publication, it cannot be removed, requiring the creation of a new replication slot, which results in the loss of any state tracking. + +### Limitations in `pg_replication` +The current `pg_replication` implementation has several limitations: +- It supports only a single initial snapshot of the data. +- It requires `CREATE` access to the source database in order to perform the initial snapshot. +- **Superuser** access is required to replicate entire Postgres schemas. + While the `pg_legacy_replication` source theoretically reads the entire WAL across all schemas, the current implementation using dlt transformers restricts this functionality. + In practice, this has not been a common use case.
+- The implementation is opinionated in its approach to data transfer. Specifically, when updates or deletes are required, it defaults to a `merge` write disposition, + which replicates live data without tracking changes over time. + +### Features of `pg_legacy_replication` + +This fork of `pg_replication` addresses the aforementioned limitations and introduces the following improvements: +- Adheres to the dlt philosophy by treating the WAL as an upstream resource. This replication stream is then transformed into various DLT resources, with customizable options for write disposition, + file formats, type hints, etc., specified at the resource level rather than at the source level. +- Supports an initial snapshot of all tables using the replication slot's exported snapshot. Additionally, ad-hoc snapshots can be performed using the serializable deferred isolation level, + similar to `pg_dump`. +- Emphasizes the use of `pyarrow` and parquet formats for efficient data storage and transfer. A dedicated backend has been implemented to support these formats. +- Replication messages are decoded using Protocol Buffers (protobufs) in C, rather than relying on native Python byte buffer parsing. This ensures greater efficiency and performance. + +## Next steps +- Add support for the [wal2json](https://github.com/eulerto/wal2json) replication plugin. This is particularly important for environments such as **Amazon RDS**, which supports `wal2json`, + as opposed to on-premise or Google Cloud SQL instances that support `decoderbufs`. \ No newline at end of file diff --git a/sources/pg_legacy_replication/requirements.txt b/sources/pg_legacy_replication/requirements.txt index 98459d020..85f40b3e5 100644 --- a/sources/pg_legacy_replication/requirements.txt +++ b/sources/pg_legacy_replication/requirements.txt @@ -1,4 +1,4 @@ -dlt>=0.5.12 +dlt>=1.3.0 psycopg2-binary>=2.9.9 protobuf>=5 -sqlalchemy>=1.4 +sqlalchemy>=1.4 \ No newline at end of file From dd5a63bf35f05809f64bd589f95f67d7931b586d Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 30 Dec 2024 14:42:12 +0100 Subject: [PATCH 75/88] fix: wasn't able to execute local tests without these settings --- sources/.dlt/example.secrets.toml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sources/.dlt/example.secrets.toml b/sources/.dlt/example.secrets.toml index a0e8963e0..4a9590cfe 100644 --- a/sources/.dlt/example.secrets.toml +++ b/sources/.dlt/example.secrets.toml @@ -16,7 +16,11 @@ location = "US" ### Sources [sources] +# local postgres +helpers.credentials="postgresql://loader:loader@localhost:5432/dlt_data" +pg_legacy_replication.credentials="postgresql://loader:loader@localhost:5432/dlt_data" + ## chess pipeline # the section below defines secrets for "chess_dlt_config_example" source in chess/__init__.py [sources.chess] -secret_str="secret string" # a string secret +secret_str="secret string" # a string secret \ No newline at end of file From d3774231de85452e9fbb05eef13727469f038a02 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 14 Jan 2025 20:36:32 +0100 Subject: [PATCH 76/88] feat: added basic support for scalar array types --- sources/pg_legacy_replication/schema_types.py | 28 +++++++ tests/pg_legacy_replication/cases.py | 80 ++++++++++++++++++- 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index d2cec4ba0..cd565e0ef 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ 
b/sources/pg_legacy_replication/schema_types.py @@ -116,6 +116,8 @@ def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: pg_type = _PG_TYPES.get(type_id) if pg_type in _MISSING_TYPES: return {"data_type": _MISSING_TYPES[pg_type]} + if modifier and modifier.endswith("[]"): + return {"data_type": "json"} if pg_type is None: logger.warning( "No type found for type_id '%s' and modifier '%s'", type_id, modifier @@ -181,4 +183,30 @@ def _to_dlt_val( return data_type_handlers[data_type](raw_value) raw_type = _DATUM_RAW_TYPES[datum] + if _is_scalar_pg_array(data_type, raw_type, raw_value): + raw_type, raw_value = "text", _pg_array_to_json_str(raw_value) + return coerce_value(data_type, raw_type, raw_value) + + +def _is_scalar_pg_array( + data_type: TDataType, raw_type: TDataType, raw_value: Any +) -> bool: + return ( + data_type == "json" + and raw_type == "binary" + and raw_value.startswith(b"{") + and raw_value.endswith(b"}") + ) + + +def _pg_array_to_json_str(raw_value: bytes) -> str: + """ + Decode the byte string to a regular string and strip the curly braces + """ + content = raw_value[1:-1].decode() + csv = ",".join( + f'"{element}"' if element.isalpha() else element + for element in content.split(",") + ) + return f"[{csv}]" diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 5bcf23298..08440911e 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -1,8 +1,8 @@ +from base64 import b64encode from typing import List import pendulum from dlt.common import Decimal -from dlt.common.data_types.typing import DATA_TYPES from dlt.common.schema import TColumnSchema, TTableSchema, TTableSchemaColumns from dlt.common.typing import TDataItem @@ -259,6 +259,49 @@ }, ], }, + { + "transactionId": 754, + "commitTime": "1736873892023448", + "table": "src_pl_dataset_202501140458116348.data_types", + "op": "INSERT", + "newTuple": [ + {"columnName": "bit_col", "columnType": 1560, "datumString": "1"}, + { + "columnName": "box_col", + "columnType": 603, + "datumBytes": b64encode(b"(1,1),(0,0)").decode(), + }, + { + "columnName": "uuid_col", + "columnType": 2950, + "datumString": "6e1f5de1-1093-4bfe-98e4-62ac56b2db54", + }, + { + "columnName": "text_a", + "columnType": 1009, + "datumBytes": b64encode(b"{a,b}").decode(), + }, + ], + "newTypeinfo": [ + { + "modifier": "bit(1)", + "valueOptional": True, + }, + { + "modifier": "box", + "valueOptional": True, + }, + { + "modifier": "uuid", + "valueOptional": True, + }, + { + "modifier": "text[]", + "valueOptional": True, + }, + ], + "oldTuple": [], + }, ] DATA_ITEMS: List[TDataItem] = [ @@ -309,6 +352,15 @@ "_pg_commit_ts": pendulum.parse("2024-10-19T00:56:23.354856+00:00"), "_pg_tx_id": 932, }, + { + "bit_col": "1", + "box_col": "KDEsMSksKDAsMCk=", + "uuid_col": "6e1f5de1-1093-4bfe-98e4-62ac56b2db54", + "text_a": ["a", "b"], + "_pg_lsn": 1, + "_pg_commit_ts": pendulum.parse("2025-01-14T16:58:12.023448+00:00"), + "_pg_tx_id": 754, + }, ] TABLE_SCHEMAS: List[TTableSchema] = [ @@ -417,4 +469,30 @@ }, }, }, + { + "name": "data_types", + "columns": { + "bit_col": {"data_type": "text", "name": "bit_col", "nullable": True}, + "box_col": {"data_type": "text", "name": "box_col", "nullable": True}, + "uuid_col": {"data_type": "text", "name": "uuid_col", "nullable": True}, + "text_a": {"data_type": "json", "name": "text_a", "nullable": True}, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { + "data_type": "timestamp", + 
"name": "_pg_deleted_ts", + "nullable": True, + }, + "_pg_commit_ts": { + "data_type": "timestamp", + "name": "_pg_commit_ts", + "nullable": True, + }, + "_pg_tx_id": { + "data_type": "bigint", + "name": "_pg_tx_id", + "nullable": True, + "precision": 32, + }, + }, + }, ] From acdf4469aaf8455c541a994fa6ad53a307443041 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 15 Jan 2025 12:43:14 +0100 Subject: [PATCH 77/88] chore: slight perf improvments for pg_arrays --- sources/pg_legacy_replication/schema_types.py | 24 ++++++++----------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index cd565e0ef..5389e10e3 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -183,20 +183,19 @@ def _to_dlt_val( return data_type_handlers[data_type](raw_value) raw_type = _DATUM_RAW_TYPES[datum] - if _is_scalar_pg_array(data_type, raw_type, raw_value): - raw_type, raw_value = "text", _pg_array_to_json_str(raw_value) + if raw_type == "binary" and _is_scalar_pg_array(data_type, raw_value): + raw_type = "text" + raw_value = _pg_array_to_json_str(raw_value) return coerce_value(data_type, raw_type, raw_value) -def _is_scalar_pg_array( - data_type: TDataType, raw_type: TDataType, raw_value: Any -) -> bool: +def _is_scalar_pg_array(data_type: TDataType, raw_value: bytes) -> bool: return ( - data_type == "json" - and raw_type == "binary" - and raw_value.startswith(b"{") - and raw_value.endswith(b"}") + len(raw_value) > 1 + and data_type == "json" + and raw_value[0] == ord("{") + and raw_value[-1] == ord("}") ) @@ -204,9 +203,6 @@ def _pg_array_to_json_str(raw_value: bytes) -> str: """ Decode the byte string to a regular string and strip the curly braces """ - content = raw_value[1:-1].decode() - csv = ",".join( - f'"{element}"' if element.isalpha() else element - for element in content.split(",") - ) + without_braces = raw_value[1:-1].decode() + csv = ",".join(f'"{x}"' if x.isalpha() else x for x in without_braces.split(",")) return f"[{csv}]" From a3dc99df0547a4e261e550eb1fd4e3f58c4821ad Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 16 Jan 2025 09:44:53 +0100 Subject: [PATCH 78/88] fix: it turns out pg_arrays are annoying found temp workaround --- sources/pg_legacy_replication/schema_types.py | 20 ++++++++++++------- tests/pg_legacy_replication/cases.py | 6 ++++-- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 5389e10e3..e93ddb1c3 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -1,6 +1,7 @@ +import json import re from functools import lru_cache -from typing import Any, Callable, Dict, Optional, Tuple +from typing import Any, Callable, List, Dict, Optional, Tuple import pendulum from dlt.common import Decimal, logger @@ -184,8 +185,7 @@ def _to_dlt_val( raw_type = _DATUM_RAW_TYPES[datum] if raw_type == "binary" and _is_scalar_pg_array(data_type, raw_value): - raw_type = "text" - raw_value = _pg_array_to_json_str(raw_value) + return _pg_array_to_json_array(raw_value) return coerce_value(data_type, raw_type, raw_value) @@ -199,10 +199,16 @@ def _is_scalar_pg_array(data_type: TDataType, raw_value: bytes) -> bool: ) -def _pg_array_to_json_str(raw_value: bytes) -> str: +def _pg_array_to_json_array(raw_value: bytes) -> List[Any]: """ - Decode the byte string 
to a regular string and strip the curly braces + Decode the byte string into a scalar array """ without_braces = raw_value[1:-1].decode() - csv = ",".join(f'"{x}"' if x.isalpha() else x for x in without_braces.split(",")) - return f"[{csv}]" + + def safe_load(x: str) -> Any: + try: + return json.loads(x) + except json.JSONDecodeError: + return x + + return [safe_load(x) for x in without_braces.split(",")] diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 08440911e..337418ad7 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -279,7 +279,9 @@ { "columnName": "text_a", "columnType": 1009, - "datumBytes": b64encode(b"{a,b}").decode(), + "datumBytes": b64encode( + b'{"Network administration",GNS3,BGP}' + ).decode(), }, ], "newTypeinfo": [ @@ -356,7 +358,7 @@ "bit_col": "1", "box_col": "KDEsMSksKDAsMCk=", "uuid_col": "6e1f5de1-1093-4bfe-98e4-62ac56b2db54", - "text_a": ["a", "b"], + "text_a": ["Network administration", "GNS3", "BGP"], "_pg_lsn": 1, "_pg_commit_ts": pendulum.parse("2025-01-14T16:58:12.023448+00:00"), "_pg_tx_id": 754, From c9c5bcbf2d22822c63211ea6cd1b037f2a135e82 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 22 Jan 2025 14:23:49 +0100 Subject: [PATCH 79/88] refactor: all sqlalchemy event code is done at engine configuration --- sources/pg_legacy_replication/helpers.py | 36 +++++++++++++++--------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index d9a076332..76be86481 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -133,19 +133,9 @@ def init_replication( assert table_names is not None - engine = _configure_engine(credentials, rep_conn) - - @sa.event.listens_for(engine, "begin") - def on_begin(conn: sa.Connection) -> None: - cur = conn.connection.cursor() - if slot is None: - # Using the same isolation level that pg_backup uses - cur.execute( - "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE, READ ONLY, DEFERRABLE" - ) - else: - cur.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ") - cur.execute(f"SET TRANSACTION SNAPSHOT '{slot['snapshot_name']}'") + engine = _configure_engine( + credentials, rep_conn, slot.get("snapshot_name") if slot else None + ) table_names = [table_names] if isinstance(table_names, str) else table_names or [] @@ -155,16 +145,34 @@ def on_begin(conn: sa.Connection) -> None: def _configure_engine( - credentials: ConnectionStringCredentials, rep_conn: LogicalReplicationConnection + credentials: ConnectionStringCredentials, + rep_conn: LogicalReplicationConnection, + snapshot_name: Optional[str], ) -> Engine: """ Configures the SQLAlchemy engine. Also attaches the replication connection in order to prevent it being garbage collected and closed. + + Args: + snapshot_name (str, optional): This is used during the initial first table snapshot allowing + all transactions to run with the same consistent snapshot. 
""" engine: Engine = engine_from_credentials(credentials) engine.execution_options(stream_results=True, max_row_buffer=2 * 50000) setattr(engine, "rep_conn", rep_conn) # noqa + @sa.event.listens_for(engine, "begin") + def on_begin(conn: sa.Connection) -> None: + cur = conn.connection.cursor() + if snapshot_name is None: + # Using the same isolation level that pg_backup uses + cur.execute( + "SET TRANSACTION ISOLATION LEVEL SERIALIZABLE, READ ONLY, DEFERRABLE" + ) + else: + cur.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ") + cur.execute(f"SET TRANSACTION SNAPSHOT '{snapshot_name}'") + @sa.event.listens_for(engine, "engine_disposed") def on_engine_disposed(e: Engine) -> None: delattr(e, "rep_conn") From d695afb1e67ba60bf613cbd6b017c334ea00c5f7 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 22 Jan 2025 17:14:00 +0100 Subject: [PATCH 80/88] chore: bumped python to 3.9; small refactorings --- poetry.lock | 67 +---------------------- pyproject.toml | 3 +- sources/pg_legacy_replication/__init__.py | 2 +- sources/pg_legacy_replication/helpers.py | 56 +++++++++---------- 4 files changed, 32 insertions(+), 96 deletions(-) diff --git a/poetry.lock b/poetry.lock index b10a9c72b..4096d4472 100644 --- a/poetry.lock +++ b/poetry.lock @@ -403,34 +403,6 @@ files = [ {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] -[[package]] -name = "backports-zoneinfo" -version = "0.2.1" -description = "Backport of the standard library zoneinfo module" -optional = false -python-versions = ">=3.6" -files = [ - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:da6013fd84a690242c310d77ddb8441a559e9cb3d3d59ebac9aca1a57b2e18bc"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:89a48c0d158a3cc3f654da4c2de1ceba85263fafb861b98b59040a5086259722"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:1c5742112073a563c81f786e77514969acb58649bcdf6cdf0b4ed31a348d4546"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-win32.whl", hash = "sha256:e8236383a20872c0cdf5a62b554b27538db7fa1bbec52429d8d106effbaeca08"}, - {file = "backports.zoneinfo-0.2.1-cp36-cp36m-win_amd64.whl", hash = "sha256:8439c030a11780786a2002261569bdf362264f605dfa4d65090b64b05c9f79a7"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:f04e857b59d9d1ccc39ce2da1021d196e47234873820cbeaad210724b1ee28ac"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:17746bd546106fa389c51dbea67c8b7c8f0d14b5526a579ca6ccf5ed72c526cf"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:5c144945a7752ca544b4b78c8c41544cdfaf9786f25fe5ffb10e838e19a27570"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-win32.whl", hash = "sha256:e55b384612d93be96506932a786bbcde5a2db7a9e6a4bb4bffe8b733f5b9036b"}, - {file = "backports.zoneinfo-0.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a76b38c52400b762e48131494ba26be363491ac4f9a04c1b7e92483d169f6582"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:8961c0f32cd0336fb8e8ead11a1f8cd99ec07145ec2931122faaac1c8f7fd987"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e81b76cace8eda1fca50e345242ba977f9be6ae3945af8d46326d776b4cf78d1"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7b0a64cda4145548fed9efc10322770f929b944ce5cee6c0dfe0c87bf4c0c8c9"}, - 
{file = "backports.zoneinfo-0.2.1-cp38-cp38-win32.whl", hash = "sha256:1b13e654a55cd45672cb54ed12148cd33628f672548f373963b0bff67b217328"}, - {file = "backports.zoneinfo-0.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:4a0f800587060bf8880f954dbef70de6c11bbe59c673c3d818921f042f9954a6"}, - {file = "backports.zoneinfo-0.2.1.tar.gz", hash = "sha256:fadbfe37f74051d024037f223b8e001611eac868b5c5b06144ef4d8b799862f2"}, -] - -[package.extras] -tzdata = ["tzdata"] - [[package]] name = "bandit" version = "1.7.5" @@ -730,7 +702,6 @@ files = [ clickhouse-connect = ">=0.5.7" duckdb = ">=0.7.1" fastapi = "0.85.1" -graphlib-backport = {version = ">=1.0.3", markers = "python_version < \"3.9\""} hnswlib = ">=0.7" numpy = ">=1.21.6" onnxruntime = ">=1.14.1" @@ -1186,7 +1157,6 @@ gcsfs = {version = ">=2022.4.0", optional = true, markers = "extra == \"gcp\" or gitpython = ">=3.1.29" giturlparse = ">=0.10.0" google-cloud-bigquery = {version = ">=2.26.0", optional = true, markers = "extra == \"gcp\" or extra == \"bigquery\""} -graphlib-backport = {version = "*", markers = "python_version < \"3.9\""} grpcio = {version = ">=1.50.0", optional = true, markers = "extra == \"gcp\" or extra == \"bigquery\""} hexbytes = ">=0.2.2" humanize = ">=4.4.0" @@ -1272,7 +1242,6 @@ files = [ ] [package.dependencies] -importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.9\""} natsort = ">=7.0.1" typing-extensions = ">=3.7.4.1" @@ -2043,17 +2012,6 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4 [package.extras] grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] -[[package]] -name = "graphlib-backport" -version = "1.0.3" -description = "Backport of the Python 3.9 graphlib module for Python 3.6+" -optional = false -python-versions = ">=3.6,<4.0" -files = [ - {file = "graphlib_backport-1.0.3-py3-none-any.whl", hash = "sha256:24246967b9e7e6a91550bc770e6169585d35aa32790258579a8a3899a8c18fde"}, - {file = "graphlib_backport-1.0.3.tar.gz", hash = "sha256:7bb8fc7757b8ae4e6d8000a26cd49e9232aaa9a3aa57edb478474b8424bfaae2"}, -] - [[package]] name = "greenlet" version = "2.0.2" @@ -2420,24 +2378,6 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker perf = ["ipython"] testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] -[[package]] -name = "importlib-resources" -version = "6.4.0" -description = "Read resources from Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "importlib_resources-6.4.0-py3-none-any.whl", hash = "sha256:50d10f043df931902d4194ea07ec57960f66a80449ff867bfe782b4c486ba78c"}, - {file = "importlib_resources-6.4.0.tar.gz", hash = "sha256:cdb2b453b8046ca4e3798eb1d84f3cce1446a0e8e7b5ef4efb600f19fc398145"}, -] - -[package.dependencies] -zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["jaraco.test (>=5.4)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "zipp (>=3.17)"] - [[package]] name = "incremental" version = "22.10.0" @@ -3714,8 +3654,6 @@ files = [ ] [package.dependencies] -"backports.zoneinfo" = {version = ">=0.2.1", markers = "python_version < 
\"3.9\""} -importlib-resources = {version = ">=5.9.0", markers = "python_version < \"3.9\""} python-dateutil = ">=2.6" tzdata = ">=2020.1" @@ -5026,7 +4964,6 @@ files = [ [package.dependencies] markdown-it-py = ">=2.2.0" pygments = ">=2.13.0,<3.0.0" -typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9\""} [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] @@ -6600,5 +6537,5 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" -python-versions = ">=3.8.1,<3.13" -content-hash = "38baefccadc2b1ebc8c7b4f8702035dd2c60d88c5cd582b148a1cd01c52860ca" +python-versions = ">=3.9,<3.13" +content-hash = "6a657c817cec2ef5e110c455fd86ec73ce82e1e97dea77613ba4400238608594" diff --git a/pyproject.toml b/pyproject.toml index 75beddcc8..13beebf21 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,9 +11,8 @@ readme = "README.md" packages = [{include = "sources"}] [tool.poetry.dependencies] -python = ">=3.8.1,<3.13" +python = ">=3.9,<3.13" dlt = {version = "1.3.0", allow-prereleases = true, extras = ["redshift", "bigquery", "postgres", "duckdb"]} -graphlib-backport = {version = "*", python = "<3.9"} [tool.poetry.group.dltpure.dependencies] dlt = {version = "1.3.0", allow-prereleases = true} diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 8c8778ba3..d31ef8d25 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -85,7 +85,7 @@ def replication_source( def replication_resource(slot_name: str) -> Iterable[TDataItem]: # start where we left off in previous run start_lsn = dlt.current.resource_state().get("last_commit_lsn", 0) - if flush_slot: + if flush_slot and start_lsn > 0: advance_slot(start_lsn, slot_name, credentials) # continue until last message in replication slot diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 76be86481..f9e11330a 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -231,17 +231,17 @@ def get_max_lsn(credentials: ConnectionStringCredentials) -> Optional[int]: Returns None if the replication slot is empty. Does not consume the slot, i.e. messages are not flushed. """ - cur = _get_conn(credentials).cursor() - loc_fn = ( - "pg_current_xlog_location" - if get_pg_version(cur) < 100000 - else "pg_current_wal_lsn" - ) - # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) - cur.execute(f"SELECT {loc_fn}() - '0/0' as max_lsn;") - lsn: int = cur.fetchone()[0] - cur.connection.close() - return lsn + with _get_conn(credentials) as conn: + cur = conn.cursor() + loc_fn = ( + "pg_current_xlog_location" + if get_pg_version(cur) < 100000 + else "pg_current_wal_lsn" + ) + # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) + cur.execute(f"SELECT {loc_fn}() - '0/0' as max_lsn;") + lsn: int = cur.fetchone()[0] + return lsn def lsn_int_to_hex(lsn: int) -> str: @@ -262,13 +262,14 @@ def advance_slot( This function is used as alternative to psycopg2's `send_feedback` method, because the behavior of that method seems odd when used outside of `consume_stream`. 
""" - if upto_lsn != 0: - cur = _get_conn(credentials).cursor() + assert upto_lsn > 0 + with _get_conn(credentials) as conn: + cur = conn.cursor() + # There is unfortunately no way in pg9.6 to manually advance the replication slot if get_pg_version(cur) > 100000: cur.execute( f"SELECT * FROM pg_replication_slot_advance('{slot_name}', '{lsn_int_to_hex(upto_lsn)}');" ) - cur.connection.close() def _get_conn( @@ -452,20 +453,19 @@ def __iter__(self) -> Iterator[TableItems]: Maintains LSN of last consumed commit message in object state. Advances the slot only when all messages have been consumed. """ - cur = _get_rep_conn(self.credentials).cursor() - cur.start_replication(slot_name=self.slot_name, start_lsn=self.start_lsn) - consumer = MessageConsumer( - upto_lsn=self.upto_lsn, - table_qnames=self.table_qnames, - repl_options=self.repl_options, - target_batch_size=self.target_batch_size, - ) - try: - cur.consume_stream(consumer) - except StopReplication: # completed batch or reached `upto_lsn` - yield from self.flush_batch(cur, consumer) - finally: - cur.connection.close() + with _get_rep_conn(self.credentials) as conn: + cur = conn.cursor() + cur.start_replication(slot_name=self.slot_name, start_lsn=self.start_lsn) + consumer = MessageConsumer( + upto_lsn=self.upto_lsn, + table_qnames=self.table_qnames, + repl_options=self.repl_options, + target_batch_size=self.target_batch_size, + ) + try: + cur.consume_stream(consumer) + except StopReplication: # completed batch or reached `upto_lsn` + yield from self.flush_batch(cur, consumer) def flush_batch( self, cur: ReplicationCursor, consumer: MessageConsumer From 8f45283beedc095270a723c5ac2c764618da254b Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 22 Jan 2025 17:50:59 +0100 Subject: [PATCH 81/88] refactor: init_replication is now in pkg ns --- sources/pg_legacy_replication/__init__.py | 79 ++++++++++++++++++- sources/pg_legacy_replication/helpers.py | 79 +------------------ sources/pg_legacy_replication/schema_types.py | 2 +- sources/pg_legacy_replication_pipeline.py | 3 +- 4 files changed, 82 insertions(+), 81 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index d31ef8d25..1968e7883 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -1,21 +1,26 @@ """Replicates postgres tables in batch using logical decoding.""" -from typing import Any, Callable, Dict, Iterable, Mapping, Optional, Sequence, Union +from collections import defaultdict +from typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Union import dlt from dlt.extract import DltResource from dlt.extract.items import TDataItem from dlt.sources.credentials import ConnectionStringCredentials -from collections import defaultdict +from dlt.sources.sql_database import sql_table from .helpers import ( BackendHandler, ItemGenerator, ReplicationOptions, + SqlTableOptions, advance_slot, cleanup_snapshot_resources, + configure_engine, + create_replication_slot, + drop_replication_slot, get_max_lsn, - init_replication, + get_rep_conn, ) @@ -132,6 +137,74 @@ def _create_table_dispatch( return handler +@dlt.source +def init_replication( + slot_name: str, + schema: str, + table_names: Optional[Union[str, Sequence[str]]] = None, + credentials: ConnectionStringCredentials = dlt.secrets.value, + take_snapshots: bool = False, + table_options: Optional[Mapping[str, SqlTableOptions]] = None, + reset: bool = False, +) -> Iterable[DltResource]: + """ + Initializes a 
replication session for Postgres using logical replication. + Optionally, snapshots of specified tables can be taken during initialization. + + Args: + slot_name (str): + The name of the logical replication slot to be used or created. + schema (str): + Name of the schema to replicate tables from. + table_names (Optional[Union[str, Sequence[str]]]): + The name(s) of the table(s) to replicate. Can be a single table name or a list of table names. + If not provided, no tables will be replicated unless `take_snapshots` is `True`. + credentials (ConnectionStringCredentials): + Database credentials for connecting to the Postgres instance. + take_snapshots (bool): + Whether to take initial snapshots of the specified tables. + Defaults to `False`. + table_options (Optional[Mapping[str, SqlTableOptions]]): + Additional options for configuring replication for specific tables. + These are the exact same parameters for the `dlt.sources.sql_database.sql_table` function. + Argument is only used if `take_snapshots` is `True`. + reset (bool, optional): + If `True`, drops the existing replication slot before creating a new one. + Use with caution, as this will clear existing replication state. + Defaults to `False`. + + Returns: + - None if `take_snapshots` is `False` + - a list of `DltResource` objects for the snapshot table(s) if `take_snapshots` is `True`. + + Notes: + - If `reset` is `True`, the existing replication slot will be dropped before creating a new one. + - When `take_snapshots` is `True`, the function configures a snapshot isolation level for consistent table snapshots. + """ + rep_conn = get_rep_conn(credentials) + rep_cur = rep_conn.cursor() + if reset: + drop_replication_slot(slot_name, rep_cur) + slot = create_replication_slot(slot_name, rep_cur) + + # Close connection if no snapshots are needed + if not take_snapshots: + rep_conn.close() + return + + assert table_names is not None + + engine = configure_engine( + credentials, rep_conn, slot.get("snapshot_name") if slot else None + ) + + table_names = [table_names] if isinstance(table_names, str) else table_names or [] + + for table in table_names: + table_args = (table_options or {}).get(table, {}).copy() + yield sql_table(credentials=engine, table=table, schema=schema, **table_args) + + __all__ = [ "ReplicationOptions", "cleanup_snapshot_resources", diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index f9e11330a..8fbc169cb 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -15,7 +15,6 @@ Sequence, Set, TypedDict, - Union, ) import dlt @@ -26,7 +25,7 @@ from dlt.common.schema.typing import TColumnSchema, TTableSchema, TTableSchemaColumns from dlt.common.schema.utils import merge_column from dlt.common.typing import TDataItem -from dlt.extract import DltResource, DltSource +from dlt.extract import DltSource from dlt.extract.items import DataItemWithMeta from dlt.sources.credentials import ConnectionStringCredentials from dlt.sources.sql_database import ( @@ -36,7 +35,6 @@ TTypeAdapter, arrow_helpers as arrow, engine_from_credentials, - sql_table, ) from psycopg2.extensions import connection as ConnectionExt, cursor from psycopg2.extras import ( @@ -75,76 +73,7 @@ class SqlTableOptions(TypedDict, total=False): type_adapter_callback: Optional[TTypeAdapter] -@dlt.sources.config.with_config(sections=("sources", "pg_legacy_replication")) -@dlt.source -def init_replication( - slot_name: str, - schema: str, - table_names: Optional[Union[str, 
Sequence[str]]] = None, - credentials: ConnectionStringCredentials = dlt.secrets.value, - take_snapshots: bool = False, - table_options: Optional[Mapping[str, SqlTableOptions]] = None, - reset: bool = False, -) -> Iterable[DltResource]: - """ - Initializes a replication session for Postgres using logical replication. - Optionally, snapshots of specified tables can be taken during initialization. - - Args: - slot_name (str): - The name of the logical replication slot to be used or created. - schema (str): - Name of the schema to replicate tables from. - table_names (Optional[Union[str, Sequence[str]]]): - The name(s) of the table(s) to replicate. Can be a single table name or a list of table names. - If not provided, no tables will be replicated unless `take_snapshots` is `True`. - credentials (ConnectionStringCredentials): - Database credentials for connecting to the Postgres instance. - take_snapshots (bool): - Whether to take initial snapshots of the specified tables. - Defaults to `False`. - table_options (Optional[Mapping[str, SqlTableOptions]]): - Additional options for configuring replication for specific tables. - These are the exact same parameters for the `dlt.sources.sql_database.sql_table` function. - Argument is only used if `take_snapshots` is `True`. - reset (bool, optional): - If `True`, drops the existing replication slot before creating a new one. - Use with caution, as this will clear existing replication state. - Defaults to `False`. - - Returns: - - None if `take_snapshots` is `False` - - a list of `DltResource` objects for the snapshot table(s) if `take_snapshots` is `True`. - - Notes: - - If `reset` is `True`, the existing replication slot will be dropped before creating a new one. - - When `take_snapshots` is `True`, the function configures a snapshot isolation level for consistent table snapshots. - """ - rep_conn = _get_rep_conn(credentials) - rep_cur = rep_conn.cursor() - if reset: - drop_replication_slot(slot_name, rep_cur) - slot = create_replication_slot(slot_name, rep_cur) - - # Close connection if no snapshots are needed - if not take_snapshots: - rep_conn.close() - return - - assert table_names is not None - - engine = _configure_engine( - credentials, rep_conn, slot.get("snapshot_name") if slot else None - ) - - table_names = [table_names] if isinstance(table_names, str) else table_names or [] - - for table in table_names: - table_args = (table_options or {}).get(table, {}).copy() - yield sql_table(credentials=engine, table=table, schema=schema, **table_args) - - -def _configure_engine( +def configure_engine( credentials: ConnectionStringCredentials, rep_conn: LogicalReplicationConnection, snapshot_name: Optional[str], @@ -288,7 +217,7 @@ def _get_conn( ) -def _get_rep_conn( +def get_rep_conn( credentials: ConnectionStringCredentials, ) -> LogicalReplicationConnection: """ @@ -453,7 +382,7 @@ def __iter__(self) -> Iterator[TableItems]: Maintains LSN of last consumed commit message in object state. Advances the slot only when all messages have been consumed. 
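
With `init_replication` now exposed at the package level (matching the pipeline script's imports further down in this patch), a typical two-phase run looks roughly like the sketch below. The pipeline name, destination, schema and table names are illustrative placeholders, and database credentials are assumed to be resolved from `dlt` secrets/config:

```python
import dlt
from pg_legacy_replication import (
    cleanup_snapshot_resources,
    init_replication,
    replication_source,
)

# Phase 1: create the replication slot and load initial snapshots
# (credentials are picked up from dlt secrets / config).
snapshots = init_replication(
    slot_name="demo_slot",   # placeholder
    schema="public",         # placeholder
    table_names=("items",),
    take_snapshots=True,
)
pipeline = dlt.pipeline("repl_demo", destination="duckdb")  # placeholder destination
pipeline.run(snapshots)
cleanup_snapshot_resources(snapshots)

# Phase 2 (and every run after): stream the changes accumulated in the slot.
changes = replication_source(
    slot_name="demo_slot",
    schema="public",
    table_names=("items",),
)
pipeline.run(changes)
```
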
""" - with _get_rep_conn(self.credentials) as conn: + with get_rep_conn(self.credentials) as conn: cur = conn.cursor() cur.start_replication(slot_name=self.slot_name, start_lsn=self.start_lsn) consumer = MessageConsumer( diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index e93ddb1c3..418c8b6a8 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -1,7 +1,7 @@ import json import re from functools import lru_cache -from typing import Any, Callable, List, Dict, Optional, Tuple +from typing import Any, Callable, Dict, List, Optional, Tuple import pendulum from dlt.common import Decimal, logger diff --git a/sources/pg_legacy_replication_pipeline.py b/sources/pg_legacy_replication_pipeline.py index b77f65188..be38414bc 100644 --- a/sources/pg_legacy_replication_pipeline.py +++ b/sources/pg_legacy_replication_pipeline.py @@ -2,8 +2,7 @@ from dlt.common.destination import Destination from dlt.destinations.impl.postgres.configuration import PostgresCredentials -from pg_legacy_replication import replication_source -from pg_legacy_replication.helpers import init_replication +from pg_legacy_replication import init_replication, replication_source PG_CREDS = dlt.secrets.get("sources.pg_replication.credentials", PostgresCredentials) From 41f8ded561a5c3ee4dd3bd52575ae34338d6e260 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 29 Jan 2025 00:23:26 +0100 Subject: [PATCH 82/88] fix: corrected bugs regarding inferring nullability wrong; refactored tests used to choose best schema --- sources/pg_legacy_replication/helpers.py | 19 +- sources/pg_legacy_replication/schema_types.py | 18 +- tests/pg_legacy_replication/cases.py | 515 +++++++++++++++++- tests/pg_legacy_replication/test_helpers.py | 225 +------- 4 files changed, 552 insertions(+), 225 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 8fbc169cb..a821cd548 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -569,7 +569,7 @@ def gen_data_item( col_name = _actual_column_name(data) if not included_columns or col_name in included_columns: data_item[col_name] = _to_dlt_val( - data, column_schema[col_name]["data_type"], for_delete=is_delete + data, column_schema[col_name], for_delete=is_delete ) return data_item @@ -601,19 +601,18 @@ def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: precise one if they are relatively equal or else raises a AssertionError due to an incompatible schema change """ - table_name = last["name"] - assert table_name == new["name"], "Table names do not match" + assert last["name"] == new["name"], "Table names do not match" - table_schema = TTableSchema(name=table_name, columns={}) + table_schema = TTableSchema(name=last["name"], columns={}) last_cols, new_cols = last["columns"], new["columns"] assert len(last_cols) == len( new_cols - ), f"Columns mismatch last:{last['columns']} new:{new['columns']}" + ), f"Columns mismatch last:{last_cols} new:{new_cols}" for name, s1 in last_cols.items(): s2 = new_cols.get(name) assert ( - s2 is not None and s1["data_type"] == s2["data_type"] + s2 and s1["data_type"] == s2["data_type"] ), f"Incompatible schema for column '{name}'" # Ensure new has no fields outside allowed fields @@ -623,7 +622,13 @@ def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: # Select the more precise schema by comparing nullable, 
precision, and scale col_schema = TColumnSchema(name=name, data_type=s1["data_type"]) if "nullable" in s1 or "nullable" in s2: - col_schema["nullable"] = s1.get("nullable", s2.get("nullable")) + # Get nullable values (could be True, False, or None) + s1_null = s1.get("nullable") + s2_null = s2.get("nullable") + if s1_null is not None and s2_null is not None: + col_schema["nullable"] = s1_null or s2_null # Default is True + else: + col_schema["nullable"] = s1_null if s1_null is not None else s2_null if "precision" in s1 or "precision" in s2: col_schema["precision"] = s1.get("precision", s2.get("precision")) if "scale" in s1 or "scale" in s2: diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 418c8b6a8..355dd31d4 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -143,8 +143,6 @@ def _to_dlt_column_schema( # Set nullable attribute if type_info is available if type_info: column_schema["nullable"] = type_info.value_optional - elif datum.WhichOneof("datum"): # Or simply guess as this is a very rare case - column_schema["nullable"] = False return column_schema @@ -170,13 +168,16 @@ def _epoch_days_to_date(epoch_days: int) -> pendulum.Date: def _to_dlt_val( - val: DatumMessage, data_type: TDataType, *, for_delete: bool = False + val: DatumMessage, col_schema: TColumnSchema, *, for_delete: bool = False ) -> Any: """Converts decoderbuf's datum value into dlt-compatible data value.""" - datum = val.WhichOneof("datum") + data_type = col_schema["data_type"] + assert data_type is not None + datum = _get_datum_attr(val) if datum is None: - return _DUMMY_VALS[data_type] if for_delete else None - if datum == "datum_missing": + nullable = col_schema.get("nullable", False) + if for_delete and not nullable: + return _DUMMY_VALS[data_type] return None raw_value = getattr(val, datum) @@ -212,3 +213,8 @@ def safe_load(x: str) -> Any: return x return [safe_load(x) for x in without_braces.split(",")] + + +def _get_datum_attr(val: DatumMessage) -> Optional[str]: + datum = val.WhichOneof("datum") + return None if datum is None and datum == "datum_missing" else datum diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 337418ad7..e55456c83 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -1,5 +1,6 @@ from base64 import b64encode -from typing import List +from enum import IntEnum +from typing import List, Tuple import pendulum from dlt.common import Decimal @@ -433,12 +434,7 @@ { "name": "tbl_x", "columns": { - "id_x": { - "data_type": "bigint", - "name": "id_x", - "precision": 64, - "nullable": False, - }, + "id_x": {"data_type": "bigint", "name": "id_x", "precision": 64}, "val_x": {"data_type": "text", "name": "val_x"}, "col_bool": {"data_type": "bool", "name": "col_bool"}, "col_bytea": {"data_type": "binary", "name": "col_bytea"}, @@ -498,3 +494,508 @@ }, }, ] + + +class SchemaChoice(IntEnum): + first = 0 + second = 1 + error = -1 + + +SIMILAR_SCHEMAS: List[Tuple[TTableSchema, TTableSchema, SchemaChoice]] = [ + ( + { + "name": "items", + "columns": { + "col1": { + "name": "col1", + "data_type": "bigint", + "precision": 64, + "nullable": False, + }, + "col2": {"name": "col2", "data_type": "double", "nullable": False}, + "col3": {"name": "col3", "data_type": "bool", "nullable": False}, + "col4": {"name": "col4", "data_type": "timestamp", "nullable": False}, + "col5": {"name": "col5", "data_type": "text", 
"nullable": False}, + "col6": { + "name": "col6", + "data_type": "decimal", + "precision": 38, + "scale": 9, + "nullable": False, + }, + "col7": {"name": "col7", "data_type": "binary", "nullable": False}, + "col9": {"name": "col9", "data_type": "json", "nullable": False}, + "col10": {"name": "col10", "data_type": "date", "nullable": False}, + "col11": {"name": "col11", "data_type": "time", "nullable": False}, + "col1_null": { + "name": "col1_null", + "data_type": "bigint", + "precision": 64, + "nullable": True, + }, + "col2_null": { + "name": "col2_null", + "data_type": "double", + "nullable": True, + }, + "col3_null": { + "name": "col3_null", + "data_type": "bool", + "nullable": True, + }, + "col4_null": { + "name": "col4_null", + "data_type": "timestamp", + "nullable": True, + }, + "col5_null": { + "name": "col5_null", + "data_type": "text", + "nullable": True, + }, + "col6_null": { + "name": "col6_null", + "data_type": "decimal", + "precision": 38, + "scale": 9, + "nullable": True, + }, + "col7_null": { + "name": "col7_null", + "data_type": "binary", + "nullable": True, + }, + "col9_null": { + "name": "col9_null", + "data_type": "json", + "nullable": True, + }, + "col10_null": { + "name": "col10_null", + "data_type": "date", + "nullable": True, + }, + "col11_null": { + "name": "col11_null", + "data_type": "time", + "nullable": True, + }, + "col1_precision": { + "name": "col1_precision", + "data_type": "bigint", + "precision": 16, + "nullable": False, + }, + "col4_precision": { + "name": "col4_precision", + "data_type": "timestamp", + "precision": 3, + "nullable": False, + }, + "col5_precision": { + "name": "col5_precision", + "data_type": "text", + "precision": 25, + "nullable": False, + }, + "col6_precision": { + "name": "col6_precision", + "data_type": "decimal", + "precision": 6, + "scale": 2, + "nullable": False, + }, + "col7_precision": { + "name": "col7_precision", + "data_type": "binary", + "nullable": False, + }, + "col11_precision": { + "name": "col11_precision", + "data_type": "time", + "precision": 3, + "nullable": False, + }, + "_dlt_load_id": { + "name": "_dlt_load_id", + "data_type": "text", + "nullable": False, + }, + "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False}, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + }, + }, + { + "name": "items", + "columns": { + "col1": { + "name": "col1", + "data_type": "bigint", + "precision": 64, + "nullable": False, + }, + "col2": {"name": "col2", "data_type": "double"}, + "col3": {"name": "col3", "data_type": "bool"}, + "col4": {"name": "col4", "data_type": "timestamp"}, + "col5": {"name": "col5", "data_type": "text"}, + "col6": {"name": "col6", "data_type": "decimal"}, + "col7": {"name": "col7", "data_type": "binary"}, + "col9": {"name": "col9", "data_type": "json"}, + "col10": {"name": "col10", "data_type": "date"}, + "col11": {"name": "col11", "data_type": "time"}, + "col1_null": { + "name": "col1_null", + "data_type": "bigint", + "precision": 64, + }, + "col2_null": {"name": "col2_null", "data_type": "double"}, + "col3_null": {"name": "col3_null", "data_type": "bool"}, + "col4_null": {"name": "col4_null", "data_type": "timestamp"}, + "col5_null": {"name": "col5_null", "data_type": "text"}, + "col6_null": {"name": "col6_null", "data_type": "decimal"}, + "col7_null": {"name": "col7_null", "data_type": "binary"}, + "col9_null": {"name": "col9_null", "data_type": "json"}, + 
"col10_null": {"name": "col10_null", "data_type": "date"}, + "col11_null": {"name": "col11_null", "data_type": "time"}, + "col1_precision": { + "name": "col1_precision", + "data_type": "bigint", + "precision": 16, + }, + "col4_precision": {"name": "col4_precision", "data_type": "timestamp"}, + "col5_precision": {"name": "col5_precision", "data_type": "text"}, + "col6_precision": {"name": "col6_precision", "data_type": "decimal"}, + "col7_precision": {"name": "col7_precision", "data_type": "binary"}, + "col11_precision": {"name": "col11_precision", "data_type": "time"}, + "_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text"}, + "_dlt_id": {"name": "_dlt_id", "data_type": "text"}, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + }, + }, + SchemaChoice.first, + ), + ( + { + "name": "items", + "columns": { + "_dlt_id": {"data_type": "text", "name": "_dlt_id", "nullable": False}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "c1": { + "data_type": "bigint", + "name": "c1", + "nullable": True, + "precision": 64, + }, + "c2": { + "data_type": "bigint", + "name": "c2", + "nullable": True, + "precision": 64, + }, + "c3": { + "data_type": "bigint", + "name": "c3", + "nullable": True, + "precision": 64, + }, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + }, + }, + { + "name": "items", + "columns": { + "_dlt_id": {"data_type": "text", "name": "_dlt_id", "nullable": False}, + "_dlt_load_id": { + "data_type": "text", + "name": "_dlt_load_id", + "nullable": False, + }, + "c1": { + "data_type": "bigint", + "name": "c1", + "nullable": True, + "precision": 64, + }, + "c2": { + "data_type": "bigint", + "name": "c2", + "nullable": True, + "precision": 64, + }, + "c3": { + "data_type": "bigint", + "name": "c3", + "nullable": True, + "precision": 64, + }, + # Added c4 column + "c4": { + "data_type": "bigint", + "name": "c4", + "nullable": True, + "precision": 64, + }, + "_pg_deleted_ts": { + "data_type": "timestamp", + "name": "_pg_deleted_ts", + "nullable": True, + }, + "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, + }, + }, + SchemaChoice.error, + ), + ( + { + "name": "scale_teams", + "columns": { + "id": { + "name": "id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "user_id": { + "name": "user_id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "begin_at": { + "name": "begin_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "created_at": { + "name": "created_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "updated_at": { + "name": "updated_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "scale_id": { + "name": "scale_id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "team_id": { + "name": "team_id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "comment": {"name": "comment", "nullable": True, "data_type": "text"}, + "old_feedback": { + "name": "old_feedback", + "nullable": True, + "data_type": "text", + }, + "feedback_rating": { + "name": "feedback_rating", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "final_mark": { + "name": "final_mark", + 
"nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "truant_id": { + "name": "truant_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "flag_id": { + "name": "flag_id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "token": {"name": "token", "nullable": True, "data_type": "text"}, + "ip": {"name": "ip", "nullable": True, "data_type": "text"}, + "internship_id": { + "name": "internship_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "filled_at": { + "name": "filled_at", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_lsn": {"name": "_pg_lsn", "nullable": True, "data_type": "bigint"}, + "_pg_deleted_ts": { + "name": "_pg_deleted_ts", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_commit_ts": { + "name": "_pg_commit_ts", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_tx_id": { + "name": "_pg_tx_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "_dlt_load_id": { + "name": "_dlt_load_id", + "data_type": "text", + "nullable": False, + }, + }, + }, + { + "name": "scale_teams", + "columns": { + "id": { + "name": "id", + "nullable": False, + "data_type": "bigint", + "precision": 32, + }, + "user_id": { + "name": "user_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "begin_at": { + "name": "begin_at", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "created_at": { + "name": "created_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "updated_at": { + "name": "updated_at", + "nullable": False, + "data_type": "timestamp", + "precision": 6, + }, + "scale_id": { + "name": "scale_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "team_id": { + "name": "team_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "comment": {"name": "comment", "nullable": True, "data_type": "text"}, + "old_feedback": { + "name": "old_feedback", + "nullable": True, + "data_type": "text", + }, + "feedback_rating": { + "name": "feedback_rating", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "final_mark": { + "name": "final_mark", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "truant_id": { + "name": "truant_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "flag_id": { + "name": "flag_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "token": {"name": "token", "nullable": True, "data_type": "text"}, + "ip": {"name": "ip", "nullable": True, "data_type": "text"}, + "internship_id": { + "name": "internship_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "filled_at": { + "name": "filled_at", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_lsn": {"name": "_pg_lsn", "nullable": True, "data_type": "bigint"}, + "_pg_deleted_ts": { + "name": "_pg_deleted_ts", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_commit_ts": { + "name": "_pg_commit_ts", + "nullable": True, + "data_type": "timestamp", + "precision": 6, + }, + "_pg_tx_id": { + "name": "_pg_tx_id", + "nullable": True, + "data_type": "bigint", + "precision": 32, + }, + "_dlt_load_id": { + "name": "_dlt_load_id", + "data_type": "text", + "nullable": False, + }, + }, + }, + SchemaChoice.second, + ), +] diff --git a/tests/pg_legacy_replication/test_helpers.py 
b/tests/pg_legacy_replication/test_helpers.py index cfd813966..b93d2486d 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -2,14 +2,19 @@ from dlt.common.schema.typing import TTableSchema from dlt.common.typing import TDataItem from google.protobuf.json_format import ParseDict as parse_dict - from sources.pg_legacy_replication.helpers import ( - infer_table_schema, - gen_data_item, compare_schemas, + gen_data_item, + infer_table_schema, ) from sources.pg_legacy_replication.pg_logicaldec_pb2 import RowMessage -from .cases import ROW_MESSAGES, DATA_ITEMS, TABLE_SCHEMAS +from .cases import ( + DATA_ITEMS, + ROW_MESSAGES, + SIMILAR_SCHEMAS, + TABLE_SCHEMAS, + SchemaChoice, +) @pytest.mark.parametrize("data, expected_schema", zip(ROW_MESSAGES, TABLE_SCHEMAS)) @@ -46,204 +51,14 @@ def test_gen_data_item(data, data_item: TDataItem): ) -def test_compare_schemas(): - s1: TTableSchema = { - "name": "items", - "columns": { - "col1": { - "name": "col1", - "data_type": "bigint", - "precision": 64, - "nullable": False, - }, - "col2": {"name": "col2", "data_type": "double", "nullable": False}, - "col3": {"name": "col3", "data_type": "bool", "nullable": False}, - "col4": {"name": "col4", "data_type": "timestamp", "nullable": False}, - "col5": {"name": "col5", "data_type": "text", "nullable": False}, - "col6": { - "name": "col6", - "data_type": "decimal", - "precision": 38, - "scale": 9, - "nullable": False, - }, - "col7": {"name": "col7", "data_type": "binary", "nullable": False}, - "col9": {"name": "col9", "data_type": "json", "nullable": False}, - "col10": {"name": "col10", "data_type": "date", "nullable": False}, - "col11": {"name": "col11", "data_type": "time", "nullable": False}, - "col1_null": { - "name": "col1_null", - "data_type": "bigint", - "precision": 64, - "nullable": True, - }, - "col2_null": {"name": "col2_null", "data_type": "double", "nullable": True}, - "col3_null": {"name": "col3_null", "data_type": "bool", "nullable": True}, - "col4_null": { - "name": "col4_null", - "data_type": "timestamp", - "nullable": True, - }, - "col5_null": {"name": "col5_null", "data_type": "text", "nullable": True}, - "col6_null": { - "name": "col6_null", - "data_type": "decimal", - "precision": 38, - "scale": 9, - "nullable": True, - }, - "col7_null": {"name": "col7_null", "data_type": "binary", "nullable": True}, - "col9_null": { - "name": "col9_null", - "data_type": "json", - "nullable": True, - }, - "col10_null": {"name": "col10_null", "data_type": "date", "nullable": True}, - "col11_null": {"name": "col11_null", "data_type": "time", "nullable": True}, - "col1_precision": { - "name": "col1_precision", - "data_type": "bigint", - "precision": 16, - "nullable": False, - }, - "col4_precision": { - "name": "col4_precision", - "data_type": "timestamp", - "precision": 3, - "nullable": False, - }, - "col5_precision": { - "name": "col5_precision", - "data_type": "text", - "precision": 25, - "nullable": False, - }, - "col6_precision": { - "name": "col6_precision", - "data_type": "decimal", - "precision": 6, - "scale": 2, - "nullable": False, - }, - "col7_precision": { - "name": "col7_precision", - "data_type": "binary", - "nullable": False, - }, - "col11_precision": { - "name": "col11_precision", - "data_type": "time", - "precision": 3, - "nullable": False, - }, - "_dlt_load_id": { - "name": "_dlt_load_id", - "data_type": "text", - "nullable": False, - }, - "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False}, - "_pg_lsn": {"data_type": 
"bigint", "name": "_pg_lsn", "nullable": True}, - "_pg_deleted_ts": { - "data_type": "timestamp", - "name": "_pg_deleted_ts", - "nullable": True, - }, - }, - } - s2: TTableSchema = { - "name": "items", - "columns": { - "col1": { - "name": "col1", - "data_type": "bigint", - "precision": 64, - "nullable": False, - }, - "col2": {"name": "col2", "data_type": "double"}, - "col3": {"name": "col3", "data_type": "bool"}, - "col4": {"name": "col4", "data_type": "timestamp"}, - "col5": {"name": "col5", "data_type": "text"}, - "col6": {"name": "col6", "data_type": "decimal"}, - "col7": {"name": "col7", "data_type": "binary"}, - "col9": {"name": "col9", "data_type": "json"}, - "col10": {"name": "col10", "data_type": "date"}, - "col11": {"name": "col11", "data_type": "time"}, - "col1_null": {"name": "col1_null", "data_type": "bigint", "precision": 64}, - "col2_null": {"name": "col2_null", "data_type": "double"}, - "col3_null": {"name": "col3_null", "data_type": "bool"}, - "col4_null": {"name": "col4_null", "data_type": "timestamp"}, - "col5_null": {"name": "col5_null", "data_type": "text"}, - "col6_null": {"name": "col6_null", "data_type": "decimal"}, - "col7_null": {"name": "col7_null", "data_type": "binary"}, - "col9_null": {"name": "col9_null", "data_type": "json"}, - "col10_null": {"name": "col10_null", "data_type": "date"}, - "col11_null": {"name": "col11_null", "data_type": "time"}, - "col1_precision": { - "name": "col1_precision", - "data_type": "bigint", - "precision": 16, - }, - "col4_precision": {"name": "col4_precision", "data_type": "timestamp"}, - "col5_precision": {"name": "col5_precision", "data_type": "text"}, - "col6_precision": {"name": "col6_precision", "data_type": "decimal"}, - "col7_precision": {"name": "col7_precision", "data_type": "binary"}, - "col11_precision": {"name": "col11_precision", "data_type": "time"}, - "_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text"}, - "_dlt_id": {"name": "_dlt_id", "data_type": "text"}, - "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, - "_pg_deleted_ts": { - "data_type": "timestamp", - "name": "_pg_deleted_ts", - "nullable": True, - }, - }, - } - assert compare_schemas(s1, s2) == s1 - assert compare_schemas(s2, s1) == s1 - - s1 = { - "columns": { - "_dlt_id": {"data_type": "text", "name": "_dlt_id", "nullable": False}, - "_dlt_load_id": { - "data_type": "text", - "name": "_dlt_load_id", - "nullable": False, - }, - "c1": { - "data_type": "bigint", - "name": "c1", - "nullable": True, - "precision": 64, - }, - "c2": { - "data_type": "bigint", - "name": "c2", - "nullable": True, - "precision": 64, - }, - "c3": { - "data_type": "bigint", - "name": "c3", - "nullable": True, - "precision": 64, - }, - "_pg_deleted_ts": { - "data_type": "timestamp", - "name": "_pg_deleted_ts", - "nullable": True, - }, - "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, - }, - "name": "items", - } - from copy import deepcopy - - s2 = deepcopy(s1) - s2["columns"]["c4"] = { - "data_type": "bigint", - "name": "c4", - "nullable": True, - "precision": 64, - } - with pytest.raises(AssertionError): - compare_schemas(s1, s2) +@pytest.mark.parametrize("s1, s2, choice", SIMILAR_SCHEMAS) +def test_compare_schemas(s1: TTableSchema, s2: TTableSchema, choice: SchemaChoice): + if choice == SchemaChoice.error: + with pytest.raises(AssertionError): + compare_schemas(s1, s2) + with pytest.raises(AssertionError): + compare_schemas(s2, s1) + else: + expected_schema = (s1, s2)[choice] + assert compare_schemas(s1, s2) == 
expected_schema + assert compare_schemas(s2, s1) == expected_schema From 129b18afcf54d468db2efbf5cbf64d3a59306484 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 29 Jan 2025 11:58:13 +0100 Subject: [PATCH 83/88] fix: rolling back on managing conn lifecycle using context mgrs: it doesn't work as per https://www.psycopg.org/docs/usage.html#with-statement --- sources/pg_legacy_replication/helpers.py | 37 ++++++++++++++---------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index a821cd548..33f2ac2bc 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -160,8 +160,8 @@ def get_max_lsn(credentials: ConnectionStringCredentials) -> Optional[int]: Returns None if the replication slot is empty. Does not consume the slot, i.e. messages are not flushed. """ - with _get_conn(credentials) as conn: - cur = conn.cursor() + cur = _get_conn(credentials).cursor() + try: loc_fn = ( "pg_current_xlog_location" if get_pg_version(cur) < 100000 @@ -171,6 +171,8 @@ def get_max_lsn(credentials: ConnectionStringCredentials) -> Optional[int]: cur.execute(f"SELECT {loc_fn}() - '0/0' as max_lsn;") lsn: int = cur.fetchone()[0] return lsn + finally: + cur.connection.close() def lsn_int_to_hex(lsn: int) -> str: @@ -192,13 +194,15 @@ def advance_slot( the behavior of that method seems odd when used outside of `consume_stream`. """ assert upto_lsn > 0 - with _get_conn(credentials) as conn: - cur = conn.cursor() + cur = _get_conn(credentials).cursor() + try: # There is unfortunately no way in pg9.6 to manually advance the replication slot if get_pg_version(cur) > 100000: cur.execute( f"SELECT * FROM pg_replication_slot_advance('{slot_name}', '{lsn_int_to_hex(upto_lsn)}');" ) + finally: + cur.connection.close() def _get_conn( @@ -382,19 +386,20 @@ def __iter__(self) -> Iterator[TableItems]: Maintains LSN of last consumed commit message in object state. Advances the slot only when all messages have been consumed. 
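
The rationale for this rollback is a psycopg2 behaviour worth spelling out: using a connection as a context manager only wraps a transaction (commit on success, rollback on error) and leaves the connection open, which is why the explicit `try/finally` with `connection.close()` returns here. A small illustration with a placeholder DSN:

```python
from contextlib import closing

import psycopg2

dsn = "dbname=src user=loader"  # placeholder DSN

# `with conn:` wraps a transaction; the connection stays open afterwards.
conn = psycopg2.connect(dsn)
with conn:
    with conn.cursor() as cur:
        cur.execute("SELECT 1")
assert conn.closed == 0  # still open and usable here
conn.close()

# Close-on-exit requires try/finally (as in this patch) or contextlib.closing.
with closing(psycopg2.connect(dsn)) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT 1")
assert conn.closed != 0
```

A later patch in this series switches these helpers to `contextlib.closing`, which is the stdlib way to get genuine close-on-exit semantics without reintroducing the `with conn:` transaction wrapper.
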
""" - with get_rep_conn(self.credentials) as conn: - cur = conn.cursor() + cur = get_rep_conn(self.credentials).cursor() + consumer = MessageConsumer( + upto_lsn=self.upto_lsn, + table_qnames=self.table_qnames, + repl_options=self.repl_options, + target_batch_size=self.target_batch_size, + ) + try: cur.start_replication(slot_name=self.slot_name, start_lsn=self.start_lsn) - consumer = MessageConsumer( - upto_lsn=self.upto_lsn, - table_qnames=self.table_qnames, - repl_options=self.repl_options, - target_batch_size=self.target_batch_size, - ) - try: - cur.consume_stream(consumer) - except StopReplication: # completed batch or reached `upto_lsn` - yield from self.flush_batch(cur, consumer) + cur.consume_stream(consumer) + except StopReplication: # completed batch or reached `upto_lsn` + yield from self.flush_batch(cur, consumer) + finally: + cur.connection.close() def flush_batch( self, cur: ReplicationCursor, consumer: MessageConsumer From 9083611e2b3b306e61c759f3e66feec9468f5621 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Thu, 30 Jan 2025 12:18:57 +0100 Subject: [PATCH 84/88] fix: corrected regression with occasional datum_missinng values --- sources/pg_legacy_replication/schema_types.py | 4 +++- tests/pg_legacy_replication/cases.py | 4 ++++ tests/pg_legacy_replication/conftest.py | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 355dd31d4..4f1b3477a 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -217,4 +217,6 @@ def safe_load(x: str) -> Any: def _get_datum_attr(val: DatumMessage) -> Optional[str]: datum = val.WhichOneof("datum") - return None if datum is None and datum == "datum_missing" else datum + if datum is None or datum == "datum_missing": + return None + return datum diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index e55456c83..e078dcb6c 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -185,6 +185,7 @@ }, {"columnName": "col12", "columnType": 1114}, {"columnName": "col13", "columnType": 700}, + {"columnName": "col14", "columnType": 1043, "datum_missing": True}, ], "newTypeinfo": [ {"modifier": "timestamp with time zone", "valueOptional": False}, @@ -193,6 +194,7 @@ {"modifier": "time without time zone", "valueOptional": False}, {"modifier": "timestamp without time zone", "valueOptional": True}, {"modifier": "real", "valueOptional": True}, + {"modifier": "character varying", "valueOptional": True}, ], }, { @@ -331,6 +333,7 @@ "col11": pendulum.parse("13:26:45.176451", strict=False).time(), "col12": None, "col13": None, + "col14": None, "_pg_lsn": 1, "_pg_commit_ts": pendulum.parse("2024-10-21T09:37:03.666542+00:00"), "_pg_tx_id": 2018, @@ -412,6 +415,7 @@ "col11": {"data_type": "time", "name": "col11", "nullable": False}, "col12": {"data_type": "timestamp", "name": "col12", "nullable": True}, "col13": {"data_type": "double", "name": "col13", "nullable": True}, + "col14": {"data_type": "text", "name": "col14", "nullable": True}, "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, "_pg_deleted_ts": { "data_type": "timestamp", diff --git a/tests/pg_legacy_replication/conftest.py b/tests/pg_legacy_replication/conftest.py index b3d5372ae..dcd1a0f16 100644 --- a/tests/pg_legacy_replication/conftest.py +++ b/tests/pg_legacy_replication/conftest.py @@ -1,3 +1,4 @@ +import faulthandler 
import pytest from typing import Iterator, Tuple @@ -6,6 +7,10 @@ from dlt.common.utils import uniq_id +def pytest_configure(): + faulthandler.enable() + + @pytest.fixture() def src_config() -> Iterator[Tuple[dlt.Pipeline, str]]: # random slot to enable parallel runs From 864b746ca1bee730a9f69656e008ff48dfa4000e Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Fri, 7 Feb 2025 15:08:42 +0100 Subject: [PATCH 85/88] fix: add support for ordinary json pg_type --- sources/pg_legacy_replication/schema_types.py | 2 ++ tests/pg_legacy_replication/cases.py | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 4f1b3477a..db98093aa 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -34,6 +34,7 @@ 21: "smallint", 23: "integer", 25: "text", + 114: "json", 700: "real", 701: "double precision", 1043: "character varying", @@ -47,6 +48,7 @@ """Maps postgres type OID to type string.""" _MISSING_TYPES: Dict[str, TDataType] = { + "json": "json", "real": "double", "text": "text", "timestamp without time zone": "timestamp", diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index e078dcb6c..6fcd522e6 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -268,7 +268,11 @@ "table": "src_pl_dataset_202501140458116348.data_types", "op": "INSERT", "newTuple": [ - {"columnName": "bit_col", "columnType": 1560, "datumString": "1"}, + { + "columnName": "bit_col", + "columnType": 1560, + "datumString": "1", + }, { "columnName": "box_col", "columnType": 603, @@ -286,6 +290,11 @@ b'{"Network administration",GNS3,BGP}' ).decode(), }, + { + "columnName": "json_col", + "columnType": 114, + "datum_string": '{"a":[null,1]}', + }, ], "newTypeinfo": [ { @@ -304,6 +313,10 @@ "modifier": "text[]", "valueOptional": True, }, + { + "modifier": "json", + "valueOptional": True, + }, ], "oldTuple": [], }, @@ -363,6 +376,7 @@ "box_col": "KDEsMSksKDAsMCk=", "uuid_col": "6e1f5de1-1093-4bfe-98e4-62ac56b2db54", "text_a": ["Network administration", "GNS3", "BGP"], + "json_col": {"a": [None, 1]}, "_pg_lsn": 1, "_pg_commit_ts": pendulum.parse("2025-01-14T16:58:12.023448+00:00"), "_pg_tx_id": 754, @@ -478,6 +492,7 @@ "box_col": {"data_type": "text", "name": "box_col", "nullable": True}, "uuid_col": {"data_type": "text", "name": "uuid_col", "nullable": True}, "text_a": {"data_type": "json", "name": "text_a", "nullable": True}, + "json_col": {"data_type": "json", "name": "json_col", "nullable": True}, "_pg_lsn": {"data_type": "bigint", "name": "_pg_lsn", "nullable": True}, "_pg_deleted_ts": { "data_type": "timestamp", From a591618e8a68248fc663672c370cfe7db28aba94 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Wed, 19 Feb 2025 21:11:51 +0100 Subject: [PATCH 86/88] fix: various fixes of bugs encountered during production - No longer inferring schemas if the first row_msg in a batch is a DELETE operation - Instead uses sqlalchemy to reflect the schema - In pg9.6, pg_current_xlog_location wasn't reliable which would cause the message consumer to hang until new data was flushed to WAL - Doesn't fix but was the cause for https://github.com/dlt-hub/dlt/issues/2229 (was able to reproduce in the added test case) - Some minor refactoring --- sources/pg_legacy_replication/__init__.py | 10 +- sources/pg_legacy_replication/helpers.py | 169 ++++++++++++------ 
sources/pg_legacy_replication/schema_types.py | 25 +-- tests/pg_legacy_replication/test_helpers.py | 27 +-- .../test_pg_replication.py | 63 +++++++ 5 files changed, 208 insertions(+), 86 deletions(-) diff --git a/sources/pg_legacy_replication/__init__.py b/sources/pg_legacy_replication/__init__.py index 1968e7883..aaf1f73ed 100644 --- a/sources/pg_legacy_replication/__init__.py +++ b/sources/pg_legacy_replication/__init__.py @@ -94,7 +94,7 @@ def replication_resource(slot_name: str) -> Iterable[TDataItem]: advance_slot(start_lsn, slot_name, credentials) # continue until last message in replication slot - upto_lsn = get_max_lsn(credentials) + upto_lsn = get_max_lsn(credentials, slot_name) if upto_lsn is None: return @@ -182,10 +182,10 @@ def init_replication( - When `take_snapshots` is `True`, the function configures a snapshot isolation level for consistent table snapshots. """ rep_conn = get_rep_conn(credentials) - rep_cur = rep_conn.cursor() - if reset: - drop_replication_slot(slot_name, rep_cur) - slot = create_replication_slot(slot_name, rep_cur) + with rep_conn.cursor() as rep_cur: + if reset: + drop_replication_slot(slot_name, rep_cur) + slot = create_replication_slot(slot_name, rep_cur) # Close connection if no snapshots are needed if not take_snapshots: diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 33f2ac2bc..10263feb1 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -1,6 +1,7 @@ import hashlib from collections import defaultdict from dataclasses import dataclass, field +from functools import partial from typing import ( Any, Callable, @@ -36,6 +37,7 @@ arrow_helpers as arrow, engine_from_credentials, ) +from dlt.sources.sql_database.schema_types import sqla_col_to_column_schema from psycopg2.extensions import connection as ConnectionExt, cursor from psycopg2.extras import ( LogicalReplicationConnection, @@ -153,26 +155,34 @@ def drop_replication_slot(name: str, cur: ReplicationCursor) -> None: ) -def get_max_lsn(credentials: ConnectionStringCredentials) -> Optional[int]: +def get_max_lsn( + credentials: ConnectionStringCredentials, slot_name: str +) -> Optional[int]: """ Returns maximum Log Sequence Number (LSN). Returns None if the replication slot is empty. Does not consume the slot, i.e. messages are not flushed. """ - cur = _get_conn(credentials).cursor() + conn = _get_conn(credentials) try: - loc_fn = ( - "pg_current_xlog_location" - if get_pg_version(cur) < 100000 - else "pg_current_wal_lsn" - ) - # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) - cur.execute(f"SELECT {loc_fn}() - '0/0' as max_lsn;") - lsn: int = cur.fetchone()[0] - return lsn + with conn.cursor() as cur: + pg_version = get_pg_version(cur) + lsn_field = "lsn" if pg_version >= 100000 else "location" + # subtract '0/0' to convert pg_lsn type to int (https://stackoverflow.com/a/73738472) + cur.execute( + f""" + SELECT {lsn_field} - '0/0' AS max_lsn + FROM pg_logical_slot_peek_binary_changes(%s, NULL, NULL) + ORDER BY {lsn_field} DESC + LIMIT 1; + """, + (slot_name,), + ) + row = cur.fetchone() + return row[0] if row else None # type: ignore[no-any-return] finally: - cur.connection.close() + conn.close() def lsn_int_to_hex(lsn: int) -> str: @@ -194,15 +204,16 @@ def advance_slot( the behavior of that method seems odd when used outside of `consume_stream`. 
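
The switch to `pg_logical_slot_peek_binary_changes` above relies on peeking reporting the pending changes and their LSNs without consuming the slot, unlike the `get` variant which advances the slot as it reads. A hedged sketch of the difference, assuming an existing slot named `test_slot` and Postgres 10+ column names (the patch itself falls back to `location` on 9.6):

```python
import psycopg2

conn = psycopg2.connect("dbname=src user=replicator")  # placeholder DSN
try:
    with conn.cursor() as cur:
        # Peek: list pending changes; the slot's position is untouched, so
        # running this twice returns the same rows.
        cur.execute(
            "SELECT lsn - '0/0' AS lsn_int"
            " FROM pg_logical_slot_peek_binary_changes(%s, NULL, NULL)",
            ("test_slot",),
        )
        pending_lsns = [row[0] for row in cur.fetchall()]

        # Get: returns the same rows but consumes them, advancing the slot:
        # pg_logical_slot_get_binary_changes(%s, NULL, NULL)
finally:
    conn.close()
```
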
""" assert upto_lsn > 0 - cur = _get_conn(credentials).cursor() + conn = _get_conn(credentials) try: - # There is unfortunately no way in pg9.6 to manually advance the replication slot - if get_pg_version(cur) > 100000: - cur.execute( - f"SELECT * FROM pg_replication_slot_advance('{slot_name}', '{lsn_int_to_hex(upto_lsn)}');" - ) + with conn.cursor() as cur: + # There is unfortunately no way in pg9.6 to manually advance the replication slot + if get_pg_version(cur) > 100000: + cur.execute( + f"SELECT * FROM pg_replication_slot_advance('{slot_name}', '{lsn_int_to_hex(upto_lsn)}');" + ) finally: - cur.connection.close() + conn.close() def _get_conn( @@ -243,11 +254,13 @@ class MessageConsumer: def __init__( self, + credentials: ConnectionStringCredentials, upto_lsn: int, table_qnames: Set[str], repl_options: DefaultDict[str, ReplicationOptions], target_batch_size: int = 1000, ) -> None: + self.credentials = credentials self.upto_lsn = upto_lsn self.table_qnames = table_qnames self.target_batch_size = target_batch_size @@ -280,15 +293,22 @@ def process_msg(self, msg: ReplicationMessage) -> None: row_msg = RowMessage() try: row_msg.ParseFromString(msg.payload) + lsn = msg.data_start assert row_msg.op != Op.UNKNOWN, f"Unsupported operation : {row_msg}" + logger.debug( + "op: %s, current lsn: %s, max lsn: %s", + Op.Name(row_msg.op), + lsn, + self.upto_lsn, + ) if row_msg.op == Op.BEGIN: # self.last_commit_ts = _epoch_micros_to_datetime(row_msg.commit_time) pass elif row_msg.op == Op.COMMIT: - self.process_commit(lsn=msg.data_start) + self.process_commit(lsn=lsn) else: # INSERT, UPDATE or DELETE - self.process_change(row_msg, lsn=msg.data_start) + self.process_change(row_msg, lsn=lsn) except StopReplication: raise except Exception: @@ -317,18 +337,23 @@ def process_change(self, msg: RowMessage, lsn: int) -> None: if msg.table not in self.table_qnames: return table_name = msg.table.split(".")[1] - table_schema = self.get_table_schema(msg, table_name) + table_schema = self.get_table_schema(msg) data_item = gen_data_item( msg, table_schema["columns"], lsn, **self.repl_options[table_name] ) self.data_items[table_name].append(data_item) - def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: + def get_table_schema(self, msg: RowMessage) -> TTableSchema: """Given a row message, calculates or fetches a table schema.""" + schema, table_name = msg.table.split(".") last_schema = self.last_table_schema.get(table_name) - # Used cached schema if the operation is a DELETE since the inferred one will always be less precise - if msg.op == Op.DELETE and last_schema: + # Used cached schema if the operation is a DELETE + if msg.op == Op.DELETE: + if last_schema is None: + # If absent than reflect it using sqlalchemy + last_schema = self._fetch_table_schema_with_sqla(schema, table_name) + self.last_table_schema[table_name] = last_schema return last_schema # Return cached schema if hash matches @@ -336,7 +361,7 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: if current_hash == self.last_table_hashes.get(table_name): return self.last_table_schema[table_name] - new_schema = infer_table_schema(msg, **self.repl_options[table_name]) + new_schema = infer_table_schema(msg, self.repl_options[table_name]) if last_schema is None: # Cache the inferred schema and hash if it is not already cached self.last_table_schema[table_name] = new_schema @@ -351,6 +376,33 @@ def get_table_schema(self, msg: RowMessage, table_name: str) -> TTableSchema: return new_schema + def 
_fetch_table_schema_with_sqla( + self, schema: str, table_name: str + ) -> TTableSchema: + """Last resort function used to fetch the table schema from the database""" + engine = engine_from_credentials(self.credentials) + to_col_schema = partial( + sqla_col_to_column_schema, reflection_level="full_with_precision" + ) + try: + metadata = MetaData(schema=schema) + table = Table(table_name, metadata, autoload_with=engine) + options = self.repl_options[table_name] + included_columns = options.get("included_columns") + columns = { + col["name"]: col + for c in table.columns + if (col := to_col_schema(c)) is not None + and (not included_columns or c.name in included_columns) + } + + return TTableSchema( + name=table_name, + columns=add_replication_columns(columns, **options), + ) + finally: + engine.dispose() + def hash_typeinfo(new_typeinfo: Sequence[TypeInfo]) -> int: """Generate a hash for the entire new_typeinfo list by hashing each TypeInfo message.""" @@ -386,20 +438,30 @@ def __iter__(self) -> Iterator[TableItems]: Maintains LSN of last consumed commit message in object state. Advances the slot only when all messages have been consumed. """ - cur = get_rep_conn(self.credentials).cursor() + conn = get_rep_conn(self.credentials) consumer = MessageConsumer( + credentials=self.credentials, upto_lsn=self.upto_lsn, table_qnames=self.table_qnames, repl_options=self.repl_options, target_batch_size=self.target_batch_size, ) + + cur = conn.cursor() try: cur.start_replication(slot_name=self.slot_name, start_lsn=self.start_lsn) cur.consume_stream(consumer) except StopReplication: # completed batch or reached `upto_lsn` yield from self.flush_batch(cur, consumer) finally: - cur.connection.close() + logger.debug( + "Closing connection... last_commit_lsn: %s, generated_all: %s, feedback_ts: %s", + self.last_commit_lsn, + self.generated_all, + cur.feedback_timestamp, + ) + cur.close() + conn.close() def flush_batch( self, cur: ReplicationCursor, consumer: MessageConsumer @@ -489,65 +551,68 @@ def emit_arrow_table( ) -def infer_table_schema( - msg: RowMessage, - include_lsn: bool = True, - include_deleted_ts: bool = True, - include_commit_ts: bool = False, - include_tx_id: bool = False, - included_columns: Optional[Set[str]] = None, - **_: Any, -) -> TTableSchema: +def infer_table_schema(msg: RowMessage, options: ReplicationOptions) -> TTableSchema: """Infers the table schema from the replication message and optional hints.""" # Choose the correct source based on operation type - is_change = msg.op != Op.DELETE - tuples = msg.new_tuple if is_change else msg.old_tuple - schema = TTableSchema(name=msg.table.split(".")[1]) - - # Filter and map columns, conditionally using `new_typeinfo` when available - schema["columns"] = { + assert msg.op != Op.DELETE + included_columns = options.get("included_columns") + columns = { col_name: _to_dlt_column_schema( - col_name, datum=col, type_info=msg.new_typeinfo[i] if is_change else None + col_name, datum=col, type_info=msg.new_typeinfo[i] ) - for i, col in enumerate(tuples) + for i, col in enumerate(msg.new_tuple) if (col_name := _actual_column_name(col)) and (not included_columns or col_name in included_columns) } - # Add replication columns + return TTableSchema( + name=msg.table.split(".")[1], + columns=add_replication_columns(columns, **options), + ) + + +def add_replication_columns( + columns: TTableSchemaColumns, + *, + include_lsn: bool = True, + include_deleted_ts: bool = True, + include_commit_ts: bool = False, + include_tx_id: bool = False, + **_: Any, +) -> 
TTableSchemaColumns: if include_lsn: - schema["columns"]["_pg_lsn"] = { + columns["_pg_lsn"] = { "data_type": "bigint", "name": "_pg_lsn", "nullable": True, } if include_deleted_ts: - schema["columns"]["_pg_deleted_ts"] = { + columns["_pg_deleted_ts"] = { "data_type": "timestamp", "name": "_pg_deleted_ts", "nullable": True, } if include_commit_ts: - schema["columns"]["_pg_commit_ts"] = { + columns["_pg_commit_ts"] = { "data_type": "timestamp", "name": "_pg_commit_ts", "nullable": True, } if include_tx_id: - schema["columns"]["_pg_tx_id"] = { + columns["_pg_tx_id"] = { "data_type": "bigint", "name": "_pg_tx_id", "nullable": True, "precision": 32, } - - return schema + return columns def gen_data_item( msg: RowMessage, column_schema: TTableSchemaColumns, lsn: int, + *, include_lsn: bool = True, include_deleted_ts: bool = True, include_commit_ts: bool = False, diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index db98093aa..5b9fbdb64 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -84,14 +84,14 @@ def _get_precision_and_scale( - type_id: int, modifier: Optional[str] + type_id: int, modifier: str ) -> Tuple[Optional[int], Optional[int]]: """Get precision from postgres type attributes and modifiers.""" if type_id in _FIXED_PRECISION_TYPES: return _FIXED_PRECISION_TYPES[type_id] - # If modifier or pattern is missing, return defaults - if not modifier or (pattern := _VARYING_PRECISION_PATTERNS.get(type_id)) is None: + # If pattern is missing, return defaults + if (pattern := _VARYING_PRECISION_PATTERNS.get(type_id)) is None: return None, None if match := re.search(pattern, modifier): @@ -110,7 +110,7 @@ def _type_mapper() -> PostgresTypeMapper: return PostgresTypeMapper(postgres().capabilities()) -def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: +def _to_dlt_column_type(type_id: int, modifier: str) -> TColumnType: """ Converts postgres type OID to dlt column type. 
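
For context on the `modifier` handling above: decoderbufs reports the human-readable type name (e.g. `character varying(25)`, `numeric(6,2)`), and precision/scale are recovered from it with per-type regex patterns. A simplified stand-in for what `_VARYING_PRECISION_PATTERNS` and `_get_precision_and_scale` do — the real table is keyed by type OID and uses stricter patterns:

```python
import re
from typing import Optional, Tuple

_PAREN_ARGS = re.compile(r"\((\d+)(?:\s*,\s*(\d+))?\)")


def parse_precision_scale(modifier: str) -> Tuple[Optional[int], Optional[int]]:
    """Extract (precision, scale) from a textual type modifier, if present."""
    match = _PAREN_ARGS.search(modifier)
    if match is None:
        return None, None
    precision, scale = match.groups()
    return int(precision), int(scale) if scale is not None else None


assert parse_precision_scale("numeric(6,2)") == (6, 2)
assert parse_precision_scale("character varying(25)") == (25, None)
assert parse_precision_scale("text") == (None, None)
```
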
@@ -119,7 +119,7 @@ def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: pg_type = _PG_TYPES.get(type_id) if pg_type in _MISSING_TYPES: return {"data_type": _MISSING_TYPES[pg_type]} - if modifier and modifier.endswith("[]"): + if modifier.endswith("[]"): return {"data_type": "json"} if pg_type is None: logger.warning( @@ -132,22 +132,15 @@ def _to_dlt_column_type(type_id: int, modifier: Optional[str]) -> TColumnType: def _to_dlt_column_schema( - col_name: str, datum: DatumMessage, type_info: Optional[TypeInfo] + col_name: str, datum: DatumMessage, type_info: TypeInfo ) -> TColumnSchema: """Converts decoderbuf's datum value/typeinfo to dlt column schema.""" - column_schema: TColumnSchema = { + return { "name": col_name, - **_to_dlt_column_type( - datum.column_type, type_info.modifier if type_info else None - ), + "nullable": type_info.value_optional, + **_to_dlt_column_type(datum.column_type, type_info.modifier), } - # Set nullable attribute if type_info is available - if type_info: - column_schema["nullable"] = type_info.value_optional - - return column_schema - def _epoch_micros_to_datetime(microseconds_since_1970: int) -> pendulum.DateTime: return pendulum.from_timestamp(microseconds_since_1970 / 1_000_000) diff --git a/tests/pg_legacy_replication/test_helpers.py b/tests/pg_legacy_replication/test_helpers.py index b93d2486d..bd698d3f3 100644 --- a/tests/pg_legacy_replication/test_helpers.py +++ b/tests/pg_legacy_replication/test_helpers.py @@ -2,12 +2,14 @@ from dlt.common.schema.typing import TTableSchema from dlt.common.typing import TDataItem from google.protobuf.json_format import ParseDict as parse_dict + +from sources.pg_legacy_replication import ReplicationOptions from sources.pg_legacy_replication.helpers import ( compare_schemas, gen_data_item, infer_table_schema, ) -from sources.pg_legacy_replication.pg_logicaldec_pb2 import RowMessage +from sources.pg_legacy_replication.pg_logicaldec_pb2 import Op, RowMessage from .cases import ( DATA_ITEMS, ROW_MESSAGES, @@ -24,25 +26,24 @@ def test_infer_table_schema( ): row_msg = RowMessage() parse_dict(data, row_msg) - assert ( - infer_table_schema( - row_msg, - include_commit_ts=True, - include_tx_id=True, - ) - == expected_schema - ) + options = ReplicationOptions(include_commit_ts=True, include_tx_id=True) + if row_msg.op == Op.DELETE: + with pytest.raises(AssertionError): + infer_table_schema(row_msg, options) + else: + assert infer_table_schema(row_msg, options) == expected_schema -@pytest.mark.parametrize("data, data_item", zip(ROW_MESSAGES, DATA_ITEMS)) -def test_gen_data_item(data, data_item: TDataItem): +@pytest.mark.parametrize( + "data, data_item, schema", zip(ROW_MESSAGES, DATA_ITEMS, TABLE_SCHEMAS) +) +def test_gen_data_item(data, data_item: TDataItem, schema: TTableSchema): row_msg = RowMessage() parse_dict(data, row_msg) - column_schema = infer_table_schema(row_msg)["columns"] assert ( gen_data_item( row_msg, - column_schema, + schema["columns"], lsn=1, include_commit_ts=True, include_tx_id=True, diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index ca103dd1c..b3bd6e524 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -744,3 +744,66 @@ def test_batching(src_config: Tuple[dlt.Pipeline, str], backend: TableBackend) - src_pl.run(batch, table_name="items") extract_info = dest_pl.extract(changes) assert extract_info.asdict()["job_metrics"][0]["items_count"] == 
100 + + +@pytest.mark.parametrize("destination_name", ALL_DESTINATIONS) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow"]) +def test_delete_schema_bug( + src_config: Tuple[dlt.Pipeline, str], destination_name: str, backend: TableBackend +) -> None: + src_pl, slot_name = src_config + + # create postgres table with 100 records + data = [{"id": key, "val": True} for key in range(1, 101)] + src_pl.run(data, table_name="items") + + add_pk(src_pl.sql_client, "items", "id") + + snapshots = init_replication( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("items",), + take_snapshots=True, + table_options={"items": {"backend": backend}}, + ) + + dest_pl = dlt.pipeline( + pipeline_name="dest_pl", destination=destination_name, dev_mode=True + ) + + # initial load + info = dest_pl.run(snapshots) + cleanup_snapshot_resources(snapshots) + assert_load_info(info) + assert load_table_counts(dest_pl, "items") == {"items": 100} + assert_loaded_data(dest_pl, "items", ["id", "val"], data, "id") + + changes = replication_source( + slot_name=slot_name, + schema=src_pl.dataset_name, + table_names=("items",), + target_batch_size=10, + repl_options={"items": {"backend": backend}}, + ) + changes.items.apply_hints( + write_disposition="merge", primary_key="id", columns=merge_hints + ) + + # process changes + info = dest_pl.run(changes) + assert_load_info(info, expected_load_packages=1) + assert load_table_counts(dest_pl, "items") == {"items": 100} + assert_loaded_data(dest_pl, "items", ["id", "val"], data, "id") + + # delete the first 50 rows and update the next 50 rows + with src_pl.sql_client() as c: + qual_name = src_pl.sql_client().make_qualified_table_name("items") + to_delete = ",".join([str(x) for x in range(1, 51)]) + c.execute_sql(f"DELETE FROM {qual_name} WHERE id IN ({to_delete});") + to_update = ",".join([str(x) for x in range(51, 101)]) + c.execute_sql(f"UPDATE {qual_name} SET val = false WHERE id IN ({to_update});") + + # process changes + info = dest_pl.run(changes) + assert_load_info(info, expected_load_packages=2) + assert load_table_counts(dest_pl, "items") == {"items": 50} From 5d6790e83da10cd60910b337217ff55ffe76819f Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Mon, 3 Mar 2025 16:24:18 +0100 Subject: [PATCH 87/88] fix: various fixes related to pyarrow backends - refactor: changing closing semantics for db conns (using contextlib.closing) - fix: arrow schemas sometimes need to use the same reflection level as the initial snapshot - fix: the timezone flag is now treated as a seamless schema migration - fix: aligned precision for fixed integer types to match those inferred from the sql_database source (presumably to account for signed values) - chore: removed the test case that changed the precision of a byte array with pyarrow (absurd to begin with and no longer possible with the new rows_to_arrow implementation) --- sources/pg_legacy_replication/helpers.py | 67 +++++++++---------- sources/pg_legacy_replication/schema_types.py | 6 +- tests/pg_legacy_replication/cases.py | 15 +++-- .../test_pg_replication.py | 5 +- 4 files changed, 47 insertions(+), 46 deletions(-) diff --git a/sources/pg_legacy_replication/helpers.py b/sources/pg_legacy_replication/helpers.py index 10263feb1..8edd98502 100644 --- a/sources/pg_legacy_replication/helpers.py +++ b/sources/pg_legacy_replication/helpers.py @@ -1,5 +1,6 @@ import hashlib from collections import defaultdict +from contextlib import closing from dataclasses import dataclass, field from functools 
import partial from typing import ( @@ -164,8 +165,7 @@ def get_max_lsn( Returns None if the replication slot is empty. Does not consume the slot, i.e. messages are not flushed. """ - conn = _get_conn(credentials) - try: + with closing(_get_conn(credentials)) as conn: with conn.cursor() as cur: pg_version = get_pg_version(cur) lsn_field = "lsn" if pg_version >= 100000 else "location" @@ -181,8 +181,6 @@ def get_max_lsn( ) row = cur.fetchone() return row[0] if row else None # type: ignore[no-any-return] - finally: - conn.close() def lsn_int_to_hex(lsn: int) -> str: @@ -204,16 +202,13 @@ def advance_slot( the behavior of that method seems odd when used outside of `consume_stream`. """ assert upto_lsn > 0 - conn = _get_conn(credentials) - try: + with closing(_get_conn(credentials)) as conn: with conn.cursor() as cur: # There is unfortunately no way in pg9.6 to manually advance the replication slot if get_pg_version(cur) > 100000: cur.execute( f"SELECT * FROM pg_replication_slot_advance('{slot_name}', '{lsn_int_to_hex(upto_lsn)}');" ) - finally: - conn.close() def _get_conn( @@ -371,7 +366,7 @@ def get_table_schema(self, msg: RowMessage) -> TTableSchema: retained_schema = compare_schemas(last_schema, new_schema) self.last_table_schema[table_name] = retained_schema except AssertionError as e: - logger.debug(str(e)) + logger.info(str(e)) raise StopReplication return new_schema @@ -381,13 +376,14 @@ def _fetch_table_schema_with_sqla( ) -> TTableSchema: """Last resort function used to fetch the table schema from the database""" engine = engine_from_credentials(self.credentials) + options = self.repl_options[table_name] to_col_schema = partial( - sqla_col_to_column_schema, reflection_level="full_with_precision" + sqla_col_to_column_schema, + reflection_level=options.get("reflection_level", "full"), ) try: metadata = MetaData(schema=schema) table = Table(table_name, metadata, autoload_with=engine) - options = self.repl_options[table_name] included_columns = options.get("included_columns") columns = { col["name"]: col @@ -427,6 +423,7 @@ class ItemGenerator: start_lsn: int repl_options: DefaultDict[str, ReplicationOptions] target_batch_size: int = 1000 + keepalive_interval: Optional[int] = None last_commit_lsn: Optional[int] = field(default=None, init=False) generated_all: bool = False @@ -438,30 +435,27 @@ def __iter__(self) -> Iterator[TableItems]: Maintains LSN of last consumed commit message in object state. Advances the slot only when all messages have been consumed. """ - conn = get_rep_conn(self.credentials) - consumer = MessageConsumer( - credentials=self.credentials, - upto_lsn=self.upto_lsn, - table_qnames=self.table_qnames, - repl_options=self.repl_options, - target_batch_size=self.target_batch_size, - ) - - cur = conn.cursor() - try: - cur.start_replication(slot_name=self.slot_name, start_lsn=self.start_lsn) - cur.consume_stream(consumer) - except StopReplication: # completed batch or reached `upto_lsn` - yield from self.flush_batch(cur, consumer) - finally: - logger.debug( - "Closing connection... 
last_commit_lsn: %s, generated_all: %s, feedback_ts: %s", - self.last_commit_lsn, - self.generated_all, - cur.feedback_timestamp, - ) - cur.close() - conn.close() + with closing(get_rep_conn(self.credentials)) as rep_conn: + with rep_conn.cursor() as rep_cur: + try: + consumer = MessageConsumer( + credentials=self.credentials, + upto_lsn=self.upto_lsn, + table_qnames=self.table_qnames, + repl_options=self.repl_options, + target_batch_size=self.target_batch_size, + ) + rep_cur.start_replication(self.slot_name, start_lsn=self.start_lsn) + rep_cur.consume_stream(consumer, self.keepalive_interval) + except StopReplication: # completed batch or reached `upto_lsn` + yield from self.flush_batch(rep_cur, consumer) + finally: + logger.debug( + "Closing connection... last_commit_lsn: %s, generated_all: %s, feedback_ts: %s", + self.last_commit_lsn, + self.generated_all, + rep_cur.feedback_timestamp, + ) def flush_batch( self, cur: ReplicationCursor, consumer: MessageConsumer @@ -662,6 +656,7 @@ def _actual_column_name(column: DatumMessage) -> str: "nullable", "precision", "scale", + "timezone", } @@ -703,6 +698,8 @@ def compare_schemas(last: TTableSchema, new: TTableSchema) -> TTableSchema: col_schema["precision"] = s1.get("precision", s2.get("precision")) if "scale" in s1 or "scale" in s2: col_schema["scale"] = s1.get("scale", s2.get("scale")) + if "timezone" in s1 or "timezone" in s2: + col_schema["timezone"] = s1.get("timezone", s2.get("timezone")) # Update with the more detailed schema per column table_schema["columns"][name] = col_schema diff --git a/sources/pg_legacy_replication/schema_types.py b/sources/pg_legacy_replication/schema_types.py index 5b9fbdb64..5caeefa89 100644 --- a/sources/pg_legacy_replication/schema_types.py +++ b/sources/pg_legacy_replication/schema_types.py @@ -67,10 +67,10 @@ """Maps decoderbuf's datum msg type to dlt type.""" _FIXED_PRECISION_TYPES: Dict[int, Tuple[int, Optional[int]]] = { - 21: (16, None), # smallint - 23: (32, None), # integer + 21: (32, None), # smallint + 23: (64, None), # integer 20: (64, None), # bigint - 700: (32, None), # real + 700: (64, None), # real } """Dict for fixed precision types""" diff --git a/tests/pg_legacy_replication/cases.py b/tests/pg_legacy_replication/cases.py index 6fcd522e6..cc217930f 100644 --- a/tests/pg_legacy_replication/cases.py +++ b/tests/pg_legacy_replication/cases.py @@ -40,7 +40,7 @@ "col4_precision": "2022-05-23T13:26:46.167231+00:00", "col5_precision": "string data 2 \n \r \x8e 🦆", "col6_precision": Decimal("2323.34"), - "col7_precision": b"binary data 2 \n \r \x8e", + # "col7_precision": b"binary data 2 \n \r \x8e", # FIXME This is no longer possible in pyarrow and it's absurd to begin with "col11_precision": "13:26:45.176451", } TABLE_UPDATE: List[TColumnSchema] = [ @@ -86,12 +86,12 @@ "scale": 2, "nullable": False, }, - { - "name": "col7_precision", - "data_type": "binary", - "precision": 19, - "nullable": False, - }, + # { + # "name": "col7_precision", + # "data_type": "binary", + # "precision": 19, + # "nullable": False, + # }, # FIXME See comment above {"name": "col11_precision", "data_type": "time", "precision": 6, "nullable": False}, ] @@ -987,6 +987,7 @@ class SchemaChoice(IntEnum): "name": "filled_at", "nullable": True, "data_type": "timestamp", + "timezone": True, "precision": 6, }, "_pg_lsn": {"name": "_pg_lsn", "nullable": True, "data_type": "bigint"}, diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index b3bd6e524..044ebbc7c 100644 --- 
a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -298,11 +298,14 @@ def items(data): if init_load and give_hints: snapshot.items.apply_hints(columns=column_schema) + repl_options = {"items": {"backend": backend}} + if give_hints: + repl_options["items"]["column_hints"] = column_schema changes = replication_source( slot_name=slot_name, schema=src_pl.dataset_name, table_names="items", - repl_options={"items": {"backend": backend}}, + repl_options=repl_options, ) changes.items.apply_hints( write_disposition="merge", primary_key="col1", columns=merge_hints From f276b4ad7904b2e93ecc8ba47b6f90e86aed1f55 Mon Sep 17 00:00:00 2001 From: Nicolas ESTRADA Date: Tue, 1 Jul 2025 02:16:27 +0200 Subject: [PATCH 88/88] chore: updating poetry + dlt --- poetry.lock | 475 ++++++++++++++---- pyproject.toml | 4 +- .../test_pg_replication.py | 3 +- tests/test_dlt_init.py | 3 +- 4 files changed, 370 insertions(+), 115 deletions(-) diff --git a/poetry.lock b/poetry.lock index 4096d4472..4fbb40b85 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. [[package]] name = "adlfs" @@ -6,6 +6,7 @@ version = "2023.10.0" description = "Access Azure Datalake Gen1 with fsspec and dask" optional = false python-versions = ">=3.8" +groups = ["filesystem"] files = [ {file = "adlfs-2023.10.0-py3-none-any.whl", hash = "sha256:dfdc8cc782bd78262435fb1bc2a8cfdbdd80342bb1b1ae9dfff968de912b0b09"}, {file = "adlfs-2023.10.0.tar.gz", hash = "sha256:f5cf06c5b0074d17d43838d4c434791a98420d9e768b36a1a02c7b3930686543"}, @@ -28,6 +29,7 @@ version = "2.5.4" description = "Async client for aws services using botocore and aiohttp" optional = false python-versions = ">=3.7" +groups = ["filesystem"] files = [ {file = "aiobotocore-2.5.4-py3-none-any.whl", hash = "sha256:4b32218728ca3d0be83835b604603a0cd6c329066e884bb78149334267f92440"}, {file = "aiobotocore-2.5.4.tar.gz", hash = "sha256:60341f19eda77e41e1ab11eef171b5a98b5dbdb90804f5334b6f90e560e31fae"}, @@ -49,6 +51,7 @@ version = "3.8.6" description = "Async http client/server framework (asyncio)" optional = false python-versions = ">=3.6" +groups = ["main", "facebook_ads", "filesystem", "unstructured_data", "unstructured_data_lint"] files = [ {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:41d55fc043954cddbbd82503d9cc3f4814a40bcef30b3569bc7b5e34130718c1"}, {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1d84166673694841d8953f0a8d0c90e1087739d24632fe86b1a08819168b4566"}, @@ -149,7 +152,7 @@ multidict = ">=4.5,<7.0" yarl = ">=1.0,<2.0" [package.extras] -speedups = ["Brotli", "aiodns", "cchardet"] +speedups = ["Brotli", "aiodns", "cchardet ; python_version < \"3.10\""] [[package]] name = "aioitertools" @@ -157,6 +160,7 @@ version = "0.11.0" description = "itertools and builtins for AsyncIO and mixed iterables" optional = false python-versions = ">=3.6" +groups = ["filesystem"] files = [ {file = "aioitertools-0.11.0-py3-none-any.whl", hash = "sha256:04b95e3dab25b449def24d7df809411c10e62aab0cbe31a50ca4e68748c43394"}, {file = "aioitertools-0.11.0.tar.gz", hash = "sha256:42c68b8dd3a69c2bf7f2233bf7df4bb58b557bca5252ac02ed5187bbc67d6831"}, @@ -171,6 +175,7 @@ version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" optional = false python-versions = ">=3.7" +groups = 
["main", "facebook_ads", "filesystem", "unstructured_data", "unstructured_data_lint"] files = [ {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, @@ -185,6 +190,7 @@ version = "4.0.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false python-versions = ">=3.8" +groups = ["unstructured_data"] files = [ {file = "anyio-4.0.0-py3-none-any.whl", hash = "sha256:cfdb2b588b9fc25ede96d8db56ed50848b0b649dca3dd1df0b11f683bb9e0b5f"}, {file = "anyio-4.0.0.tar.gz", hash = "sha256:f7ed51751b2c2add651e5747c891b47e26d2a21be5d32d9311dfe9692f3e5d7a"}, @@ -197,7 +203,7 @@ sniffio = ">=1.1" [package.extras] doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17) ; python_version < \"3.12\" and platform_python_implementation == \"CPython\" and platform_system != \"Windows\""] trio = ["trio (>=0.22)"] [[package]] @@ -206,6 +212,7 @@ version = "0.0.1" description = "" optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "argilla-0.0.1-py3-none-any.whl", hash = "sha256:8bdc3c505bcfb47ba4b91f5658034eae53bf7d4f9317980397605c0c55817396"}, {file = "argilla-0.0.1.tar.gz", hash = "sha256:5017854754e89f573b31af25b25b803f51cea9ca1fa0bcf00505dee1f45cf7c9"}, @@ -217,6 +224,7 @@ version = "3.2.2" description = "Asana API client" optional = false python-versions = "*" +groups = ["asana_dlt"] files = [ {file = "asana-3.2.2-py2.py3-none-any.whl", hash = "sha256:e8426ae5f5cda2c27d29874145acb589b91e673a84e3fbd45404679499d9604a"}, {file = "asana-3.2.2.tar.gz", hash = "sha256:3a0c64ad5baaa8c52465fe400cedbc873b2127a77df135af518fd8da1af8d6b9"}, @@ -232,6 +240,7 @@ version = "0.3.3" description = "Some handy helper functions for Python's AST module." 
optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "astatine-0.3.3-py3-none-any.whl", hash = "sha256:6d8c914f01fbea252cb8f31563f2e766a9ab03c02b9bcc37d18f7d9138828401"}, {file = "astatine-0.3.3.tar.gz", hash = "sha256:0c58a7844b5890ff16da07dbfeb187341d8324cb4378940f89d795cbebebce08"}, @@ -247,6 +256,7 @@ version = "2.4.0" description = "Annotate AST trees with source code positions" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "asttokens-2.4.0-py2.py3-none-any.whl", hash = "sha256:cf8fc9e61a86461aa9fb161a14a0841a03c405fa829ac6b202670b3495d2ce69"}, {file = "asttokens-2.4.0.tar.gz", hash = "sha256:2e0171b991b2c959acc6c49318049236844a5da1d65ba2672c4880c1c894834e"}, @@ -258,27 +268,13 @@ six = ">=1.12.0" [package.extras] test = ["astroid", "pytest"] -[[package]] -name = "astunparse" -version = "1.6.3" -description = "An AST unparser for Python" -optional = false -python-versions = "*" -files = [ - {file = "astunparse-1.6.3-py2.py3-none-any.whl", hash = "sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8"}, - {file = "astunparse-1.6.3.tar.gz", hash = "sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872"}, -] - -[package.dependencies] -six = ">=1.6.1,<2.0" -wheel = ">=0.23.0,<1.0" - [[package]] name = "async-timeout" version = "4.0.3" description = "Timeout context manager for asyncio programs" optional = false python-versions = ">=3.7" +groups = ["main", "facebook_ads", "filesystem", "unstructured_data", "unstructured_data_lint"] files = [ {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, @@ -290,6 +286,7 @@ version = "23.1.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.7" +groups = ["main", "dev", "facebook_ads", "filesystem", "salesforce", "scrapy", "unstructured_data", "unstructured_data_lint"] files = [ {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"}, {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"}, @@ -300,7 +297,7 @@ cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] dev = ["attrs[docs,tests]", "pre-commit"] docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope-interface"] tests = ["attrs[tests-no-zope]", "zope-interface"] -tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-no-zope = ["cloudpickle ; platform_python_implementation == \"CPython\"", "hypothesis", "mypy (>=1.1.1) ; platform_python_implementation == \"CPython\"", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins ; platform_python_implementation == \"CPython\" and python_version < \"3.11\"", "pytest-xdist[psutil]"] [[package]] name = "automat" @@ -308,6 +305,7 @@ version = "22.10.0" description = "Self-service finite-state machines for the programmer on the go." 
optional = false python-versions = "*" +groups = ["dev", "scrapy"] files = [ {file = "Automat-22.10.0-py2.py3-none-any.whl", hash = "sha256:c3164f8742b9dc440f3682482d32aaff7bb53f71740dd018533f9de286b64180"}, {file = "Automat-22.10.0.tar.gz", hash = "sha256:e56beb84edad19dcc11d30e8d9b895f75deeb5ef5e96b84a467066b3b84bb04e"}, @@ -326,6 +324,7 @@ version = "1.29.4" description = "Microsoft Azure Core Library for Python" optional = false python-versions = ">=3.7" +groups = ["filesystem"] files = [ {file = "azure-core-1.29.4.tar.gz", hash = "sha256:500b3aa9bf2e90c5ccc88bb105d056114ca0ce7d0ce73afb8bc4d714b2fc7568"}, {file = "azure_core-1.29.4-py3-none-any.whl", hash = "sha256:b03261bcba22c0b9290faf9999cedd23e849ed2577feee90515694cea6bc74bf"}, @@ -345,6 +344,7 @@ version = "0.0.53" description = "Azure Data Lake Store Filesystem Client Library for Python" optional = false python-versions = "*" +groups = ["filesystem"] files = [ {file = "azure-datalake-store-0.0.53.tar.gz", hash = "sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393"}, {file = "azure_datalake_store-0.0.53-py2.py3-none-any.whl", hash = "sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b"}, @@ -361,6 +361,7 @@ version = "1.14.1" description = "Microsoft Azure Identity Library for Python" optional = false python-versions = ">=3.7" +groups = ["filesystem"] files = [ {file = "azure-identity-1.14.1.zip", hash = "sha256:48e2a9dbdc59b4f095f841d867d9a8cbe4c1cdbbad8251e055561afd47b4a9b8"}, {file = "azure_identity-1.14.1-py3-none-any.whl", hash = "sha256:3a5bef8e9c3281e864e869739be8d67424bff616cddae96b546ca2a5168d863d"}, @@ -378,6 +379,7 @@ version = "12.18.3" description = "Microsoft Azure Blob Storage Client Library for Python" optional = false python-versions = ">=3.7" +groups = ["filesystem"] files = [ {file = "azure-storage-blob-12.18.3.tar.gz", hash = "sha256:d8ced0deee3367fa3d4f3d1a03cd9edadf4440c0a371f503d623fa6c807554ee"}, {file = "azure_storage_blob-12.18.3-py3-none-any.whl", hash = "sha256:c278dde2ac41857a68d615c9f2b36d894ba877a7e84d62795603c7e79d0bb5e9"}, @@ -398,6 +400,7 @@ version = "2.2.1" description = "Function decoration for backoff and retry" optional = false python-versions = ">=3.7,<4.0" +groups = ["unstructured_data"] files = [ {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, @@ -409,6 +412,7 @@ version = "1.7.5" description = "Security oriented static analyser for python code." 
optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "bandit-1.7.5-py3-none-any.whl", hash = "sha256:75665181dc1e0096369112541a056c59d1c5f66f9bb74a8d686c3c362b83f549"}, {file = "bandit-1.7.5.tar.gz", hash = "sha256:bdfc739baa03b880c2d15d0431b31c658ffc348e907fe197e54e0389dd59e11e"}, @@ -422,8 +426,8 @@ rich = "*" stevedore = ">=1.20.0" [package.extras] -test = ["beautifulsoup4 (>=4.8.0)", "coverage (>=4.5.4)", "fixtures (>=3.0.0)", "flake8 (>=4.0.0)", "pylint (==1.9.4)", "stestr (>=2.5.0)", "testscenarios (>=0.5.0)", "testtools (>=2.3.0)", "tomli (>=1.1.0)"] -toml = ["tomli (>=1.1.0)"] +test = ["beautifulsoup4 (>=4.8.0)", "coverage (>=4.5.4)", "fixtures (>=3.0.0)", "flake8 (>=4.0.0)", "pylint (==1.9.4)", "stestr (>=2.5.0)", "testscenarios (>=0.5.0)", "testtools (>=2.3.0)", "tomli (>=1.1.0) ; python_version < \"3.11\""] +toml = ["tomli (>=1.1.0) ; python_version < \"3.11\""] yaml = ["PyYAML"] [[package]] @@ -432,6 +436,7 @@ version = "23.9.1" description = "The uncompromising code formatter." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "black-23.9.1-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:d6bc09188020c9ac2555a498949401ab35bb6bf76d4e0f8ee251694664df6301"}, {file = "black-23.9.1-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:13ef033794029b85dfea8032c9d3b92b42b526f1ff4bf13b2182ce4e917f5100"}, @@ -478,6 +483,7 @@ version = "1.31.17" description = "Low-level, data-driven core of boto 3." optional = false python-versions = ">= 3.7" +groups = ["filesystem"] files = [ {file = "botocore-1.31.17-py3-none-any.whl", hash = "sha256:6ac34a1d34aa3750e78b77b8596617e2bab938964694d651939dba2cbde2c12b"}, {file = "botocore-1.31.17.tar.gz", hash = "sha256:396459065dba4339eb4da4ec8b4e6599728eb89b7caaceea199e26f7d824a41c"}, @@ -497,6 +503,7 @@ version = "5.3.1" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" +groups = ["main", "filesystem", "google_ads", "google_analytics", "google_sheets"] files = [ {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"}, {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"}, @@ -508,6 +515,7 @@ version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main", "airtable", "asana_dlt", "dev", "dltpure", "facebook_ads", "filesystem", "google_ads", "google_analytics", "google_sheets", "salesforce", "scrapy", "stripe_analytics", "unstructured_data", "unstructured_data_lint"] files = [ {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, @@ -519,6 +527,7 @@ version = "1.16.0" description = "Foreign Function Interface for Python calling C code." 
optional = false python-versions = ">=3.8" +groups = ["main", "filesystem", "salesforce", "scrapy", "unstructured_data"] files = [ {file = "cffi-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6b3d6606d369fc1da4fd8c357d026317fbb9c9b75d36dc16e90e84c26854b088"}, {file = "cffi-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ac0f5edd2360eea2f1daa9e26a41db02dd4b0451b48f7c318e217ee092a213e9"}, @@ -573,6 +582,7 @@ files = [ {file = "cffi-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:3686dffb02459559c74dd3d81748269ffb0eb027c39a6fc99502de37d501faa8"}, {file = "cffi-1.16.0.tar.gz", hash = "sha256:bcb3ef43e58665bbda2fb198698fcae6776483e0c4a631aa5647806c25e02cc0"}, ] +markers = {main = "platform_python_implementation == \"PyPy\""} [package.dependencies] pycparser = "*" @@ -583,6 +593,7 @@ version = "5.2.0" description = "Universal encoding detector for Python 3" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, @@ -594,6 +605,7 @@ version = "3.3.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7.0" +groups = ["main", "airtable", "asana_dlt", "dev", "dltpure", "facebook_ads", "filesystem", "google_ads", "google_analytics", "google_sheets", "salesforce", "scrapy", "stripe_analytics", "unstructured_data", "unstructured_data_lint"] files = [ {file = "charset-normalizer-3.3.0.tar.gz", hash = "sha256:63563193aec44bce707e0c5ca64ff69fa72ed7cf34ce6e11d5127555756fd2f6"}, {file = "charset_normalizer-3.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:effe5406c9bd748a871dbcaf3ac69167c38d72db8c9baf3ff954c344f31c4cbe"}, @@ -693,6 +705,7 @@ version = "0.3.29" description = "Chroma." optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "chromadb-0.3.29-py3-none-any.whl", hash = "sha256:d681a3e4f3284715dd146774be84cad3d2f8c529bd004ba249e1d3deb70ac68e"}, {file = "chromadb-0.3.29.tar.gz", hash = "sha256:29d47835da494fc1b58da40abb1435689d4ba1c93df6c64664a5d91521cb80e9"}, @@ -722,6 +735,7 @@ version = "8.1.7" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main", "dev", "dltpure", "unstructured_data"] files = [ {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, @@ -736,6 +750,7 @@ version = "0.6.14" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" optional = false python-versions = "~=3.7" +groups = ["unstructured_data"] files = [ {file = "clickhouse-connect-0.6.14.tar.gz", hash = "sha256:0531bbd5b8bdee616bf1cca5ddcb0af86db12e2b48fd39257a8ecdf32200bd57"}, {file = "clickhouse_connect-0.6.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:04affbd255fb8b1e4a882ddc1336c86530976d05578f47bb65e3a53471d291e4"}, @@ -824,10 +839,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev", "dltpure", "pytest", "unstructured_data", "unstructured_data_lint"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "platform_system == \"Windows\"", dev = "platform_system == \"Windows\" or sys_platform == \"win32\"", dltpure = "platform_system == \"Windows\"", pytest = "sys_platform == \"win32\"", unstructured_data = "platform_system == \"Windows\" or sys_platform == \"win32\"", unstructured_data_lint = "platform_system == \"Windows\""} [[package]] name = "coloredlogs" @@ -835,6 +852,7 @@ version = "15.0.1" description = "Colored terminal output for Python's logging module" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["unstructured_data"] files = [ {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, @@ -852,6 +870,7 @@ version = "2.3.0" description = "Confluent's Python client for Apache Kafka" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "confluent-kafka-2.3.0.tar.gz", hash = "sha256:4069e7b56e0baf9db18c053a605213f0ab2d8f23715dca7b3bd97108df446ced"}, {file = "confluent_kafka-2.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5df845755cd3ebb9165ca00fd1d3a7d514c61e84d9fcbe7babb91193fe9b369c"}, @@ -890,10 +909,10 @@ files = [ ] [package.extras] -avro = ["avro (>=1.11.1,<2)", "fastavro (>=0.23.0,<1.0)", "fastavro (>=1.0)", "requests"] -dev = ["avro (>=1.11.1,<2)", "fastavro (>=0.23.0,<1.0)", "fastavro (>=1.0)", "flake8", "pytest", "pytest (==4.6.4)", "pytest-timeout", "requests"] -doc = ["avro (>=1.11.1,<2)", "fastavro (>=0.23.0,<1.0)", "fastavro (>=1.0)", "requests", "sphinx", "sphinx-rtd-theme"] -json = ["jsonschema", "pyrsistent", "pyrsistent (==0.16.1)", "requests"] +avro = ["avro (>=1.11.1,<2)", "fastavro (>=0.23.0,<1.0) ; python_version < \"3.0\"", "fastavro (>=1.0) ; python_version > \"3.0\"", "requests"] +dev = ["avro (>=1.11.1,<2)", "fastavro (>=0.23.0,<1.0) ; python_version < \"3.0\"", "fastavro (>=1.0) ; python_version > \"3.0\"", "flake8", "pytest (==4.6.4) ; python_version < \"3.0\"", "pytest ; python_version >= \"3.0\"", "pytest-timeout", "requests"] +doc = ["avro (>=1.11.1,<2)", "fastavro (>=0.23.0,<1.0) ; python_version < \"3.0\"", "fastavro (>=1.0) ; python_version > \"3.0\"", "requests", "sphinx", "sphinx-rtd-theme"] +json = ["jsonschema", "pyrsistent (==0.16.1) ; python_version < \"3.0\"", "pyrsistent ; python_version > \"3.0\"", "requests"] protobuf = ["protobuf", "requests"] schema-registry = ["requests"] @@ -903,6 +922,7 @@ version = "0.3.2" description = "" optional = false python-versions = "*" +groups = ["sql_database"] files = [ {file = "connectorx-0.3.2-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:98274242c64a2831a8b1c86e0fa2c46a557dd8cbcf00c3adcf5a602455fb02d7"}, {file = "connectorx-0.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e2b11ba49efd330a7348bef3ce09c98218eea21d92a12dd75cd8f0ade5c99ffc"}, @@ -928,6 +948,7 @@ version = "23.10.4" description = "Symbolic constants in Python" optional = false python-versions 
= ">=3.8" +groups = ["dev", "scrapy"] files = [ {file = "constantly-23.10.4-py3-none-any.whl", hash = "sha256:3fd9b4d1c3dc1ec9757f3c52aef7e53ad9323dbe39f51dfd4c43853b68dfa3f9"}, {file = "constantly-23.10.4.tar.gz", hash = "sha256:aa92b70a33e2ac0bb33cd745eb61776594dc48764b06c35e0efd050b7f1c7cbd"}, @@ -939,6 +960,7 @@ version = "7.6.1" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "coverage-7.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b06079abebbc0e89e6163b8e8f0e16270124c154dc6e4a47b413dd538859af16"}, {file = "coverage-7.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf4b19715bccd7ee27b6b120e7e9dd56037b9c0681dcc1adc9ba9db3d417fa36"}, @@ -1018,7 +1040,7 @@ files = [ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} [package.extras] -toml = ["tomli"] +toml = ["tomli ; python_full_version <= \"3.11.0a6\""] [[package]] name = "cryptography" @@ -1026,6 +1048,7 @@ version = "41.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" +groups = ["filesystem", "salesforce", "scrapy", "unstructured_data"] files = [ {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839"}, {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f"}, @@ -1071,6 +1094,7 @@ version = "1.2.0" description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0" optional = false python-versions = ">=3.7" +groups = ["scrapy"] files = [ {file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"}, {file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"}, @@ -1082,6 +1106,7 @@ version = "2.2.1" description = "Library to convert python requests object to curl command." 
optional = false python-versions = "*" +groups = ["facebook_ads"] files = [ {file = "curlify-2.2.1.tar.gz", hash = "sha256:0d3f02e7235faf952de8ef45ef469845196d30632d5838bcd5aee217726ddd6d"}, ] @@ -1095,6 +1120,7 @@ version = "0.5.9" description = "Easily serialize dataclasses to and from JSON" optional = false python-versions = ">=3.6" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "dataclasses-json-0.5.9.tar.gz", hash = "sha256:e9ac87b73edc0141aafbce02b44e93553c3123ad574958f0fe52a534b6707e8e"}, {file = "dataclasses_json-0.5.9-py3-none-any.whl", hash = "sha256:1280542631df1c375b7bc92e5b86d39e06c44760d7e3571a537b3b8acabf2f0c"}, @@ -1106,7 +1132,7 @@ marshmallow-enum = ">=1.5.1,<2.0.0" typing-inspect = ">=0.4.0" [package.extras] -dev = ["flake8", "hypothesis", "ipython", "mypy (>=0.710)", "portray", "pytest (>=7.2.0)", "setuptools", "simplejson", "twine", "types-dataclasses", "wheel"] +dev = ["flake8", "hypothesis", "ipython", "mypy (>=0.710)", "portray", "pytest (>=7.2.0)", "setuptools", "simplejson", "twine", "types-dataclasses ; python_version == \"3.6\"", "wheel"] [[package]] name = "db-dtypes" @@ -1114,6 +1140,7 @@ version = "1.3.1" description = "Pandas Data Types for SQL systems (BigQuery, Spanner)" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "db_dtypes-1.3.1-py2.py3-none-any.whl", hash = "sha256:fbc9d1740d94aaf2b5ae24601cfc875a69b4635bb9d049e3c3036e9f10203af8"}, {file = "db_dtypes-1.3.1.tar.gz", hash = "sha256:a058f05dab100891f3e76a7a3db9ad0f107f18dd3d1bdd13680749a2f07eae77"}, @@ -1131,6 +1158,7 @@ version = "5.1.1" description = "Decorators for Humans" optional = false python-versions = ">=3.5" +groups = ["main", "filesystem"] files = [ {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, @@ -1138,17 +1166,17 @@ files = [ [[package]] name = "dlt" -version = "1.3.0" +version = "1.8.1" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." 
optional = false -python-versions = "<3.13,>=3.8.1" +python-versions = "<3.14,>=3.9" +groups = ["main", "dltpure"] files = [ - {file = "dlt-1.3.0-py3-none-any.whl", hash = "sha256:e2583ed0ad4a0d9941b8f9cb0e078f4443bcbeb0e1cf1cce586cf35107ccf266"}, - {file = "dlt-1.3.0.tar.gz", hash = "sha256:57eecee99ace25b6d37027a78f59f8c735d1913cc81f1101e1b47bf96fc544b8"}, + {file = "dlt-1.8.1-py3-none-any.whl", hash = "sha256:154699cc70e4263a294b576ca8d22bb7e153bfb872acabba08fcfecd9b9d285a"}, + {file = "dlt-1.8.1.tar.gz", hash = "sha256:6ff9c56d7ea416cd01bce874348023042a441d6f83b35495d234efd709d9fd77"}, ] [package.dependencies] -astunparse = ">=1.6.3" click = ">=7.1" db-dtypes = {version = ">=1.2.0", optional = true, markers = "extra == \"gcp\" or extra == \"bigquery\""} duckdb = {version = ">=0.9", optional = true, markers = "extra == \"duckdb\" or extra == \"motherduck\""} @@ -1165,50 +1193,54 @@ makefun = ">=1.15.0" orjson = {version = ">=3.6.7,<3.9.11 || >3.9.11,<3.9.12 || >3.9.12,<3.9.13 || >3.9.13,<3.9.14 || >3.9.14,<3.10.1 || >3.10.1,<4", markers = "platform_python_implementation != \"PyPy\""} packaging = ">=21.1" pathvalidate = ">=2.5.2" -pendulum = ">=2.1.2" +pendulum = {version = ">=2.1.2", markers = "python_version < \"3.13\""} pluggy = ">=1.3.0" -psycopg2-binary = {version = ">=2.9.1", optional = true, markers = "extra == \"postgres\" or extra == \"redshift\""} -psycopg2cffi = {version = ">=2.9.0", optional = true, markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"redshift\")"} -pyarrow = {version = ">=12.0.0", optional = true, markers = "extra == \"bigquery\" or extra == \"parquet\" or extra == \"motherduck\" or extra == \"athena\" or extra == \"synapse\" or extra == \"clickhouse\" or extra == \"dremio\" or extra == \"lancedb\" or extra == \"deltalake\""} +psycopg2-binary = {version = ">=2.9.1", optional = true, markers = "extra == \"postgres\" or extra == \"redshift\" or extra == \"postgis\""} +psycopg2cffi = {version = ">=2.9.0", optional = true, markers = "platform_python_implementation == \"PyPy\" and (extra == \"postgres\" or extra == \"redshift\" or extra == \"postgis\")"} +pyarrow = {version = ">=12.0.0,<18", optional = true, markers = "python_version >= \"3.9\" and python_version < \"3.13\" and (extra == \"bigquery\" or extra == \"parquet\" or extra == \"motherduck\" or extra == \"athena\" or extra == \"synapse\" or extra == \"clickhouse\" or extra == \"dremio\" or extra == \"lancedb\" or extra == \"deltalake\" or extra == \"pyiceberg\")"} pytz = ">=2022.6" +pywin32 = {version = ">=306", markers = "sys_platform == \"win32\""} PyYAML = ">=5.4.1" requests = ">=2.26.0" requirements-parser = ">=0.5.0" -semver = ">=2.13.0" +rich-argparse = ">=1.6.0,<2.0.0" +semver = ">=3.0.0" setuptools = ">=65.6.0" simplejson = ">=3.17.5" tenacity = ">=8.0.2" tomlkit = ">=0.11.3" -typing-extensions = ">=4.0.0" +typing-extensions = ">=4.8.0" tzdata = ">=2022.1" win-precise-time = {version = ">=1.4.2", markers = "os_name == \"nt\""} [package.extras] -athena = ["botocore (>=1.28)", "pyarrow (>=12.0.0)", "pyathena (>=2.9.6)", "s3fs (>=2022.4.0)"] -az = ["adlfs (>=2022.4.0)"] -bigquery = ["db-dtypes (>=1.2.0)", "gcsfs (>=2022.4.0)", "google-cloud-bigquery (>=2.26.0)", "grpcio (>=1.50.0)", "pyarrow (>=12.0.0)"] -cli = ["cron-descriptor (>=1.2.32)", "pipdeptree (>=2.9.0,<2.10)"] -clickhouse = ["adlfs (>=2022.4.0)", "clickhouse-connect (>=0.7.7)", "clickhouse-driver (>=0.2.7)", "gcsfs (>=2022.4.0)", "pyarrow (>=12.0.0)", "s3fs (>=2022.4.0)"] -databricks = 
["databricks-sql-connector (>=2.9.3)"] -deltalake = ["deltalake (>=0.19.0)", "pyarrow (>=12.0.0)"] -dremio = ["pyarrow (>=12.0.0)"] +athena = ["botocore (>=1.28)", "pyarrow (>=12.0.0,<18) ; python_version >= \"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\"", "pyathena (>=2.9.6)", "s3fs (>=2022.4.0)"] +az = ["adlfs (>=2024.7.0)"] +bigquery = ["db-dtypes (>=1.2.0)", "gcsfs (>=2022.4.0)", "google-cloud-bigquery (>=2.26.0)", "grpcio (>=1.50.0)", "pyarrow (>=12.0.0,<18) ; python_version >= \"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\""] +cli = ["cron-descriptor (>=1.2.32)", "pip (>=23.0.0)", "pipdeptree (>=2.9.0,<2.10)"] +clickhouse = ["adlfs (>=2024.7.0)", "clickhouse-connect (>=0.7.7)", "clickhouse-driver (>=0.2.7)", "gcsfs (>=2022.4.0)", "pyarrow (>=12.0.0,<18) ; python_version >= \"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\"", "s3fs (>=2022.4.0)"] +databricks = ["databricks-sdk (>=0.38.0)", "databricks-sql-connector (>=2.9.3,<4) ; python_version <= \"3.12\"", "databricks-sql-connector (>=3.6.0) ; python_version >= \"3.13\""] +deltalake = ["deltalake (>=0.21.0)", "pyarrow (>=12.0.0,<18) ; python_version >= \"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\""] +dremio = ["pyarrow (>=12.0.0,<18) ; python_version >= \"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\""] duckdb = ["duckdb (>=0.9)"] filesystem = ["botocore (>=1.28)", "s3fs (>=2022.4.0)", "sqlglot (>=20.0.0)"] gcp = ["db-dtypes (>=1.2.0)", "gcsfs (>=2022.4.0)", "google-cloud-bigquery (>=2.26.0)", "grpcio (>=1.50.0)"] gs = ["gcsfs (>=2022.4.0)"] -lancedb = ["lancedb (>=0.8.2)", "pyarrow (>=12.0.0)", "tantivy (>=0.22.0)"] -motherduck = ["duckdb (>=0.9)", "pyarrow (>=12.0.0)"] +lancedb = ["lancedb (>=0.8.2) ; python_version < \"3.13\"", "pyarrow (>=12.0.0,<18) ; python_version >= \"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\"", "tantivy (>=0.22.0)"] +motherduck = ["duckdb (>=0.9)", "pyarrow (>=12.0.0,<18) ; python_version >= \"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\""] mssql = ["pyodbc (>=4.0.39)"] -parquet = ["pyarrow (>=12.0.0)"] -postgres = ["psycopg2-binary (>=2.9.1)", "psycopg2cffi (>=2.9.0)"] +parquet = ["pyarrow (>=12.0.0,<18) ; python_version >= \"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\""] +postgis = ["psycopg2-binary (>=2.9.1)", "psycopg2cffi (>=2.9.0) ; platform_python_implementation == \"PyPy\""] +postgres = ["psycopg2-binary (>=2.9.1)", "psycopg2cffi (>=2.9.0) ; platform_python_implementation == \"PyPy\""] +pyiceberg = ["pyarrow (>=12.0.0,<18) ; python_version >= \"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\"", "pyiceberg (>=0.8.1)", "sqlalchemy (>=1.4)"] qdrant = ["qdrant-client[fastembed] (>=1.8)"] -redshift = ["psycopg2-binary (>=2.9.1)", "psycopg2cffi (>=2.9.0)"] +redshift = ["psycopg2-binary (>=2.9.1)", "psycopg2cffi (>=2.9.0) ; platform_python_implementation == \"PyPy\""] s3 = ["botocore (>=1.28)", "s3fs (>=2022.4.0)"] sftp = ["paramiko (>=3.3.0)"] snowflake = ["snowflake-connector-python (>=3.5.0)"] sql-database = ["sqlalchemy (>=1.4)"] sqlalchemy = ["alembic (>1.10.0)", "sqlalchemy (>=1.4)"] -synapse = ["adlfs (>=2022.4.0)", "pyarrow (>=12.0.0)", "pyodbc (>=4.0.39)"] +synapse = ["adlfs (>=2024.7.0)", "pyarrow (>=12.0.0,<18) ; python_version >= 
\"3.9\" and python_version < \"3.13\"", "pyarrow (>=18.0.0) ; python_version >= \"3.13\"", "pyodbc (>=4.0.39)"] weaviate = ["weaviate-client (>=3.22)"] [[package]] @@ -1217,6 +1249,7 @@ version = "2.4.2" description = "DNS toolkit" optional = false python-versions = ">=3.8,<4.0" +groups = ["mongodb"] files = [ {file = "dnspython-2.4.2-py3-none-any.whl", hash = "sha256:57c6fbaaeaaf39c891292012060beb141791735dbb4004798328fc2c467402d8"}, {file = "dnspython-2.4.2.tar.gz", hash = "sha256:8dcfae8c7460a2f84b4072e26f1c9f4101ca20c071649cb7c34e8b6a93d58984"}, @@ -1236,6 +1269,7 @@ version = "3.6.1" description = "Helpful functions for Python 🐍 🛠️" optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "domdf_python_tools-3.6.1-py3-none-any.whl", hash = "sha256:e18158460850957f18e740eb94ede56f580ddb0cb162ab9d9834ed8bbb1b6431"}, {file = "domdf_python_tools-3.6.1.tar.gz", hash = "sha256:acc04563d23bce4d437dd08af6b9bea788328c412772a044d8ca428a7ad861be"}, @@ -1255,6 +1289,7 @@ version = "0.10.3" description = "DuckDB in-process database" optional = false python-versions = ">=3.7.0" +groups = ["main", "unstructured_data"] files = [ {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd25cc8d001c09a19340739ba59d33e12a81ab285b7a6bed37169655e1cefb31"}, {file = "duckdb-0.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f9259c637b917ca0f4c63887e8d9b35ec248f5d987c886dfc4229d66a791009"}, @@ -1311,6 +1346,7 @@ version = "1.1.0" description = "An implementation of lxml.xmlfile for the standard library" optional = false python-versions = ">=3.6" +groups = ["unstructured_data"] files = [ {file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"}, {file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"}, @@ -1322,6 +1358,8 @@ version = "1.1.3" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups = ["dev", "pytest", "unstructured_data"] +markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, @@ -1336,6 +1374,7 @@ version = "17.0.4" description = "Facebook Business SDK" optional = false python-versions = "*" +groups = ["facebook_ads"] files = [ {file = "facebook_business-17.0.4-py3-none-any.whl", hash = "sha256:c3a4afbe019c1fd2454eeeefb4e895ed3276d506115fbf9a993135f6af1c1a88"}, {file = "facebook_business-17.0.4.tar.gz", hash = "sha256:52b516a237ab4cbf083053d3cc062995ff4732fca487b46543c4eab3bdbbf188"}, @@ -1354,6 +1393,7 @@ version = "0.85.1" description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "fastapi-0.85.1-py3-none-any.whl", hash = "sha256:de3166b6b1163dc22da4dc4ebdc3192fcbac7700dd1870a1afa44de636a636b5"}, {file = "fastapi-0.85.1.tar.gz", hash = "sha256:1facd097189682a4ff11cbd01334a992e51b56be663b2bd50c2c09523624f144"}, @@ -1375,6 +1415,7 @@ version = "3.12.4" description = "A platform independent file lock." 
optional = false python-versions = ">=3.8" +groups = ["scrapy", "unstructured_data"] files = [ {file = "filelock-3.12.4-py3-none-any.whl", hash = "sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4"}, {file = "filelock-3.12.4.tar.gz", hash = "sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd"}, @@ -1383,7 +1424,7 @@ files = [ [package.extras] docs = ["furo (>=2023.7.26)", "sphinx (>=7.1.2)", "sphinx-autodoc-typehints (>=1.24)"] testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "pytest-timeout (>=2.1)"] -typing = ["typing-extensions (>=4.7.1)"] +typing = ["typing-extensions (>=4.7.1) ; python_version < \"3.11\""] [[package]] name = "filetype" @@ -1391,6 +1432,7 @@ version = "1.2.0" description = "Infer file type and MIME type of any file/buffer. No external dependencies." optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "filetype-1.2.0-py2.py3-none-any.whl", hash = "sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25"}, {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"}, @@ -1402,6 +1444,7 @@ version = "6.1.0" description = "the modular source code checker: pep8 pyflakes and co" optional = false python-versions = ">=3.8.1" +groups = ["dev"] files = [ {file = "flake8-6.1.0-py2.py3-none-any.whl", hash = "sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5"}, {file = "flake8-6.1.0.tar.gz", hash = "sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23"}, @@ -1418,6 +1461,7 @@ version = "22.12.6" description = "A plugin for flake8 finding likely bugs and design problems in your program. Contains warnings that don't belong in pyflakes and pycodestyle." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "flake8-bugbear-22.12.6.tar.gz", hash = "sha256:4cdb2c06e229971104443ae293e75e64c6107798229202fbe4f4091427a30ac0"}, {file = "flake8_bugbear-22.12.6-py3-none-any.whl", hash = "sha256:b69a510634f8a9c298dfda2b18a8036455e6b19ecac4fe582e4d7a0abfa50a30"}, @@ -1436,6 +1480,7 @@ version = "2.1.0" description = "Check for python builtins being used as variables or parameters." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "flake8-builtins-2.1.0.tar.gz", hash = "sha256:12ff1ee96dd4e1f3141141ee6c45a5c7d3b3c440d0949e9b8d345c42b39c51d4"}, {file = "flake8_builtins-2.1.0-py3-none-any.whl", hash = "sha256:469e8f03d6d0edf4b1e62b6d5a97dce4598592c8a13ec8f0952e7a185eba50a1"}, @@ -1453,6 +1498,7 @@ version = "0.5.0.post1" description = "A Flake8 plugin to identify incorrect use of encodings." optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "flake8_encodings-0.5.0.post1-py3-none-any.whl", hash = "sha256:d2fecca0e89ba09c86e5d61cf6bdb1b337f0d74746aac67bbcf0c517b4cb6cba"}, {file = "flake8_encodings-0.5.0.post1.tar.gz", hash = "sha256:082c0163325c85b438a8106e876283b5ed3cbfc53e68d89130d70be8be4c9977"}, @@ -1474,6 +1520,7 @@ version = "0.2.1" description = "A helper library for Flake8 plugins." 
optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "flake8_helper-0.2.1-py3-none-any.whl", hash = "sha256:9123cdf351ad32ee8a51b85036052302c478122d62fb512c0773e111b3d05241"}, {file = "flake8_helper-0.2.1.tar.gz", hash = "sha256:479f86d1c52df8e49ff876ecd3873242699f93eeece7e6675cdca9c37c9b0a16"}, @@ -1488,6 +1535,7 @@ version = "4.10.0" description = "A flake8 plugin that helps you write tidier imports." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "flake8_tidy_imports-4.10.0-py3-none-any.whl", hash = "sha256:b0387fb2ea200441bd142309e716fb7b8f4b0937bdf5f8b7c0c118a5f5e2b8ed"}, {file = "flake8_tidy_imports-4.10.0.tar.gz", hash = "sha256:bd6cf86465402d2b86903009b748d85a628e599e17b76e810c9857e3a2815173"}, @@ -1502,6 +1550,7 @@ version = "23.5.26" description = "The FlatBuffers serialization format for Python" optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "flatbuffers-23.5.26-py2.py3-none-any.whl", hash = "sha256:c0ff356da363087b915fde4b8b45bdda73432fc17cddb3c8157472eab1422ad1"}, {file = "flatbuffers-23.5.26.tar.gz", hash = "sha256:9ea1144cac05ce5d86e2859f431c6cd5e66cd9c78c558317c7955fb8d4c78d89"}, @@ -1513,6 +1562,7 @@ version = "1.4.0" description = "A list-like structure which implements collections.abc.MutableSequence" optional = false python-versions = ">=3.8" +groups = ["main", "facebook_ads", "filesystem", "unstructured_data", "unstructured_data_lint"] files = [ {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"}, {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"}, @@ -1583,6 +1633,7 @@ version = "2024.3.1" description = "File-system specification" optional = false python-versions = ">=3.8" +groups = ["main", "dltpure", "filesystem", "unstructured_data"] files = [ {file = "fsspec-2024.3.1-py3-none-any.whl", hash = "sha256:918d18d41bf73f0e2b261824baeb1b124bcf771767e3a26425cd7dec3332f512"}, {file = "fsspec-2024.3.1.tar.gz", hash = "sha256:f39780e282d7d117ffb42bb96992f8a90795e4d0fb0f661a70ca39fe9c43ded9"}, @@ -1618,6 +1669,7 @@ version = "2024.3.1" description = "Convenient Filesystem interface over GCS" optional = false python-versions = ">=3.8" +groups = ["main", "filesystem"] files = [ {file = "gcsfs-2024.3.1-py2.py3-none-any.whl", hash = "sha256:57ec693a25b74637f00e7a834b4f1dcd7a7511217f7f640072d6fb51a7794bac"}, {file = "gcsfs-2024.3.1.tar.gz", hash = "sha256:d34bdb8a1a51e1b2552ae9e47d1933dec41162ba6b6cc8ea470aef693a8a6aa6"}, @@ -1642,6 +1694,7 @@ version = "4.0.10" description = "Git Object Database" optional = false python-versions = ">=3.7" +groups = ["main", "dev", "dltpure"] files = [ {file = "gitdb-4.0.10-py3-none-any.whl", hash = "sha256:c286cf298426064079ed96a9e4a9d39e7f3e9bf15ba60701e95f5492f28415c7"}, {file = "gitdb-4.0.10.tar.gz", hash = "sha256:6eb990b69df4e15bad899ea868dc46572c3f75339735663b81de79b06f17eb9a"}, @@ -1656,6 +1709,7 @@ version = "3.1.37" description = "GitPython is a Python library used to interact with Git repositories" optional = false python-versions = ">=3.7" +groups = ["main", "dev", "dltpure"] files = [ {file = "GitPython-3.1.37-py3-none-any.whl", hash = "sha256:5f4c4187de49616d710a77e98ddf17b4782060a1788df441846bddefbb89ab33"}, {file = "GitPython-3.1.37.tar.gz", hash = "sha256:f9b9ddc0761c125d5780eab2d64be4873fc6817c2899cbcb34b02344bdc7bc54"}, @@ -1673,6 +1727,7 @@ 
version = "0.12.0" description = "A Git URL parsing module (supports parsing and rewriting)" optional = false python-versions = ">=3.8" +groups = ["main", "dltpure"] files = [ {file = "giturlparse-0.12.0-py2.py3-none-any.whl", hash = "sha256:412b74f2855f1da2fefa89fd8dde62df48476077a72fc19b62039554d27360eb"}, {file = "giturlparse-0.12.0.tar.gz", hash = "sha256:c0fff7c21acc435491b1779566e038757a205c1ffdcb47e4f81ea52ad8c3859a"}, @@ -1684,6 +1739,7 @@ version = "21.3.0" description = "Client library for the Google Ads API" optional = false python-versions = ">=3.7" +groups = ["google_ads"] files = [ {file = "google-ads-21.3.0.tar.gz", hash = "sha256:bd4fcb6bd5e55bace413e889e82012d48578aa28f7b4d726c86e2d594c753c6c"}, {file = "google_ads-21.3.0-py3-none-any.whl", hash = "sha256:961943fc737941a38f1a826681f7974448df7c60e6c8db2ac7168b26d66738a7"}, @@ -1709,6 +1765,7 @@ version = "0.16.3" description = "Google Analytics Data API client library" optional = false python-versions = ">=3.7" +groups = ["google_analytics"] files = [ {file = "google-analytics-data-0.16.3.tar.gz", hash = "sha256:f29431ec63ab462f7a9b42227521d148c877307c629e308c284025ad834aab52"}, {file = "google_analytics_data-0.16.3-py2.py3-none-any.whl", hash = "sha256:bb73f36707a5a2966e87c9439c25cd8004d58305b0ef01c6f2f50128c08feb13"}, @@ -1717,8 +1774,8 @@ files = [ [package.dependencies] google-api-core = {version = ">=1.34.0,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]} proto-plus = [ - {version = ">=1.22.0,<2.0.0dev", markers = "python_version < \"3.11\""}, {version = ">=1.22.2,<2.0.0dev", markers = "python_version >= \"3.11\""}, + {version = ">=1.22.0,<2.0.0dev"}, ] protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" @@ -1728,6 +1785,7 @@ version = "2.12.0" description = "Google API client core library" optional = false python-versions = ">=3.7" +groups = ["main", "filesystem", "google_ads", "google_analytics", "google_sheets"] files = [ {file = "google-api-core-2.12.0.tar.gz", hash = "sha256:c22e01b1e3c4dcd90998494879612c38d0a3411d1f7b679eb89e2abe3ce1f553"}, {file = "google_api_core-2.12.0-py3-none-any.whl", hash = "sha256:ec6054f7d64ad13b41e43d96f735acbd763b0f3b695dabaa2d579673f6a6e160"}, @@ -1737,18 +1795,18 @@ files = [ google-auth = ">=2.14.1,<3.0.dev0" googleapis-common-protos = ">=1.56.2,<2.0.dev0" grpcio = [ - {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, + {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, ] grpcio-status = [ - {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, + {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "extra == \"grpc\""}, ] protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0.dev0" requests = ">=2.18.0,<3.0.0.dev0" [package.extras] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0)"] +grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio 
(>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "grpcio-status (>=1.33.2,<2.0.dev0)", "grpcio-status (>=1.49.1,<2.0.dev0) ; python_version >= \"3.11\""] grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] @@ -1758,6 +1816,7 @@ version = "2.129.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" +groups = ["google_ads", "google_analytics", "google_sheets"] files = [ {file = "google-api-python-client-2.129.0.tar.gz", hash = "sha256:984cc8cc8eb4923468b1926d2b8effc5b459a4dda3c845896eb87c153b28ef84"}, {file = "google_api_python_client-2.129.0-py2.py3-none-any.whl", hash = "sha256:d50f7e2dfdbb7fc2732f6a0cba1c54d7bb676390679526c6bb628c901e43ec86"}, @@ -1776,6 +1835,7 @@ version = "2.23.3" description = "Google Authentication Library" optional = false python-versions = ">=3.7" +groups = ["main", "filesystem", "google_ads", "google_analytics", "google_sheets"] files = [ {file = "google-auth-2.23.3.tar.gz", hash = "sha256:6864247895eea5d13b9c57c9e03abb49cb94ce2dc7c58e91cba3248c7477c9e3"}, {file = "google_auth-2.23.3-py2.py3-none-any.whl", hash = "sha256:a8f4608e65c244ead9e0538f181a96c6e11199ec114d41f1d7b1bffa96937bda"}, @@ -1799,6 +1859,7 @@ version = "0.2.0" description = "Google Authentication Library: httplib2 transport" optional = false python-versions = "*" +groups = ["google_ads", "google_analytics", "google_sheets"] files = [ {file = "google-auth-httplib2-0.2.0.tar.gz", hash = "sha256:38aa7badf48f974f1eb9861794e9c0cb2a0511a4ec0679b1f886d108f5640e05"}, {file = "google_auth_httplib2-0.2.0-py2.py3-none-any.whl", hash = "sha256:b65a0a2123300dd71281a7bf6e64d65a0759287df52729bdd1ae2e47dc311a3d"}, @@ -1814,6 +1875,7 @@ version = "1.1.0" description = "Google Authentication Library" optional = false python-versions = ">=3.6" +groups = ["main", "filesystem", "google_ads", "google_analytics"] files = [ {file = "google-auth-oauthlib-1.1.0.tar.gz", hash = "sha256:83ea8c3b0881e453790baff4448e8a6112ac8778d1de9da0b68010b843937afb"}, {file = "google_auth_oauthlib-1.1.0-py2.py3-none-any.whl", hash = "sha256:089c6e587d36f4803ac7e0720c045c6a8b1fd1790088b8424975b90d0ee61c12"}, @@ -1832,6 +1894,7 @@ version = "3.25.0" description = "Google BigQuery API client library" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "google-cloud-bigquery-3.25.0.tar.gz", hash = "sha256:5b2aff3205a854481117436836ae1403f11f2594e6810a98886afd57eda28509"}, {file = "google_cloud_bigquery-3.25.0-py2.py3-none-any.whl", hash = "sha256:7f0c371bc74d2a7fb74dacbc00ac0f90c8c2bec2289b51dd6685a275873b1ce9"}, @@ -1847,14 +1910,14 @@ python-dateutil = ">=2.7.2,<3.0dev" requests = ">=2.21.0,<3.0.0dev" [package.extras] -all = ["Shapely (>=1.8.4,<3.0.0dev)", "db-dtypes (>=0.3.0,<2.0.0dev)", "geopandas (>=0.9.0,<1.0dev)", "google-cloud-bigquery-storage (>=2.6.0,<3.0.0dev)", "grpcio (>=1.47.0,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "importlib-metadata (>=1.0.0)", "ipykernel (>=6.0.0)", "ipython (>=7.23.1,!=8.1.0)", "ipywidgets (>=7.7.0)", "opentelemetry-api (>=1.1.0)", "opentelemetry-instrumentation (>=0.20b0)", "opentelemetry-sdk (>=1.1.0)", "pandas (>=1.1.0)", "proto-plus (>=1.15.0,<2.0.0dev)", "protobuf (>=3.19.5,!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev)", "pyarrow (>=3.0.0)", "tqdm (>=4.7.4,<5.0.0dev)"] +all = ["Shapely (>=1.8.4,<3.0.0dev)", "db-dtypes (>=0.3.0,<2.0.0dev)", "geopandas (>=0.9.0,<1.0dev)", "google-cloud-bigquery-storage (>=2.6.0,<3.0.0dev)", "grpcio (>=1.47.0,<2.0dev)", 
"grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "importlib-metadata (>=1.0.0) ; python_version < \"3.8\"", "ipykernel (>=6.0.0)", "ipython (>=7.23.1,!=8.1.0)", "ipywidgets (>=7.7.0)", "opentelemetry-api (>=1.1.0)", "opentelemetry-instrumentation (>=0.20b0)", "opentelemetry-sdk (>=1.1.0)", "pandas (>=1.1.0)", "proto-plus (>=1.15.0,<2.0.0dev)", "protobuf (>=3.19.5,!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev)", "pyarrow (>=3.0.0)", "tqdm (>=4.7.4,<5.0.0dev)"] bigquery-v2 = ["proto-plus (>=1.15.0,<2.0.0dev)", "protobuf (>=3.19.5,!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev)"] -bqstorage = ["google-cloud-bigquery-storage (>=2.6.0,<3.0.0dev)", "grpcio (>=1.47.0,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "pyarrow (>=3.0.0)"] +bqstorage = ["google-cloud-bigquery-storage (>=2.6.0,<3.0.0dev)", "grpcio (>=1.47.0,<2.0dev)", "grpcio (>=1.49.1,<2.0dev) ; python_version >= \"3.11\"", "pyarrow (>=3.0.0)"] geopandas = ["Shapely (>=1.8.4,<3.0.0dev)", "geopandas (>=0.9.0,<1.0dev)"] ipython = ["ipykernel (>=6.0.0)", "ipython (>=7.23.1,!=8.1.0)"] ipywidgets = ["ipykernel (>=6.0.0)", "ipywidgets (>=7.7.0)"] opentelemetry = ["opentelemetry-api (>=1.1.0)", "opentelemetry-instrumentation (>=0.20b0)", "opentelemetry-sdk (>=1.1.0)"] -pandas = ["db-dtypes (>=0.3.0,<2.0.0dev)", "importlib-metadata (>=1.0.0)", "pandas (>=1.1.0)", "pyarrow (>=3.0.0)"] +pandas = ["db-dtypes (>=0.3.0,<2.0.0dev)", "importlib-metadata (>=1.0.0) ; python_version < \"3.8\"", "pandas (>=1.1.0)", "pyarrow (>=3.0.0)"] tqdm = ["tqdm (>=4.7.4,<5.0.0dev)"] [[package]] @@ -1863,6 +1926,7 @@ version = "2.3.3" description = "Google Cloud API client core library" optional = false python-versions = ">=3.7" +groups = ["main", "filesystem"] files = [ {file = "google-cloud-core-2.3.3.tar.gz", hash = "sha256:37b80273c8d7eee1ae816b3a20ae43585ea50506cb0e60f3cf5be5f87f1373cb"}, {file = "google_cloud_core-2.3.3-py2.py3-none-any.whl", hash = "sha256:fbd11cad3e98a7e5b0343dc07cb1039a5ffd7a5bb96e1f1e27cee4bda4a90863"}, @@ -1881,6 +1945,7 @@ version = "2.12.0" description = "Google Cloud Storage API client library" optional = false python-versions = ">=3.7" +groups = ["main", "filesystem"] files = [ {file = "google-cloud-storage-2.12.0.tar.gz", hash = "sha256:57c0bcda2f5e11f008a155d8636d8381d5abab46b58e0cae0e46dd5e595e6b46"}, {file = "google_cloud_storage-2.12.0-py2.py3-none-any.whl", hash = "sha256:bc52563439d42981b6e21b071a76da2791672776eda3ba99d13a8061ebbd6e5e"}, @@ -1903,6 +1968,7 @@ version = "1.5.0" description = "A python wrapper of the C library 'Google CRC32C'" optional = false python-versions = ">=3.7" +groups = ["main", "filesystem"] files = [ {file = "google-crc32c-1.5.0.tar.gz", hash = "sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7"}, {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:596d1f98fc70232fcb6590c439f43b350cb762fb5d61ce7b0e9db4539654cc13"}, @@ -1983,6 +2049,7 @@ version = "2.6.0" description = "Utilities for Google Media Downloads and Resumable Uploads" optional = false python-versions = ">= 3.7" +groups = ["main", "filesystem"] files = [ {file = "google-resumable-media-2.6.0.tar.gz", hash = "sha256:972852f6c65f933e15a4a210c2b96930763b47197cdf4aa5f5bea435efb626e7"}, {file = "google_resumable_media-2.6.0-py2.py3-none-any.whl", hash = "sha256:fc03d344381970f79eebb632a3c18bb1828593a2dc5572b5f90115ef7d11e81b"}, @@ -2001,6 +2068,7 @@ version = "1.61.0" description = "Common protobufs used in Google APIs" 
optional = false python-versions = ">=3.7" +groups = ["main", "filesystem", "google_ads", "google_analytics", "google_sheets"] files = [ {file = "googleapis-common-protos-1.61.0.tar.gz", hash = "sha256:8a64866a97f6304a7179873a465d6eee97b7a24ec6cfd78e0f575e96b821240b"}, {file = "googleapis_common_protos-1.61.0-py2.py3-none-any.whl", hash = "sha256:22f1915393bb3245343f6efe87f6fe868532efc12aa26b391b15132e1279f1c0"}, @@ -2018,6 +2086,7 @@ version = "2.0.2" description = "Lightweight in-process concurrent programming" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*" +groups = ["dev", "pg_legacy_replication", "sql_database", "unstructured_data", "unstructured_data_lint"] files = [ {file = "greenlet-2.0.2-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:bdfea8c661e80d3c1c99ad7c3ff74e6e87184895bbaca6ee8cc61209f8b9b85d"}, {file = "greenlet-2.0.2-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:9d14b83fab60d5e8abe587d51c75b252bcc21683f24699ada8fb275d7712f5a9"}, @@ -2086,7 +2155,7 @@ files = [ ] [package.extras] -docs = ["Sphinx", "docutils (<0.18)"] +docs = ["Sphinx", "docutils (<0.18) ; python_version < \"3\""] test = ["objgraph", "psutil"] [[package]] @@ -2095,6 +2164,7 @@ version = "1.59.0" description = "HTTP/2-based RPC framework" optional = false python-versions = ">=3.7" +groups = ["main", "google_ads", "google_analytics"] files = [ {file = "grpcio-1.59.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:225e5fa61c35eeaebb4e7491cd2d768cd8eb6ed00f2664fa83a58f29418b39fd"}, {file = "grpcio-1.59.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:b95ec8ecc4f703f5caaa8d96e93e40c7f589bad299a2617bdb8becbcce525539"}, @@ -2161,6 +2231,7 @@ version = "1.59.0" description = "Status proto mapping for gRPC" optional = false python-versions = ">=3.6" +groups = ["main", "google_ads", "google_analytics"] files = [ {file = "grpcio-status-1.59.0.tar.gz", hash = "sha256:f93b9c33e0a26162ef8431bfcffcc3e1fb217ccd8d7b5b3061b6e9f813e698b5"}, {file = "grpcio_status-1.59.0-py3-none-any.whl", hash = "sha256:cb5a222b14a80ee050bff9676623822e953bff0c50d2d29180de723652fdf10d"}, @@ -2177,6 +2248,7 @@ version = "0.14.0" description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, @@ -2188,6 +2260,7 @@ version = "0.3.1" description = "hexbytes: Python `bytes` subclass that decodes hex, with a readable console output" optional = false python-versions = ">=3.7, <4" +groups = ["main", "dltpure"] files = [ {file = "hexbytes-0.3.1-py3-none-any.whl", hash = "sha256:383595ad75026cf00abd570f44b368c6cdac0c6becfae5c39ff88829877f8a59"}, {file = "hexbytes-0.3.1.tar.gz", hash = "sha256:a3fe35c6831ee8fafd048c4c086b986075fc14fd46258fa24ecb8d65745f9a9d"}, @@ -2205,6 +2278,7 @@ version = "0.7.0" description = "hnswlib" optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "hnswlib-0.7.0.tar.gz", hash = "sha256:bc459668e7e44bb7454b256b90c98c5af750653919d9a91698dafcf416cf64c4"}, ] @@ -2218,6 +2292,7 @@ version = "0.22.0" description = "A comprehensive HTTP client library." 
optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["google_ads", "google_analytics", "google_sheets"] files = [ {file = "httplib2-0.22.0-py3-none-any.whl", hash = "sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc"}, {file = "httplib2-0.22.0.tar.gz", hash = "sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81"}, @@ -2232,6 +2307,7 @@ version = "0.6.0" description = "A collection of framework independent HTTP protocol utils." optional = false python-versions = ">=3.5.0" +groups = ["unstructured_data"] files = [ {file = "httptools-0.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:818325afee467d483bfab1647a72054246d29f9053fd17cc4b86cda09cc60339"}, {file = "httptools-0.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72205730bf1be875003692ca54a4a7c35fac77b4746008966061d9d41a61b0f5"}, @@ -2279,6 +2355,7 @@ version = "0.17.3" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" +groups = ["unstructured_data"] files = [ {file = "huggingface_hub-0.17.3-py3-none-any.whl", hash = "sha256:545eb3665f6ac587add946e73984148f2ea5c7877eac2e845549730570c1933a"}, {file = "huggingface_hub-0.17.3.tar.gz", hash = "sha256:40439632b211311f788964602bf8b0d9d6b7a2314fba4e8d67b2ce3ecea0e3fd"}, @@ -2312,6 +2389,7 @@ version = "10.0" description = "Human friendly output for text interfaces using Python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["unstructured_data"] files = [ {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, @@ -2326,6 +2404,7 @@ version = "4.8.0" description = "Python humanize utilities" optional = false python-versions = ">=3.8" +groups = ["main", "dltpure"] files = [ {file = "humanize-4.8.0-py3-none-any.whl", hash = "sha256:8bc9e2bb9315e61ec06bf690151ae35aeb65651ab091266941edf97c90836404"}, {file = "humanize-4.8.0.tar.gz", hash = "sha256:9783373bf1eec713a770ecaa7c2d7a7902c98398009dfa3d8a2df91eec9311e8"}, @@ -2340,6 +2419,7 @@ version = "21.0.0" description = "A featureful, immutable, and correct URL for Python." 
optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["dev", "scrapy"] files = [ {file = "hyperlink-21.0.0-py2.py3-none-any.whl", hash = "sha256:e6b14c37ecb73e89c77d78cdb4c2cc8f3fb59a885c5b3f819ff4ed80f25af1b4"}, {file = "hyperlink-21.0.0.tar.gz", hash = "sha256:427af957daa58bc909471c6c40f74c5450fa123dd093fc53efd2e91d2705a56b"}, @@ -2354,6 +2434,7 @@ version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" +groups = ["main", "airtable", "asana_dlt", "dev", "dltpure", "facebook_ads", "filesystem", "google_ads", "google_analytics", "google_sheets", "salesforce", "scrapy", "stripe_analytics", "unstructured_data", "unstructured_data_lint"] files = [ {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, @@ -2365,6 +2446,8 @@ version = "6.8.0" description = "Read metadata from Python packages" optional = false python-versions = ">=3.8" +groups = ["unstructured_data"] +markers = "python_version == \"3.9\"" files = [ {file = "importlib_metadata-6.8.0-py3-none-any.whl", hash = "sha256:3ebb78df84a805d7698245025b975d9d67053cd94c79245ba4b3eb694abe68bb"}, {file = "importlib_metadata-6.8.0.tar.gz", hash = "sha256:dbace7892d8c0c4ac1ad096662232f831d4e64f4c4545bd53016a3e9d4654743"}, @@ -2376,7 +2459,7 @@ zipp = ">=0.5" [package.extras] docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] perf = ["ipython"] -testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] +testing = ["flufl.flake8", "importlib-resources (>=1.3) ; python_version < \"3.9\"", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7) ; platform_python_implementation != \"PyPy\"", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1) ; platform_python_implementation != \"PyPy\"", "pytest-perf (>=0.9.2)", "pytest-ruff"] [[package]] name = "incremental" @@ -2384,6 +2467,7 @@ version = "22.10.0" description = "\"A small library that versions your Python projects.\"" optional = false python-versions = "*" +groups = ["dev", "scrapy"] files = [ {file = "incremental-22.10.0-py2.py3-none-any.whl", hash = "sha256:b864a1f30885ee72c5ac2835a761b8fe8aa9c28b9395cacf27286602688d3e51"}, {file = "incremental-22.10.0.tar.gz", hash = "sha256:912feeb5e0f7e0188e6f42241d2f450002e11bbc0937c65865045854c24c0bd0"}, @@ -2399,6 +2483,7 @@ version = "0.5.1" description = "A port of Ruby on Rails inflector to Python" optional = false python-versions = ">=3.5" +groups = ["airtable"] files = [ {file = "inflection-0.5.1-py2.py3-none-any.whl", hash = "sha256:f38b2b640938a4f35ade69ac3d053042959b62a0f1076a5bbaa1b9526605a8a2"}, {file = "inflection-0.5.1.tar.gz", hash = "sha256:1a29730d366e996aaacffb2f1f1cb9593dc38e2ddd30c91250c6dde09ea9b417"}, @@ -2410,6 +2495,7 @@ version = "2.0.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.7" +groups = ["dev", "pytest"] files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = 
"sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, @@ -2421,6 +2507,7 @@ version = "0.6.1" description = "An ISO 8601 date/time/duration parser and formatter" optional = false python-versions = "*" +groups = ["filesystem", "salesforce"] files = [ {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, @@ -2435,6 +2522,7 @@ version = "0.8.0" description = "Common interface for data container classes" optional = false python-versions = ">=3.7" +groups = ["scrapy"] files = [ {file = "itemadapter-0.8.0-py3-none-any.whl", hash = "sha256:2ac1fbcc363b789a18639935ca322e50a65a0a7dfdd8d973c34e2c468e6c0f94"}, {file = "itemadapter-0.8.0.tar.gz", hash = "sha256:77758485fb0ac10730d4b131363e37d65cb8db2450bfec7a57c3f3271f4a48a9"}, @@ -2446,6 +2534,7 @@ version = "1.1.0" description = "Base library for scrapy's ItemLoader" optional = false python-versions = ">=3.7" +groups = ["scrapy"] files = [ {file = "itemloaders-1.1.0-py3-none-any.whl", hash = "sha256:c8c82fe0c11fc4cdd08ec04df0b3c43f3cb7190002edb517e02d55de8efc2aeb"}, {file = "itemloaders-1.1.0.tar.gz", hash = "sha256:21d81c61da6a08b48e5996288cdf3031c0f92e5d0075920a0242527523e14a48"}, @@ -2463,6 +2552,7 @@ version = "1.0.1" description = "JSON Matching Expressions" optional = false python-versions = ">=3.7" +groups = ["filesystem", "scrapy"] files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, @@ -2474,6 +2564,7 @@ version = "1.3.2" description = "Lightweight pipelining with Python functions" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, @@ -2485,6 +2576,7 @@ version = "1.6.0" description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
optional = false python-versions = "*" +groups = ["main", "dltpure"] files = [ {file = "jsonpath-ng-1.6.0.tar.gz", hash = "sha256:5483f8e9d74c39c9abfab554c070ae783c1c8cbadf5df60d561bc705ac68a07e"}, {file = "jsonpath_ng-1.6.0-py3-none-any.whl", hash = "sha256:6fd04833412c4b3d9299edf369542f5e67095ca84efa17cbb7f06a34958adc9f"}, @@ -2499,6 +2591,7 @@ version = "0.0.219" description = "Building applications with LLMs through composability" optional = false python-versions = ">=3.8.1,<4.0" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "langchain-0.0.219-py3-none-any.whl", hash = "sha256:1f08a00e622f1c75087d6013f34e82be3f8dd1859266eb583a0fd7bc045090cf"}, {file = "langchain-0.0.219.tar.gz", hash = "sha256:842f8212939e5ac4005906d2215574ffb3e34d2fe28f5bc0f46eb3b28fb29c5d"}, @@ -2519,17 +2612,17 @@ SQLAlchemy = ">=1.4,<3" tenacity = ">=8.1.0,<9.0.0" [package.extras] -all = ["O365 (>=2.0.26,<3.0.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.2.6,<0.3.0)", "arxiv (>=1.4,<2.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "awadb (>=0.3.3,<0.4.0)", "azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-vision (>=0.11.1b1,<0.12.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "beautifulsoup4 (>=4,<5)", "clarifai (==9.1.0)", "clickhouse-connect (>=0.5.14,<0.6.0)", "cohere (>=3,<4)", "deeplake (>=3.6.2,<4.0.0)", "docarray[hnswlib] (>=0.32.0,<0.33.0)", "duckduckgo-search (>=3.8.3,<4.0.0)", "elasticsearch (>=8,<9)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "google-api-python-client (==2.70.0)", "google-auth (>=2.18.1,<3.0.0)", "google-search-results (>=2,<3)", "gptcache (>=0.1.7)", "html2text (>=2020.1.16,<2021.0.0)", "huggingface_hub (>=0,<1)", "jina (>=3.14,<4.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "lancedb (>=0.1,<0.2)", "langkit (>=0.0.1.dev3,<0.1.0)", "lark (>=1.1.5,<2.0.0)", "lxml (>=4.9.2,<5.0.0)", "manifest-ml (>=0.0.1,<0.0.2)", "momento (>=1.5.0,<2.0.0)", "nebula3-python (>=3.4.0,<4.0.0)", "neo4j (>=5.8.1,<6.0.0)", "networkx (>=2.6.3,<3.0.0)", "nlpcloud (>=1,<2)", "nltk (>=3,<4)", "nomic (>=1.0.43,<2.0.0)", "octoai-sdk (>=0.1.1,<0.2.0)", "openai (>=0,<1)", "openlm (>=0.0.5,<0.0.6)", "opensearch-py (>=2.0.0,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pexpect (>=4.8.0,<5.0.0)", "pgvector (>=0.1.6,<0.2.0)", "pinecone-client (>=2,<3)", "pinecone-text (>=0.4.2,<0.5.0)", "psycopg2-binary (>=2.9.5,<3.0.0)", "pymongo (>=4.3.3,<5.0.0)", "pyowm (>=3.3.0,<4.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pytesseract (>=0.3.10,<0.4.0)", "pyvespa (>=0.33.0,<0.34.0)", "qdrant-client (>=1.1.2,<2.0.0)", "redis (>=4,<5)", "requests-toolbelt (>=1.0.0,<2.0.0)", "sentence-transformers (>=2,<3)", "singlestoredb (>=0.7.1,<0.8.0)", "spacy (>=3,<4)", "steamship (>=2.16.9,<3.0.0)", "tensorflow-text (>=2.11.0,<3.0.0)", "tigrisdb (>=1.0.0b6,<2.0.0)", "tiktoken (>=0.3.2,<0.4.0)", "torch (>=1,<3)", "transformers (>=4,<5)", "weaviate-client (>=3,<4)", "wikipedia (>=1,<2)", "wolframalpha (==5.0.0)"] +all = ["O365 (>=2.0.26,<3.0.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.2.6,<0.3.0)", "arxiv (>=1.4,<2.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "awadb (>=0.3.3,<0.4.0)", "azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-vision (>=0.11.1b1,<0.12.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "beautifulsoup4 (>=4,<5)", "clarifai (==9.1.0)", "clickhouse-connect (>=0.5.14,<0.6.0)", "cohere (>=3,<4)", 
"deeplake (>=3.6.2,<4.0.0)", "docarray[hnswlib] (>=0.32.0,<0.33.0)", "duckduckgo-search (>=3.8.3,<4.0.0)", "elasticsearch (>=8,<9)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "google-api-python-client (==2.70.0)", "google-auth (>=2.18.1,<3.0.0)", "google-search-results (>=2,<3)", "gptcache (>=0.1.7)", "html2text (>=2020.1.16,<2021.0.0)", "huggingface_hub (>=0,<1)", "jina (>=3.14,<4.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "lancedb (>=0.1,<0.2)", "langkit (>=0.0.1.dev3,<0.1.0)", "lark (>=1.1.5,<2.0.0)", "lxml (>=4.9.2,<5.0.0)", "manifest-ml (>=0.0.1,<0.0.2)", "momento (>=1.5.0,<2.0.0)", "nebula3-python (>=3.4.0,<4.0.0)", "neo4j (>=5.8.1,<6.0.0)", "networkx (>=2.6.3,<3.0.0)", "nlpcloud (>=1,<2)", "nltk (>=3,<4)", "nomic (>=1.0.43,<2.0.0)", "octoai-sdk (>=0.1.1,<0.2.0)", "openai (>=0,<1)", "openlm (>=0.0.5,<0.0.6)", "opensearch-py (>=2.0.0,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pexpect (>=4.8.0,<5.0.0)", "pgvector (>=0.1.6,<0.2.0)", "pinecone-client (>=2,<3)", "pinecone-text (>=0.4.2,<0.5.0)", "psycopg2-binary (>=2.9.5,<3.0.0)", "pymongo (>=4.3.3,<5.0.0)", "pyowm (>=3.3.0,<4.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pytesseract (>=0.3.10,<0.4.0)", "pyvespa (>=0.33.0,<0.34.0)", "qdrant-client (>=1.1.2,<2.0.0) ; python_full_version >= \"3.8.1\" and python_version < \"3.12\"", "redis (>=4,<5)", "requests-toolbelt (>=1.0.0,<2.0.0)", "sentence-transformers (>=2,<3)", "singlestoredb (>=0.7.1,<0.8.0)", "spacy (>=3,<4)", "steamship (>=2.16.9,<3.0.0)", "tensorflow-text (>=2.11.0,<3.0.0) ; python_version >= \"3.10\" and python_version < \"3.12\"", "tigrisdb (>=1.0.0b6,<2.0.0)", "tiktoken (>=0.3.2,<0.4.0) ; python_version >= \"3.9\" and python_version < \"4.0\"", "torch (>=1,<3)", "transformers (>=4,<5)", "weaviate-client (>=3,<4)", "wikipedia (>=1,<2)", "wolframalpha (==5.0.0)"] azure = ["azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-vision (>=0.11.1b1,<0.12.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-core (>=1.26.4,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "azure-search-documents (==11.4.0a20230509004)", "openai (>=0,<1)"] clarifai = ["clarifai (==9.1.0)"] cohere = ["cohere (>=3,<4)"] docarray = ["docarray[hnswlib] (>=0.32.0,<0.33.0)"] embeddings = ["sentence-transformers (>=2,<3)"] -extended-testing = ["atlassian-python-api (>=3.36.0,<4.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "chardet (>=5.1.0,<6.0.0)", "esprima (>=4.0.1,<5.0.0)", "gql (>=3.4.1,<4.0.0)", "html2text (>=2020.1.16,<2021.0.0)", "jq (>=1.4.1,<2.0.0)", "lxml (>=4.9.2,<5.0.0)", "openai (>=0,<1)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "streamlit (>=1.18.0,<2.0.0)", "telethon (>=1.28.5,<2.0.0)", "tqdm (>=4.48.0)", "zep-python (>=0.31)"] +extended-testing = ["atlassian-python-api (>=3.36.0,<4.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "chardet (>=5.1.0,<6.0.0)", "esprima (>=4.0.1,<5.0.0)", "gql (>=3.4.1,<4.0.0)", "html2text (>=2020.1.16,<2021.0.0)", "jq (>=1.4.1,<2.0.0)", "lxml (>=4.9.2,<5.0.0)", "openai (>=0,<1)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf 
(>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "streamlit (>=1.18.0,<2.0.0) ; python_full_version >= \"3.8.1\" and python_full_version != \"3.9.7\" and python_version < \"4.0\"", "telethon (>=1.28.5,<2.0.0)", "tqdm (>=4.48.0)", "zep-python (>=0.31)"] javascript = ["esprima (>=4.0.1,<5.0.0)"] llms = ["anthropic (>=0.2.6,<0.3.0)", "clarifai (==9.1.0)", "cohere (>=3,<4)", "huggingface_hub (>=0,<1)", "manifest-ml (>=0.0.1,<0.0.2)", "nlpcloud (>=1,<2)", "openai (>=0,<1)", "openllm (>=0.1.6)", "openlm (>=0.0.5,<0.0.6)", "torch (>=1,<3)", "transformers (>=4,<5)"] -openai = ["openai (>=0,<1)", "tiktoken (>=0.3.2,<0.4.0)"] -qdrant = ["qdrant-client (>=1.1.2,<2.0.0)"] +openai = ["openai (>=0,<1)", "tiktoken (>=0.3.2,<0.4.0) ; python_version >= \"3.9\" and python_version < \"4.0\""] +qdrant = ["qdrant-client (>=1.1.2,<2.0.0) ; python_full_version >= \"3.8.1\" and python_version < \"3.12\""] text-helpers = ["chardet (>=5.1.0,<6.0.0)"] [[package]] @@ -2538,6 +2631,7 @@ version = "0.0.20" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." optional = false python-versions = ">=3.8.1,<4.0" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "langchainplus_sdk-0.0.20-py3-none-any.whl", hash = "sha256:07a869d476755803aa04c4986ce78d00c2fe4ff584c0eaa57d7570c9664188db"}, {file = "langchainplus_sdk-0.0.20.tar.gz", hash = "sha256:3d300e2e3290f68cc9d842c059f9458deba60e776c9e790309688cad1bfbb219"}, @@ -2554,6 +2648,7 @@ version = "4.9.3" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" +groups = ["salesforce", "scrapy", "unstructured_data"] files = [ {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, @@ -2661,6 +2756,7 @@ version = "4.3.2" description = "LZ4 Bindings for Python" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "lz4-4.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1c4c100d99eed7c08d4e8852dd11e7d1ec47a3340f49e3a96f8dfbba17ffb300"}, {file = "lz4-4.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:edd8987d8415b5dad25e797043936d91535017237f72fa456601be1479386c92"}, @@ -2710,6 +2806,7 @@ version = "1.15.1" description = "Small library to dynamically create python functions." optional = false python-versions = "*" +groups = ["main", "dltpure"] files = [ {file = "makefun-1.15.1-py2.py3-none-any.whl", hash = "sha256:a63cfc7b47a539c76d97bd4fdb833c7d0461e759fd1225f580cb4be6200294d4"}, {file = "makefun-1.15.1.tar.gz", hash = "sha256:40b0f118b6ded0d8d78c78f1eb679b8b6b2462e3c1b3e05fb1b2da8cd46b48a5"}, @@ -2721,6 +2818,7 @@ version = "3.5" description = "Python implementation of John Gruber's Markdown." 
optional = false python-versions = ">=3.8" +groups = ["unstructured_data"] files = [ {file = "Markdown-3.5-py3-none-any.whl", hash = "sha256:4afb124395ce5fc34e6d9886dab977fd9ae987fc6e85689f08278cf0c69d4bf3"}, {file = "Markdown-3.5.tar.gz", hash = "sha256:a807eb2e4778d9156c8f07876c6e4d50b5494c5665c4834f67b06459dfd877b3"}, @@ -2739,6 +2837,7 @@ version = "3.0.0" description = "Python port of markdown-it. Markdown parsing, done right!" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "dltpure"] files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, @@ -2763,6 +2862,7 @@ version = "3.20.1" description = "A lightweight library for converting complex datatypes to and from native Python datatypes." optional = false python-versions = ">=3.8" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "marshmallow-3.20.1-py3-none-any.whl", hash = "sha256:684939db93e80ad3561392f47be0230743131560a41c5110684c16e21ade0a5c"}, {file = "marshmallow-3.20.1.tar.gz", hash = "sha256:5d2371bbe42000f2b3fb5eaa065224df7d8f8597bc19a1bbfa5bfe7fba8da889"}, @@ -2783,6 +2883,7 @@ version = "1.5.1" description = "Enum field for Marshmallow" optional = false python-versions = "*" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "marshmallow-enum-1.5.1.tar.gz", hash = "sha256:38e697e11f45a8e64b4a1e664000897c659b60aa57bfa18d44e226a9920b6e58"}, {file = "marshmallow_enum-1.5.1-py2.py3-none-any.whl", hash = "sha256:57161ab3dbfde4f57adeb12090f39592e992b9c86d206d02f6bd03ebec60f072"}, @@ -2797,6 +2898,7 @@ version = "0.7.0" description = "McCabe checker, plugin for flake8" optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, @@ -2808,6 +2910,7 @@ version = "0.1.2" description = "Markdown URL utilities" optional = false python-versions = ">=3.7" +groups = ["main", "dev", "dltpure"] files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, @@ -2819,6 +2922,7 @@ version = "7.1.0" description = "Mimesis: Fake Data Generator." 
optional = false python-versions = ">=3.8,<4.0" +groups = ["dev"] files = [ {file = "mimesis-7.1.0-py3-none-any.whl", hash = "sha256:da65bea6d6d5d5d87d5c008e6b23ef5f96a49cce436d9f8708dabb5152da0290"}, {file = "mimesis-7.1.0.tar.gz", hash = "sha256:c83b55d35536d7e9b9700a596b7ccfb639a740e3e1fb5e08062e8ab2a67dcb37"}, @@ -2830,6 +2934,7 @@ version = "1.6" description = "An implementation of time.monotonic() for Python 2 & < 3.3" optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "monotonic-1.6-py2.py3-none-any.whl", hash = "sha256:68687e19a14f11f26d140dd5c86f3dba4bf5df58003000ed467e0e2a69bca96c"}, {file = "monotonic-1.6.tar.gz", hash = "sha256:3a55207bcfed53ddd5c5bae174524062935efed17792e9de2ad0205ce9ad63f7"}, @@ -2841,6 +2946,7 @@ version = "10.1.0" description = "More routines for operating on iterables, beyond itertools" optional = false python-versions = ">=3.8" +groups = ["salesforce"] files = [ {file = "more-itertools-10.1.0.tar.gz", hash = "sha256:626c369fa0eb37bac0291bce8259b332fd59ac792fa5497b59837309cd5b114a"}, {file = "more_itertools-10.1.0-py3-none-any.whl", hash = "sha256:64e0735fcfdc6f3464ea133afe8ea4483b1c5fe3a3d69852e6503b43a0b222e6"}, @@ -2852,6 +2958,7 @@ version = "1.3.0" description = "Python library for arbitrary-precision floating-point arithmetic" optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, @@ -2860,7 +2967,7 @@ files = [ [package.extras] develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] +gmpy = ["gmpy2 (>=2.1.0a4) ; platform_python_implementation != \"PyPy\""] tests = ["pytest (>=4.6)"] [[package]] @@ -2869,6 +2976,7 @@ version = "1.24.1" description = "The Microsoft Authentication Library (MSAL) for Python library" optional = false python-versions = ">=2.7" +groups = ["filesystem"] files = [ {file = "msal-1.24.1-py2.py3-none-any.whl", hash = "sha256:ce4320688f95c301ee74a4d0e9dbcfe029a63663a8cc61756f40d0d0d36574ad"}, {file = "msal-1.24.1.tar.gz", hash = "sha256:aa0972884b3c6fdec53d9a0bd15c12e5bd7b71ac1b66d746f54d128709f3f8f8"}, @@ -2880,7 +2988,7 @@ PyJWT = {version = ">=1.0.0,<3", extras = ["crypto"]} requests = ">=2.0.0,<3" [package.extras] -broker = ["pymsalruntime (>=0.13.2,<0.14)"] +broker = ["pymsalruntime (>=0.13.2,<0.14) ; python_version >= \"3.6\" and platform_system == \"Windows\""] [[package]] name = "msal-extensions" @@ -2888,6 +2996,7 @@ version = "1.0.0" description = "Microsoft Authentication Library extensions (MSAL EX) provides a persistence API that can save your data on disk, encrypted on Windows, macOS and Linux. Concurrent data access will be coordinated by a file lock mechanism." optional = false python-versions = "*" +groups = ["filesystem"] files = [ {file = "msal-extensions-1.0.0.tar.gz", hash = "sha256:c676aba56b0cce3783de1b5c5ecfe828db998167875126ca4b47dc6436451354"}, {file = "msal_extensions-1.0.0-py2.py3-none-any.whl", hash = "sha256:91e3db9620b822d0ed2b4d1850056a0f133cba04455e62f11612e40f5502f2ee"}, @@ -2906,6 +3015,7 @@ version = "1.2.0" description = "This module enables reading, parsing and converting Microsoft Outlook MSG E-Mail files." 
optional = false python-versions = ">=3.4" +groups = ["unstructured_data"] files = [ {file = "msg_parser-1.2.0-py2.py3-none-any.whl", hash = "sha256:d47a2f0b2a359cb189fad83cc991b63ea781ecc70d91410324273fbf93e95375"}, {file = "msg_parser-1.2.0.tar.gz", hash = "sha256:0de858d4fcebb6c8f6f028da83a17a20fe01cdce67c490779cf43b3b0162aa66"}, @@ -2923,6 +3033,7 @@ version = "6.0.4" description = "multidict implementation" optional = false python-versions = ">=3.7" +groups = ["main", "facebook_ads", "filesystem", "unstructured_data", "unstructured_data_lint"] files = [ {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, @@ -3006,6 +3117,7 @@ version = "1.10.0" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"}, {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"}, @@ -3053,6 +3165,7 @@ version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." optional = false python-versions = ">=3.5" +groups = ["dev", "unstructured_data", "unstructured_data_lint"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -3064,6 +3177,7 @@ version = "3.6.0" description = "Generate mypy stub files from protobuf specs" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "mypy-protobuf-3.6.0.tar.gz", hash = "sha256:02f242eb3409f66889f2b1a3aa58356ec4d909cdd0f93115622e9e70366eca3c"}, {file = "mypy_protobuf-3.6.0-py3-none-any.whl", hash = "sha256:56176e4d569070e7350ea620262478b49b7efceba4103d468448f1d21492fd6c"}, @@ -3079,6 +3193,7 @@ version = "8.4.0" description = "Simple yet flexible natural sorting in Python." 
optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "natsort-8.4.0-py3-none-any.whl", hash = "sha256:4732914fb471f56b5cce04d7bae6f164a592c7712e1c85f9ef585e197299521c"}, {file = "natsort-8.4.0.tar.gz", hash = "sha256:45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581"}, @@ -3094,6 +3209,7 @@ version = "3.8.1" description = "Natural Language Toolkit" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"}, {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"}, @@ -3119,6 +3235,7 @@ version = "2.8.6" description = "Fast numerical expression evaluator for NumPy" optional = false python-versions = ">=3.7" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "numexpr-2.8.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:80acbfefb68bd92e708e09f0a02b29e04d388b9ae72f9fcd57988aca172a7833"}, {file = "numexpr-2.8.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6e884687da8af5955dc9beb6a12d469675c90b8fb38b6c93668c989cfc2cd982"}, @@ -3161,6 +3278,7 @@ version = "1.24.4" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "mongodb", "stripe_analytics", "unstructured_data", "unstructured_data_lint"] files = [ {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, @@ -3198,6 +3316,7 @@ version = "3.2.2" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" optional = false python-versions = ">=3.6" +groups = ["main", "asana_dlt", "filesystem", "google_ads", "google_analytics"] files = [ {file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"}, {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, @@ -3214,6 +3333,7 @@ version = "0.46" description = "Python package to parse, read and write Microsoft OLE2 files (Structured Storage or Compound Document, Microsoft Office)" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["unstructured_data"] files = [ {file = "olefile-0.46.zip", hash = "sha256:133b031eaf8fd2c9399b78b8bc5b8fcbe4c31e85295749bb17a87cba8f3c3964"}, ] @@ -3224,6 +3344,7 @@ version = "1.16.1" description = "ONNX Runtime is a runtime accelerator for Machine Learning models" optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "onnxruntime-1.16.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:28b2c7f444b4119950b69370801cd66067f403d19cbaf2a444735d7c269cce4a"}, {file = "onnxruntime-1.16.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c24e04f33e7899f6aebb03ed51e51d346c1f906b05c5569d58ac9a12d38a2f58"}, @@ -3265,6 +3386,7 @@ version = "0.27.10" description = "Python client library for the OpenAI API" optional = false python-versions = ">=3.7.1" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "openai-0.27.10-py3-none-any.whl", hash = "sha256:beabd1757e3286fa166dde3b70ebb5ad8081af046876b47c14c41e203ed22a14"}, {file = "openai-0.27.10.tar.gz", 
hash = "sha256:60e09edf7100080283688748c6803b7b3b52d5a55d21890f3815292a0552d83b"}, @@ -3287,6 +3409,7 @@ version = "1.2.4" description = "OpenAPI (v3) specification schema as pydantic class" optional = false python-versions = ">=3.6.1" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "openapi-schema-pydantic-1.2.4.tar.gz", hash = "sha256:3e22cf58b74a69f752cc7e5f1537f6e44164282db2700cbbcd3bb99ddd065196"}, {file = "openapi_schema_pydantic-1.2.4-py3-none-any.whl", hash = "sha256:a932ecc5dcbb308950282088956e94dea069c9823c84e507d64f6b622222098c"}, @@ -3301,6 +3424,7 @@ version = "3.1.2" description = "A Python library to read/write Excel 2010 xlsx/xlsm files" optional = false python-versions = ">=3.6" +groups = ["unstructured_data"] files = [ {file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"}, {file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"}, @@ -3315,6 +3439,8 @@ version = "3.9.9" description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" optional = false python-versions = ">=3.8" +groups = ["main", "dltpure"] +markers = "platform_python_implementation != \"PyPy\"" files = [ {file = "orjson-3.9.9-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:f28090060a31f4d11221f9ba48b2273b0d04b702f4dcaa197c38c64ce639cc51"}, {file = "orjson-3.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8038ba245d0c0a6337cfb6747ea0c51fe18b0cf1a4bc943d530fd66799fae33d"}, @@ -3374,6 +3500,7 @@ version = "7.4.0" description = "A decorator to automatically detect mismatch when overriding a method." optional = false python-versions = ">=3.6" +groups = ["unstructured_data"] files = [ {file = "overrides-7.4.0-py3-none-any.whl", hash = "sha256:3ad24583f86d6d7a49049695efe9933e67ba62f0c7625d53c59fa832ce4b8b7d"}, {file = "overrides-7.4.0.tar.gz", hash = "sha256:9502a3cca51f4fac40b5feca985b6703a5c1f6ad815588a7ca9e285b9dca6757"}, @@ -3385,6 +3512,7 @@ version = "23.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.7" +groups = ["main", "dev", "dltpure", "mongodb", "pytest", "scrapy", "unstructured_data", "unstructured_data_lint"] files = [ {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, @@ -3396,6 +3524,7 @@ version = "2.0.3" description = "Powerful data structures for data analysis, time series, and statistics" optional = false python-versions = ">=3.8" +groups = ["main", "mongodb", "stripe_analytics", "unstructured_data"] files = [ {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, @@ -3426,9 +3555,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.20.3", markers = "python_version < \"3.10\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= 
\"3.10\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -3463,6 +3592,7 @@ version = "2.0.2.230605" description = "Type annotations for pandas" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pandas_stubs-2.0.2.230605-py3-none-any.whl", hash = "sha256:39106b602f3cb6dc5f728b84e1b32bde6ecf41ee34ee714c66228009609fbada"}, {file = "pandas_stubs-2.0.2.230605.tar.gz", hash = "sha256:624c7bb06d38145a44b61be459ccd19b038e0bf20364a025ecaab78fea65e858"}, @@ -3478,6 +3608,7 @@ version = "1.8.1" description = "Parsel is a library to extract data from HTML and XML using XPath and CSS selectors" optional = false python-versions = ">=3.7" +groups = ["scrapy"] files = [ {file = "parsel-1.8.1-py2.py3-none-any.whl", hash = "sha256:2708fc74daeeb4ce471e2c2e9089b650ec940c7a218053e57421e69b5b00f82c"}, {file = "parsel-1.8.1.tar.gz", hash = "sha256:aff28e68c9b3f1a901db2a4e3f158d8480a38724d7328ee751c1a4e1c1801e39"}, @@ -3496,6 +3627,7 @@ version = "0.11.2" description = "Utility library for gitignore style pattern matching of file paths." optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "pathspec-0.11.2-py3-none-any.whl", hash = "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20"}, {file = "pathspec-0.11.2.tar.gz", hash = "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"}, @@ -3507,6 +3639,7 @@ version = "3.2.0" description = "pathvalidate is a Python library to sanitize/validate a string such as filenames/file-paths/etc." optional = false python-versions = ">=3.7" +groups = ["main", "dltpure"] files = [ {file = "pathvalidate-3.2.0-py3-none-any.whl", hash = "sha256:cc593caa6299b22b37f228148257997e2fa850eea2daf7e4cc9205cef6908dee"}, {file = "pathvalidate-3.2.0.tar.gz", hash = "sha256:5e8378cf6712bff67fbe7a8307d99fa8c1a0cb28aa477056f8fc374f0dff24ad"}, @@ -3514,7 +3647,7 @@ files = [ [package.extras] docs = ["Sphinx (>=2.4)", "sphinx-rtd-theme (>=1.2.2)", "urllib3 (<2)"] -test = ["Faker (>=1.0.8)", "allpairspy (>=2)", "click (>=6.2)", "pytest (>=6.0.1)", "pytest-discord (>=0.1.4)", "pytest-md-report (>=0.4.1)"] +test = ["Faker (>=1.0.8)", "allpairspy (>=2)", "click (>=6.2)", "pytest (>=6.0.1)", "pytest-discord (>=0.1.4) ; python_version >= \"3.7\"", "pytest-md-report (>=0.4.1)"] [[package]] name = "pbr" @@ -3522,6 +3655,7 @@ version = "5.11.1" description = "Python Build Reasonableness" optional = false python-versions = ">=2.6" +groups = ["dev"] files = [ {file = "pbr-5.11.1-py2.py3-none-any.whl", hash = "sha256:567f09558bae2b3ab53cb3c1e2e33e726ff3338e7bae3db5dc954b3a44eef12b"}, {file = "pbr-5.11.1.tar.gz", hash = "sha256:aefc51675b0b533d56bb5fd1c8c6c0522fe31896679882e1c4c63d5e4a0fccb3"}, @@ -3533,6 +3667,7 @@ version = "1.16.3" description = "A wrapper around the pdftoppm and pdftocairo command line tools to convert PDF to a PIL Image list." 
optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "pdf2image-1.16.3-py3-none-any.whl", hash = "sha256:b6154164af3677211c22cbb38b2bd778b43aca02758e962fe1e231f6d3b0e380"}, {file = "pdf2image-1.16.3.tar.gz", hash = "sha256:74208810c2cef4d9e347769b8e62a52303982ddb4f2dfd744c7ab4b940ae287e"}, @@ -3547,6 +3682,7 @@ version = "20221105" description = "PDF parser and analyzer" optional = false python-versions = ">=3.6" +groups = ["unstructured_data"] files = [ {file = "pdfminer.six-20221105-py3-none-any.whl", hash = "sha256:1eaddd712d5b2732f8ac8486824533514f8ba12a0787b3d5fe1e686cd826532d"}, {file = "pdfminer.six-20221105.tar.gz", hash = "sha256:8448ab7b939d18b64820478ecac5394f482d7a79f5f7eaa7703c6c959c175e1d"}, @@ -3567,6 +3703,7 @@ version = "3.0.0" description = "Python datetimes made easy" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "dltpure", "salesforce"] files = [ {file = "pendulum-3.0.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2cf9e53ef11668e07f73190c805dbdf07a1939c3298b78d5a9203a86775d1bfd"}, {file = "pendulum-3.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fb551b9b5e6059377889d2d878d940fd0bbb80ae4810543db18e6f77b02c5ef6"}, @@ -3658,7 +3795,7 @@ python-dateutil = ">=2.6" tzdata = ">=2020.1" [package.extras] -test = ["time-machine (>=2.6.0)"] +test = ["time-machine (>=2.6.0) ; implementation_name != \"pypy\""] [[package]] name = "pillow" @@ -3666,6 +3803,7 @@ version = "9.5.0" description = "Python Imaging Library (Fork)" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "Pillow-9.5.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:ace6ca218308447b9077c14ea4ef381ba0b67ee78d64046b3f19cf4e1139ad16"}, {file = "Pillow-9.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3d403753c9d5adc04d4694d35cf0391f0f3d57c8e0030aac09d7678fa8030aa"}, @@ -3745,6 +3883,7 @@ version = "3.11.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
optional = false python-versions = ">=3.7" +groups = ["dev", "salesforce"] files = [ {file = "platformdirs-3.11.0-py3-none-any.whl", hash = "sha256:e9d171d00af68be50e9202731309c4e658fd8bc76f55c11c7dd760d023bda68e"}, {file = "platformdirs-3.11.0.tar.gz", hash = "sha256:cf8ee52a3afdb965072dcc652433e0c7e3e40cf5ea1477cd4b3b1d2eb75495b3"}, @@ -3760,6 +3899,7 @@ version = "1.3.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "dltpure", "pytest"] files = [ {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, @@ -3775,6 +3915,7 @@ version = "3.11" description = "Python Lex & Yacc" optional = false python-versions = "*" +groups = ["main", "dltpure"] files = [ {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, @@ -3786,6 +3927,7 @@ version = "2.8.2" description = "Wraps the portalocker recipe for easy usage" optional = false python-versions = ">=3.8" +groups = ["filesystem"] files = [ {file = "portalocker-2.8.2-py3-none-any.whl", hash = "sha256:cfb86acc09b9aa7c3b43594e19be1345b9d16af3feb08bf92f23d4dce513a28e"}, {file = "portalocker-2.8.2.tar.gz", hash = "sha256:2b035aa7828e46c58e9b31390ee1f169b98e1066ab10b9a6a861fe7e25ee4f33"}, @@ -3805,6 +3947,7 @@ version = "3.0.2" description = "Integrate PostHog into any python application." optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "posthog-3.0.2-py2.py3-none-any.whl", hash = "sha256:a8c0af6f2401fbe50f90e68c4143d0824b54e872de036b1c2f23b5abb39d88ce"}, {file = "posthog-3.0.2.tar.gz", hash = "sha256:701fba6e446a4de687c6e861b587e7b7741955ad624bf34fe013c06a0fec6fb3"}, @@ -3828,6 +3971,7 @@ version = "0.3.0" description = "Pure-Python robots.txt parser with support for modern conventions" optional = false python-versions = ">=3.7" +groups = ["scrapy"] files = [ {file = "Protego-0.3.0-py2.py3-none-any.whl", hash = "sha256:db38f6a945839d8162a4034031a21490469566a2726afb51d668497c457fb0aa"}, {file = "Protego-0.3.0.tar.gz", hash = "sha256:04228bffde4c6bcba31cf6529ba2cfd6e1b70808fdc1d2cb4301be6b28d6c568"}, @@ -3839,6 +3983,7 @@ version = "1.22.3" description = "Beautiful, Pythonic protocol buffers." 
optional = false python-versions = ">=3.6" +groups = ["google_ads", "google_analytics"] files = [ {file = "proto-plus-1.22.3.tar.gz", hash = "sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b"}, {file = "proto_plus-1.22.3-py3-none-any.whl", hash = "sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df"}, @@ -3856,6 +4001,7 @@ version = "4.25.5" description = "" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "filesystem", "google_ads", "google_analytics", "google_sheets", "pg_legacy_replication", "unstructured_data"] files = [ {file = "protobuf-4.25.5-cp310-abi3-win32.whl", hash = "sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8"}, {file = "protobuf-4.25.5-cp310-abi3-win_amd64.whl", hash = "sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea"}, @@ -3876,6 +4022,7 @@ version = "2.9.9" description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false python-versions = ">=3.7" +groups = ["main", "pg_legacy_replication", "pg_replication"] files = [ {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"}, {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"}, @@ -3957,6 +4104,8 @@ version = "2.9.0" description = ".. image:: https://travis-ci.org/chtd/psycopg2cffi.svg?branch=master" optional = false python-versions = "*" +groups = ["main"] +markers = "platform_python_implementation == \"PyPy\"" files = [ {file = "psycopg2cffi-2.9.0.tar.gz", hash = "sha256:7e272edcd837de3a1d12b62185eb85c45a19feda9e62fa1b120c54f9e8d35c52"}, ] @@ -3971,6 +4120,7 @@ version = "3.3.0" description = "Apache Pulsar Python client library" optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "pulsar_client-3.3.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:c31afd3e67a044ff93177df89e08febf214cc965e95ede097d9fe8755af00e01"}, {file = "pulsar_client-3.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f66982284571674b215324cc26b5c2f7c56c7043113c47a7084cb70d67a8afb"}, @@ -4018,6 +4168,7 @@ version = "1.11.0" description = "library with cross-python path, ini-parsing, io, code, log facilities" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["dev", "pytest"] files = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, @@ -4029,6 +4180,7 @@ version = "2.1.0.post1" description = "Python Client for the Airtable API" optional = false python-versions = "*" +groups = ["airtable"] files = [ {file = "pyairtable-2.1.0.post1-py2.py3-none-any.whl", hash = "sha256:a80eb85f7c020bf41679bb00ca57da11aeaa43769afbc73619276798a2ca182e"}, {file = "pyairtable-2.1.0.post1.tar.gz", hash = "sha256:e588249e68cf338dcdca9908537ed16d5a22ae72345ec930022b230ba96e5f84"}, @@ -4047,6 +4199,7 @@ version = "16.0.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.8" +groups = ["main", "mongodb"] files = [ {file = "pyarrow-16.0.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:22a1fdb1254e5095d629e29cd1ea98ed04b4bbfd8e42cc670a6b639ccc208b60"}, {file = "pyarrow-16.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:574a00260a4ed9d118a14770edbd440b848fcae5a3024128be9d0274dbcaf858"}, @@ -4095,6 +4248,7 @@ version = "0.5.0" description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main", "filesystem", "google_ads", "google_analytics", "google_sheets", "scrapy"] files = [ {file = "pyasn1-0.5.0-py2.py3-none-any.whl", hash = "sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57"}, {file = "pyasn1-0.5.0.tar.gz", hash = "sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde"}, @@ -4106,6 +4260,7 @@ version = "0.3.0" description = "A collection of ASN.1-based protocols modules" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +groups = ["main", "filesystem", "google_ads", "google_analytics", "google_sheets", "scrapy"] files = [ {file = "pyasn1_modules-0.3.0-py2.py3-none-any.whl", hash = "sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d"}, {file = "pyasn1_modules-0.3.0.tar.gz", hash = "sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c"}, @@ -4120,6 +4275,7 @@ version = "2.11.1" description = "Python style guide checker" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pycodestyle-2.11.1-py2.py3-none-any.whl", hash = "sha256:44fe31000b2d866f2e41841b18528a505fbd7fef9017b04eff4e2648a0fadc67"}, {file = "pycodestyle-2.11.1.tar.gz", hash = "sha256:41ba0e7afc9752dfb53ced5489e89f8186be00e599e712660695b7a75ff2663f"}, @@ -4131,6 +4287,7 @@ version = "22.3.5" description = "ISO country, subdivision, language, currency and script definitions and their translations" optional = false python-versions = ">=3.6, <4" +groups = ["facebook_ads"] files = [ {file = "pycountry-22.3.5.tar.gz", hash = "sha256:b2163a246c585894d808f18783e19137cb70a0c18fb36748dc01fc6f109c1646"}, ] @@ -4144,10 +4301,12 @@ version = "2.21" description = "C parser in Python" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main", "filesystem", "salesforce", "scrapy", "unstructured_data"] files = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] +markers = {main = "platform_python_implementation == \"PyPy\""} [[package]] name = "pydantic" @@ -4155,6 +4314,7 @@ version = "1.10.13" description = "Data validation and settings management using python type hints" optional = false python-versions = ">=3.7" +groups = ["airtable", "unstructured_data", "unstructured_data_lint"] files = [ {file = "pydantic-1.10.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:efff03cc7a4f29d9009d1c96ceb1e7a70a65cfe86e89d34e4a5f2ab1e5693737"}, {file = "pydantic-1.10.13-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3ecea2b9d80e5333303eeb77e180b90e95eea8f765d08c3d278cd56b00345d01"}, @@ -4207,6 +4367,8 @@ version = "2.0.7" description = "Multi-producer multi-consumer in-memory signal dispatch system" optional = false python-versions = "*" +groups = ["scrapy"] +markers = "platform_python_implementation == \"CPython\"" files = [ {file = "PyDispatcher-2.0.7-py3-none-any.whl", hash = "sha256:96543bea04115ffde08f851e1d45cacbfd1ee866ac42127d9b476dc5aefa7de0"}, {file = "PyDispatcher-2.0.7.tar.gz", hash = 
"sha256:b777c6ad080dc1bad74a4c29d6a46914fa6701ac70f94b0d66fbcfde62f5be31"}, @@ -4221,6 +4383,7 @@ version = "3.1.0" description = "passive checker of Python programs" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pyflakes-3.1.0-py2.py3-none-any.whl", hash = "sha256:4132f6d49cb4dae6819e5379898f2b8cce3c5f23994194c24b77d5da2e36f774"}, {file = "pyflakes-3.1.0.tar.gz", hash = "sha256:a0aae034c444db0071aa077972ba4768d40c830d9539fd45bf4cd3f8f6992efc"}, @@ -4232,13 +4395,14 @@ version = "2.16.1" description = "Pygments is a syntax highlighting package written in Python." optional = false python-versions = ">=3.7" +groups = ["main", "dev", "dltpure"] files = [ {file = "Pygments-2.16.1-py3-none-any.whl", hash = "sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692"}, {file = "Pygments-2.16.1.tar.gz", hash = "sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29"}, ] [package.extras] -plugins = ["importlib-metadata"] +plugins = ["importlib-metadata ; python_version < \"3.8\""] [[package]] name = "pyjwt" @@ -4246,6 +4410,7 @@ version = "2.8.0" description = "JSON Web Token implementation in Python" optional = false python-versions = ">=3.7" +groups = ["filesystem", "salesforce"] files = [ {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"}, {file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"}, @@ -4266,6 +4431,7 @@ version = "4.5.0" description = "Python driver for MongoDB " optional = false python-versions = ">=3.7" +groups = ["mongodb"] files = [ {file = "pymongo-4.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:2d4fa1b01fa7e5b7bb8d312e3542e211b320eb7a4e3d8dc884327039d93cb9e0"}, {file = "pymongo-4.5.0-cp310-cp310-manylinux1_i686.whl", hash = "sha256:dfcd2b9f510411de615ccedd47462dae80e82fdc09fe9ab0f0f32f11cf57eeb5"}, @@ -4356,9 +4522,9 @@ dnspython = ">=1.16.0,<3.0.0" [package.extras] aws = ["pymongo-auth-aws (<2.0.0)"] -encryption = ["certifi", "pymongo[aws]", "pymongocrypt (>=1.6.0,<2.0.0)"] -gssapi = ["pykerberos", "winkerberos (>=0.5.0)"] -ocsp = ["certifi", "cryptography (>=2.5)", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] +encryption = ["certifi ; os_name == \"nt\" or sys_platform == \"darwin\"", "pymongo[aws]", "pymongocrypt (>=1.6.0,<2.0.0)"] +gssapi = ["pykerberos ; os_name != \"nt\"", "winkerberos (>=0.5.0) ; os_name == \"nt\""] +ocsp = ["certifi ; os_name == \"nt\" or sys_platform == \"darwin\"", "cryptography (>=2.5)", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] snappy = ["python-snappy"] zstd = ["zstandard"] @@ -4368,6 +4534,7 @@ version = "1.4.0" description = "\"Tools for using NumPy, Pandas, Polars, and PyArrow with MongoDB\"" optional = false python-versions = ">=3.8" +groups = ["mongodb"] files = [ {file = "pymongoarrow-1.4.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:57a438dad3808c10931ffadd6028c8107133d254229996f8260e7c61417d98fe"}, {file = "pymongoarrow-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:de545ecfc591288c75b602b4baeb6cd9f7db9ff6890c69d46dcb63eebd5e379d"}, @@ -4411,6 +4578,7 @@ version = "1.1.0" description = "Pure Python MySQL Driver" optional = false python-versions = ">=3.7" +groups = ["sql_database"] files = [ {file = "PyMySQL-1.1.0-py3-none-any.whl", hash = "sha256:8969ec6d763c856f7073c4c64662882675702efcb114b4bcbb955aea3a069fa7"}, {file = "PyMySQL-1.1.0.tar.gz", hash = 
"sha256:4f13a7df8bf36a51e81dd9f3605fede45a4878fe02f9236349fd82a3f0612f96"}, @@ -4426,6 +4594,7 @@ version = "23.2.0" description = "Python wrapper module around the OpenSSL library" optional = false python-versions = ">=3.6" +groups = ["scrapy"] files = [ {file = "pyOpenSSL-23.2.0-py3-none-any.whl", hash = "sha256:24f0dc5227396b3e831f4c7f602b950a5e9833d292c8e4a2e06b709292806ae2"}, {file = "pyOpenSSL-23.2.0.tar.gz", hash = "sha256:276f931f55a452e7dea69c7173e984eb2a4407ce413c918aa34b55f82f9b8bac"}, @@ -4444,6 +4613,7 @@ version = "1.11" description = "Thin wrapper for pandoc." optional = false python-versions = ">=3.6" +groups = ["unstructured_data"] files = [ {file = "pypandoc-1.11-py3-none-any.whl", hash = "sha256:b260596934e9cfc6513056110a7c8600171d414f90558bf4407e68b209be8007"}, {file = "pypandoc-1.11.tar.gz", hash = "sha256:7f6d68db0e57e0f6961bec2190897118c4d305fc2d31c22cd16037f22ee084a5"}, @@ -4455,6 +4625,7 @@ version = "3.1.1" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" +groups = ["google_ads", "google_analytics", "google_sheets"] files = [ {file = "pyparsing-3.1.1-py3-none-any.whl", hash = "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"}, {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"}, @@ -4469,6 +4640,7 @@ version = "3.0.1" description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" optional = false python-versions = ">=3.6" +groups = ["dev"] files = [ {file = "PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440"}, {file = "pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928"}, @@ -4490,6 +4662,8 @@ version = "2.1.2" description = "Multi-producer-multi-consumer signal dispatching mechanism" optional = false python-versions = "*" +groups = ["scrapy"] +markers = "platform_python_implementation == \"PyPy\"" files = [ {file = "PyPyDispatcher-2.1.2.tar.gz", hash = "sha256:b6bec5dfcff9d2535bca2b23c80eae367b1ac250a645106948d315fcfa9130f2"}, ] @@ -4500,6 +4674,8 @@ version = "3.4.1" description = "A python implementation of GNU readline." optional = false python-versions = "*" +groups = ["unstructured_data"] +markers = "sys_platform == \"win32\"" files = [ {file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"}, {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"}, @@ -4511,6 +4687,7 @@ version = "7.4.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" +groups = ["dev", "pytest"] files = [ {file = "pytest-7.4.2-py3-none-any.whl", hash = "sha256:1d881c6124e08ff0a1bb75ba3ec0bfd8b5354a01c194ddd5a0a870a48d99b002"}, {file = "pytest-7.4.2.tar.gz", hash = "sha256:a766259cfab564a2ad52cb1aae1b881a75c3eb7e34ca3779697c23ed47c47069"}, @@ -4533,6 +4710,7 @@ version = "5.0.0" description = "Pytest plugin for measuring coverage." 
optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pytest-cov-5.0.0.tar.gz", hash = "sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857"}, {file = "pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652"}, @@ -4551,6 +4729,7 @@ version = "1.6.0" description = "run tests in isolated forked subprocesses" optional = false python-versions = ">=3.7" +groups = ["dev", "pytest"] files = [ {file = "pytest-forked-1.6.0.tar.gz", hash = "sha256:4dafd46a9a600f65d822b8f605133ecf5b3e1941ebb3588e943b4e3eb71a5a3f"}, {file = "pytest_forked-1.6.0-py3-none-any.whl", hash = "sha256:810958f66a91afb1a1e2ae83089d8dc1cd2437ac96b12963042fbb9fb4d16af0"}, @@ -4566,6 +4745,7 @@ version = "3.12.0" description = "Thin-wrapper around the mock package for easier use with pytest" optional = false python-versions = ">=3.8" +groups = ["dev", "pytest"] files = [ {file = "pytest-mock-3.12.0.tar.gz", hash = "sha256:31a40f038c22cad32287bb43932054451ff5583ff094bca6f675df2f8bc1a6e9"}, {file = "pytest_mock-3.12.0-py3-none-any.whl", hash = "sha256:0972719a7263072da3a21c7f4773069bcc7486027d7e8e1f81d98a47e701bc4f"}, @@ -4583,6 +4763,7 @@ version = "2.8.2" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main", "dev", "dltpure", "filesystem", "mongodb", "salesforce", "stripe_analytics", "unstructured_data"] files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, @@ -4597,6 +4778,7 @@ version = "1.0.1" description = "Create, read, and update Microsoft Word .docx files." 
optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "python-docx-1.0.1.tar.gz", hash = "sha256:255148e15a4414244ec75f50e92d19864e52a7416768c65491707a7414659524"}, {file = "python_docx-1.0.1-py3-none-any.whl", hash = "sha256:851340c49b36f917a1838a44c602a5a0702c0c3507b9890969545732dc10d2d1"}, @@ -4612,6 +4794,7 @@ version = "1.0.0" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.8" +groups = ["unstructured_data"] files = [ {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, @@ -4626,6 +4809,7 @@ version = "0.4.27" description = "File type identification using libmagic" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +groups = ["unstructured_data"] files = [ {file = "python-magic-0.4.27.tar.gz", hash = "sha256:c1ba14b08e4a5f5c31a302b7721239695b2f0f058d125bd5ce1ee36b9d9d3c3b"}, {file = "python_magic-0.4.27-py2.py3-none-any.whl", hash = "sha256:c212960ad306f700aa0d01e5d7a325d20548ff97eb9920dcd29513174f0294d3"}, @@ -4637,6 +4821,7 @@ version = "0.6.22" description = "Generate and manipulate Open XML PowerPoint (.pptx) files" optional = false python-versions = "*" +groups = ["unstructured_data"] files = [ {file = "python-pptx-0.6.22.tar.gz", hash = "sha256:38f8ee92dde31d24b4562560e61b0357e5d97ecf75c4352ae6616d5a32978654"}, {file = "python_pptx-0.6.22-py3-none-any.whl", hash = "sha256:3d097c29e08de2da1fc3c6752169087065efa4153216e77fc1b27dff1bcdcb46"}, @@ -4653,6 +4838,7 @@ version = "2023.3.post1" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" +groups = ["main", "dltpure", "mongodb", "salesforce", "stripe_analytics", "unstructured_data"] files = [ {file = "pytz-2023.3.post1-py2.py3-none-any.whl", hash = "sha256:ce42d816b81b68506614c11e8937d3aa9e41007ceb50bfdcb0749b921bf646c7"}, {file = "pytz-2023.3.post1.tar.gz", hash = "sha256:7b4fddbeb94a1eba4b557da24f19fdf9db575192544270a9101d8509f9f43d7b"}, @@ -4664,6 +4850,7 @@ version = "306" description = "Python for Window Extensions" optional = false python-versions = "*" +groups = ["main", "dltpure", "filesystem"] files = [ {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, @@ -4680,6 +4867,7 @@ files = [ {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, ] +markers = {main = "sys_platform == \"win32\"", dltpure = "sys_platform == \"win32\"", filesystem = "platform_system == \"Windows\""} [[package]] name = "pyyaml" @@ -4687,6 +4875,7 @@ version = "6.0.1" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.6" +groups = ["main", "dev", "dltpure", "google_ads", "unstructured_data", "unstructured_data_lint"] files = [ {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, {file = 
"PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, @@ -4747,6 +4936,7 @@ version = "1.6.2" description = "Collection of persistent (disk-based) and non-persistent (memory-based) queues" optional = false python-versions = ">=3.5" +groups = ["scrapy"] files = [ {file = "queuelib-1.6.2-py2.py3-none-any.whl", hash = "sha256:4b96d48f650a814c6fb2fd11b968f9c46178b683aad96d68f930fe13a8574d19"}, {file = "queuelib-1.6.2.tar.gz", hash = "sha256:4b207267f2642a8699a1f806045c56eb7ad1a85a10c0e249884580d139c2fcd2"}, @@ -4758,6 +4948,7 @@ version = "2023.10.3" description = "Alternative regular expression module, to replace re." optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "regex-2023.10.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4c34d4f73ea738223a094d8e0ffd6d2c1a1b4c175da34d6b0de3d8d69bee6bcc"}, {file = "regex-2023.10.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a8f4e49fc3ce020f65411432183e6775f24e02dff617281094ba6ab079ef0915"}, @@ -4855,6 +5046,7 @@ version = "2.31.0" description = "Python HTTP for Humans." optional = false python-versions = ">=3.7" +groups = ["main", "airtable", "asana_dlt", "dev", "dltpure", "facebook_ads", "filesystem", "google_ads", "google_analytics", "google_sheets", "salesforce", "scrapy", "stripe_analytics", "unstructured_data", "unstructured_data_lint"] files = [ {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, @@ -4876,6 +5068,7 @@ version = "1.5.1" description = "File transport adapter for Requests" optional = false python-versions = "*" +groups = ["salesforce", "scrapy"] files = [ {file = "requests-file-1.5.1.tar.gz", hash = "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e"}, {file = "requests_file-1.5.1-py2.py3-none-any.whl", hash = "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"}, @@ -4891,6 +5084,7 @@ version = "1.11.0" description = "Mock out responses from the requests package" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "requests-mock-1.11.0.tar.gz", hash = "sha256:ef10b572b489a5f28e09b708697208c4a3b2b89ef80a9f01584340ea357ec3c4"}, {file = "requests_mock-1.11.0-py2.py3-none-any.whl", hash = "sha256:f7fae383f228633f6bececebdab236c478ace2284d6292c6e7e2867b9ab74d15"}, @@ -4902,7 +5096,7 @@ six = "*" [package.extras] fixture = ["fixtures"] -test = ["fixtures", "mock", "purl", "pytest", "requests-futures", "sphinx", "testtools"] +test = ["fixtures", "mock ; python_version < \"3.3\"", "purl", "pytest", "requests-futures", "sphinx", "testtools"] [[package]] name = "requests-oauthlib" @@ -4910,6 +5104,7 @@ version = "1.3.1" description = "OAuthlib authentication support for Requests." 
optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["main", "asana_dlt", "filesystem", "google_ads", "google_analytics"] files = [ {file = "requests-oauthlib-1.3.1.tar.gz", hash = "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"}, {file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"}, @@ -4928,6 +5123,7 @@ version = "1.0.0" description = "A utility belt for advanced users of python-requests" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["salesforce"] files = [ {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, @@ -4942,6 +5138,7 @@ version = "0.5.0" description = "This is a small Python module for parsing Pip requirement files." optional = false python-versions = ">=3.6,<4.0" +groups = ["main", "dltpure"] files = [ {file = "requirements-parser-0.5.0.tar.gz", hash = "sha256:3336f3a3ae23e06d3f0f88595e4052396e3adf91688787f637e5d2ca1a904069"}, {file = "requirements_parser-0.5.0-py3-none-any.whl", hash = "sha256:e7fcdcd04f2049e73a9fb150d8a0f9d51ce4108f5f7cbeac74c484e17b12bcd9"}, @@ -4956,6 +5153,7 @@ version = "13.6.0" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.7.0" +groups = ["main", "dev", "dltpure"] files = [ {file = "rich-13.6.0-py3-none-any.whl", hash = "sha256:2b38e2fe9ca72c9a00170a1a2d20c63c790d0e10ef1fe35eba76e1e7b1d7d245"}, {file = "rich-13.6.0.tar.gz", hash = "sha256:5c14d22737e6d5084ef4771b62d5d4363165b403455a30a1c8ca39dc7b644bef"}, @@ -4968,12 +5166,28 @@ pygments = ">=2.13.0,<3.0.0" [package.extras] jupyter = ["ipywidgets (>=7.5.1,<9)"] +[[package]] +name = "rich-argparse" +version = "1.7.1" +description = "Rich help formatters for argparse and optparse" +optional = false +python-versions = ">=3.8" +groups = ["main", "dltpure"] +files = [ + {file = "rich_argparse-1.7.1-py3-none-any.whl", hash = "sha256:a8650b42e4a4ff72127837632fba6b7da40784842f08d7395eb67a9cbd7b4bf9"}, + {file = "rich_argparse-1.7.1.tar.gz", hash = "sha256:d7a493cde94043e41ea68fb43a74405fa178de981bf7b800f7a3bd02ac5c27be"}, +] + +[package.dependencies] +rich = ">=11.0.0" + [[package]] name = "rsa" version = "4.9" description = "Pure-Python RSA implementation" optional = false python-versions = ">=3.6,<4" +groups = ["main", "filesystem", "google_ads", "google_analytics", "google_sheets"] files = [ {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, @@ -4988,6 +5202,7 @@ version = "2024.3.1" description = "Convenient Filesystem interface over S3" optional = false python-versions = ">= 3.8" +groups = ["filesystem"] files = [ {file = "s3fs-2024.3.1-py3-none-any.whl", hash = "sha256:f4566a5446c473740d272ec08e0b4aae8db1aa05f662c42ff0aa2c89bb5060ea"}, {file = "s3fs-2024.3.1.tar.gz", hash = "sha256:1b8bc8dbd65e7b60f5487378f6eeffe1de59aa72caa9efca6dad6ab877405487"}, @@ -5008,6 +5223,7 @@ version = "2.11.1" description = "A high-level Web Crawling and Web Scraping framework" optional = false python-versions = ">=3.8" +groups = ["scrapy"] files = [ 
{file = "Scrapy-2.11.1-py2.py3-none-any.whl", hash = "sha256:f1edee0cd214512054c01a8d031a8d213dddb53492b02c9e66256e3efe90d175"}, {file = "Scrapy-2.11.1.tar.gz", hash = "sha256:733a039c7423e52b69bf2810b5332093d4e42a848460359c07b02ecff8f73ebe"}, @@ -5039,6 +5255,7 @@ version = "3.0.2" description = "Python helper for Semantic Versioning (https://semver.org)" optional = false python-versions = ">=3.7" +groups = ["main", "dltpure"] files = [ {file = "semver-3.0.2-py3-none-any.whl", hash = "sha256:b1ea4686fe70b981f85359eda33199d60c53964284e0cfb4977d243e37cf4bf4"}, {file = "semver-3.0.2.tar.gz", hash = "sha256:6253adb39c70f6e51afed2fa7152bcd414c411286088fb4b9effb133885ab4cc"}, @@ -5050,6 +5267,7 @@ version = "24.1.0" description = "Service identity verification for pyOpenSSL & cryptography." optional = false python-versions = ">=3.8" +groups = ["scrapy"] files = [ {file = "service_identity-24.1.0-py3-none-any.whl", hash = "sha256:a28caf8130c8a5c1c7a6f5293faaf239bbfb7751e4862436920ee6f2616f568a"}, {file = "service_identity-24.1.0.tar.gz", hash = "sha256:6829c9d62fb832c2e1c435629b0a8c476e1929881f28bee4d20bc24161009221"}, @@ -5074,6 +5292,7 @@ version = "68.2.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" +groups = ["main", "dev", "dltpure", "facebook_ads", "google_ads", "scrapy"] files = [ {file = "setuptools-68.2.2-py3-none-any.whl", hash = "sha256:b454a35605876da60632df1a60f736524eb73cc47bbc9f3f1ef1b644de74fd2a"}, {file = "setuptools-68.2.2.tar.gz", hash = "sha256:4ac1475276d2f1c48684874089fefcd83bd7162ddaafb81fac866ba0db282a87"}, @@ -5081,7 +5300,7 @@ files = [ [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7) ; platform_python_implementation != \"PyPy\"", "pytest-checkdocs (>=2.4)", "pytest-cov ; platform_python_implementation != \"PyPy\"", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1) ; platform_python_implementation != \"PyPy\"", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-ruff ; sys_platform != \"cygwin\"", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] @@ -5090,6 +5309,7 @@ version = "1.12.5" description = "A basic Salesforce.com REST API client." 
optional = false python-versions = "*" +groups = ["salesforce"] files = [ {file = "simple-salesforce-1.12.5.tar.gz", hash = "sha256:ef65f72438e3b215619f6835d3d4356e147adf3a7ece6896d239127dd6aefcd1"}, {file = "simple_salesforce-1.12.5-py2.py3-none-any.whl", hash = "sha256:07029575385d04132babfd6e19c1c8068c859d616a45dab07bbf9875bdc5ab93"}, @@ -5109,6 +5329,7 @@ version = "3.19.2" description = "Simple, fast, extensible JSON encoder/decoder for Python" optional = false python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main", "dltpure"] files = [ {file = "simplejson-3.19.2-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3471e95110dcaf901db16063b2e40fb394f8a9e99b3fe9ee3acc6f6ef72183a2"}, {file = "simplejson-3.19.2-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:3194cd0d2c959062b94094c0a9f8780ffd38417a5322450a0db0ca1a23e7fbd2"}, @@ -5216,6 +5437,7 @@ version = "1.16.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +groups = ["main", "dev", "dltpure", "facebook_ads", "filesystem", "mongodb", "salesforce", "scrapy", "stripe_analytics", "unstructured_data"] files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -5227,6 +5449,7 @@ version = "5.0.1" description = "A pure Python implementation of a sliding window memory map manager" optional = false python-versions = ">=3.7" +groups = ["main", "dev", "dltpure"] files = [ {file = "smmap-5.0.1-py3-none-any.whl", hash = "sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da"}, {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"}, @@ -5238,6 +5461,7 @@ version = "1.3.0" description = "Sniff out which async library your code is running under" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, @@ -5249,6 +5473,7 @@ version = "2.0.22" description = "Database Abstraction Library" optional = false python-versions = ">=3.7" +groups = ["pg_legacy_replication", "sql_database", "unstructured_data", "unstructured_data_lint"] files = [ {file = "SQLAlchemy-2.0.22-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f146c61ae128ab43ea3a0955de1af7e1633942c2b2b4985ac51cc292daf33222"}, {file = "SQLAlchemy-2.0.22-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:875de9414393e778b655a3d97d60465eb3fae7c919e88b70cc10b40b9f56042d"}, @@ -5335,6 +5560,7 @@ version = "0.20.4" description = "The little ASGI library that shines." 
optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "starlette-0.20.4-py3-none-any.whl", hash = "sha256:c0414d5a56297d37f3db96a84034d61ce29889b9eaccf65eb98a0b39441fcaa3"}, {file = "starlette-0.20.4.tar.gz", hash = "sha256:42fcf3122f998fefce3e2c5ad7e5edbf0f02cf685d646a83a08d404726af5084"}, @@ -5353,6 +5579,7 @@ version = "5.1.0" description = "Manage dynamic plugins for Python applications" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "stevedore-5.1.0-py3-none-any.whl", hash = "sha256:8cc040628f3cea5d7128f2e76cf486b2251a4e543c7b938f58d9a377f6694a2d"}, {file = "stevedore-5.1.0.tar.gz", hash = "sha256:a54534acf9b89bc7ed264807013b505bf07f74dbe4bcfa37d32bd063870b087c"}, @@ -5367,6 +5594,7 @@ version = "5.5.0" description = "Python bindings for the Stripe API" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +groups = ["stripe_analytics"] files = [ {file = "stripe-5.5.0-py2.py3-none-any.whl", hash = "sha256:b4947da66dbb3de8969004ba6398f9a019c6b1b3ffe6aa88d5b07ac560a52b28"}, {file = "stripe-5.5.0.tar.gz", hash = "sha256:04a9732b37a46228ecf0e496163a3edd93596b0e6200029fbc48911638627e19"}, @@ -5381,6 +5609,7 @@ version = "1.12" description = "Computer algebra system (CAS) in Python" optional = false python-versions = ">=3.8" +groups = ["unstructured_data"] files = [ {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, @@ -5395,6 +5624,7 @@ version = "0.9.0" description = "Pretty-print tabular data" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, @@ -5409,6 +5639,7 @@ version = "8.2.3" description = "Retry code until it succeeds" optional = false python-versions = ">=3.7" +groups = ["main", "dltpure", "unstructured_data", "unstructured_data_lint"] files = [ {file = "tenacity-8.2.3-py3-none-any.whl", hash = "sha256:ce510e327a630c9e1beaf17d42e6ffacc88185044ad85cf74c0a8887c6a0f88c"}, {file = "tenacity-8.2.3.tar.gz", hash = "sha256:5398ef0d78e63f40007c1fb4c0bff96e1911394d2fa8d194f77619c05ff6cc8a"}, @@ -5423,6 +5654,7 @@ version = "0.4.0" description = "tiktoken is a fast BPE tokeniser for use with OpenAI's models" optional = false python-versions = ">=3.8" +groups = ["unstructured_data"] files = [ {file = "tiktoken-0.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:176cad7f053d2cc82ce7e2a7c883ccc6971840a4b5276740d0b732a2b2011f8a"}, {file = "tiktoken-0.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:450d504892b3ac80207700266ee87c932df8efea54e05cefe8613edc963c1285"}, @@ -5468,6 +5700,7 @@ version = "5.1.1" description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well." 
optional = false python-versions = ">=3.8" +groups = ["scrapy"] files = [ {file = "tldextract-5.1.1-py3-none-any.whl", hash = "sha256:b9c4510a8766d377033b6bace7e9f1f17a891383ced3c5d50c150f181e9e1cc2"}, {file = "tldextract-5.1.1.tar.gz", hash = "sha256:9b6dbf803cb5636397f0203d48541c0da8ba53babaf0e8a6feda2d88746813d4"}, @@ -5488,6 +5721,7 @@ version = "0.14.1" description = "" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "tokenizers-0.14.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:04ec1134a18ede355a05641cdc7700f17280e01f69f2f315769f02f7e295cf1e"}, {file = "tokenizers-0.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:638abedb39375f0ddce2de536fc9c976639b2d1b7202d715c2e7a25f0ebfd091"}, @@ -5603,6 +5837,8 @@ version = "2.0.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.7" +groups = ["dev", "pytest"] +markers = "python_version < \"3.11\"" files = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, @@ -5614,6 +5850,7 @@ version = "0.12.1" description = "Style preserving TOML library" optional = false python-versions = ">=3.7" +groups = ["main", "dltpure"] files = [ {file = "tomlkit-0.12.1-py3-none-any.whl", hash = "sha256:712cbd236609acc6a3e2e97253dfc52d4c2082982a88f61b640ecf0817eab899"}, {file = "tomlkit-0.12.1.tar.gz", hash = "sha256:38e1ff8edb991273ec9f6181244a6a391ac30e9f5098e7535640ea6be97a7c86"}, @@ -5625,6 +5862,7 @@ version = "4.66.1" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, @@ -5645,6 +5883,7 @@ version = "22.10.0" description = "An asynchronous networking framework written in Python" optional = false python-versions = ">=3.7.1" +groups = ["dev", "scrapy"] files = [ {file = "Twisted-22.10.0-py3-none-any.whl", hash = "sha256:86c55f712cc5ab6f6d64e02503352464f0400f66d4f079096d744080afcccbd0"}, {file = "Twisted-22.10.0.tar.gz", hash = "sha256:32acbd40a94f5f46e7b42c109bfae2b302250945561783a8b7a059048f2d4d31"}, @@ -5661,21 +5900,21 @@ typing-extensions = ">=3.6.5" "zope.interface" = ">=4.4.2" [package.extras] -all-non-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] +all-non-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3) ; python_version < \"3.7\"", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226) ; platform_system == \"Windows\"", "service-identity (>=18.1.0)"] conch = ["appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "cryptography (>=2.6)", "pyasn1"] conch-nacl = ["PyNaCl", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "cryptography 
(>=2.6)", "pyasn1"] -contextvars = ["contextvars (>=2.4,<3)"] -dev = ["coverage (>=6b1,<7)", "pydoctor (>=22.9.0,<22.10.0)", "pyflakes (>=2.2,<3.0)", "python-subunit (>=1.4,<2.0)", "readthedocs-sphinx-ext (>=2.1,<3.0)", "sphinx (>=5.0,<6)", "sphinx-rtd-theme (>=1.0,<2.0)", "towncrier (>=22.8,<23.0)", "twistedchecker (>=0.7,<1.0)"] +contextvars = ["contextvars (>=2.4,<3) ; python_version < \"3.7\""] +dev = ["coverage (>=6b1,<7)", "pydoctor (>=22.9.0,<22.10.0)", "pyflakes (>=2.2,<3.0)", "python-subunit (>=1.4,<2.0) ; python_version < \"3.10\"", "readthedocs-sphinx-ext (>=2.1,<3.0)", "sphinx (>=5.0,<6)", "sphinx-rtd-theme (>=1.0,<2.0)", "towncrier (>=22.8,<23.0)", "twistedchecker (>=0.7,<1.0)"] dev-release = ["pydoctor (>=22.9.0,<22.10.0)", "readthedocs-sphinx-ext (>=2.1,<3.0)", "sphinx (>=5.0,<6)", "sphinx-rtd-theme (>=1.0,<2.0)", "towncrier (>=22.8,<23.0)"] -gtk-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pygobject", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] +gtk-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3) ; python_version < \"3.7\"", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pygobject", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226) ; platform_system == \"Windows\"", "service-identity (>=18.1.0)"] http2 = ["h2 (>=3.0,<5.0)", "priority (>=1.1.0,<2.0)"] -macos-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyobjc-core", "pyobjc-framework-CFNetwork", "pyobjc-framework-Cocoa", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] -mypy = ["PyHamcrest (>=1.9.0)", "PyNaCl", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "coverage (>=6b1,<7)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "mypy (==0.930)", "mypy-zope (==0.3.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pydoctor (>=22.9.0,<22.10.0)", "pyflakes (>=2.2,<3.0)", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "python-subunit (>=1.4,<2.0)", "pywin32 (!=226)", "readthedocs-sphinx-ext (>=2.1,<3.0)", "service-identity (>=18.1.0)", "sphinx (>=5.0,<6)", "sphinx-rtd-theme (>=1.0,<2.0)", "towncrier (>=22.8,<23.0)", "twistedchecker (>=0.7,<1.0)", "types-pyOpenSSL", "types-setuptools"] -osx-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyobjc-core", "pyobjc-framework-CFNetwork", "pyobjc-framework-Cocoa", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] -serial = ["pyserial (>=3.0)", "pywin32 (!=226)"] +macos-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3) ; python_version < \"3.7\"", "cryptography 
(>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyobjc-core", "pyobjc-framework-CFNetwork", "pyobjc-framework-Cocoa", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226) ; platform_system == \"Windows\"", "service-identity (>=18.1.0)"] +mypy = ["PyHamcrest (>=1.9.0)", "PyNaCl", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3) ; python_version < \"3.7\"", "coverage (>=6b1,<7)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "mypy (==0.930)", "mypy-zope (==0.3.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pydoctor (>=22.9.0,<22.10.0)", "pyflakes (>=2.2,<3.0)", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "python-subunit (>=1.4,<2.0) ; python_version < \"3.10\"", "pywin32 (!=226) ; platform_system == \"Windows\"", "readthedocs-sphinx-ext (>=2.1,<3.0)", "service-identity (>=18.1.0)", "sphinx (>=5.0,<6)", "sphinx-rtd-theme (>=1.0,<2.0)", "towncrier (>=22.8,<23.0)", "twistedchecker (>=0.7,<1.0)", "types-pyOpenSSL", "types-setuptools"] +osx-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3) ; python_version < \"3.7\"", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyobjc-core", "pyobjc-framework-CFNetwork", "pyobjc-framework-Cocoa", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226) ; platform_system == \"Windows\"", "service-identity (>=18.1.0)"] +serial = ["pyserial (>=3.0)", "pywin32 (!=226) ; platform_system == \"Windows\""] test = ["PyHamcrest (>=1.9.0)", "cython-test-exception-raiser (>=1.0.2,<2)", "hypothesis (>=6.0,<7.0)"] tls = ["idna (>=2.4)", "pyopenssl (>=21.0.0)", "service-identity (>=18.1.0)"] -windows-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3)", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "pywin32 (!=226)", "service-identity (>=18.1.0)"] +windows-platform = ["PyHamcrest (>=1.9.0)", "appdirs (>=1.4.0)", "bcrypt (>=3.0.0)", "contextvars (>=2.4,<3) ; python_version < \"3.7\"", "cryptography (>=2.6)", "cython-test-exception-raiser (>=1.0.2,<2)", "h2 (>=3.0,<5.0)", "hypothesis (>=6.0,<7.0)", "idna (>=2.4)", "priority (>=1.1.0,<2.0)", "pyasn1", "pyopenssl (>=21.0.0)", "pyserial (>=3.0)", "pywin32 (!=226)", "pywin32 (!=226) ; platform_system == \"Windows\"", "service-identity (>=18.1.0)"] [[package]] name = "twisted-iocpsupport" @@ -5683,6 +5922,8 @@ version = "1.0.4" description = "An extension for use in the twisted I/O Completion Ports reactor." 
optional = false python-versions = "*" +groups = ["dev", "scrapy"] +markers = "platform_system == \"Windows\"" files = [ {file = "twisted-iocpsupport-1.0.4.tar.gz", hash = "sha256:858096c0d15e33f15ac157f455d8f86f2f2cdd223963e58c0f682a3af8362d89"}, {file = "twisted_iocpsupport-1.0.4-cp310-cp310-win32.whl", hash = "sha256:afa2b630797f9ed2f27f3d9f55e3f72b4244911e45a8c82756f44babbf0b243e"}, @@ -5711,6 +5952,7 @@ version = "5.29.1.20241207" description = "Typing stubs for protobuf" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "types_protobuf-5.29.1.20241207-py3-none-any.whl", hash = "sha256:92893c42083e9b718c678badc0af7a9a1307b92afe1599e5cba5f3d35b668b2f"}, {file = "types_protobuf-5.29.1.20241207.tar.gz", hash = "sha256:2ebcadb8ab3ef2e3e2f067e0882906d64ba0dc65fc5b0fd7a8b692315b4a0be9"}, @@ -5722,6 +5964,7 @@ version = "2.9.21.20240218" description = "Typing stubs for psycopg2" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "types-psycopg2-2.9.21.20240218.tar.gz", hash = "sha256:3084cd807038a62c80fb5be78b41d855b48a060316101ea59fd85c302efb57d4"}, {file = "types_psycopg2-2.9.21.20240218-py3-none-any.whl", hash = "sha256:cac96264e063cbce28dee337a973d39e6df4ca671252343cb4f8e5ef6db5e67d"}, @@ -5733,6 +5976,7 @@ version = "2023.3.1.1" description = "Typing stubs for pytz" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "types-pytz-2023.3.1.1.tar.gz", hash = "sha256:cc23d0192cd49c8f6bba44ee0c81e4586a8f30204970fc0894d209a6b08dab9a"}, {file = "types_pytz-2023.3.1.1-py3-none-any.whl", hash = "sha256:1999a123a3dc0e39a2ef6d19f3f8584211de9e6a77fe7a0259f04a524e90a5cf"}, @@ -5744,6 +5988,7 @@ version = "2.31.0.6" description = "Typing stubs for requests" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "types-requests-2.31.0.6.tar.gz", hash = "sha256:cd74ce3b53c461f1228a9b783929ac73a666658f223e28ed29753771477b3bd0"}, {file = "types_requests-2.31.0.6-py3-none-any.whl", hash = "sha256:a2db9cb228a81da8348b49ad6db3f5519452dd20a9c1e1a868c83c5fe88fd1a9"}, @@ -5758,6 +6003,7 @@ version = "68.2.0.0" description = "Typing stubs for setuptools" optional = false python-versions = "*" +groups = ["main", "dltpure"] files = [ {file = "types-setuptools-68.2.0.0.tar.gz", hash = "sha256:a4216f1e2ef29d089877b3af3ab2acf489eb869ccaf905125c69d2dc3932fd85"}, {file = "types_setuptools-68.2.0.0-py3-none-any.whl", hash = "sha256:77edcc843e53f8fc83bb1a840684841f3dc804ec94562623bfa2ea70d5a2ba1b"}, @@ -5769,6 +6015,7 @@ version = "3.5.2.14" description = "Typing stubs for stripe" optional = false python-versions = "*" +groups = ["stripe_analytics"] files = [ {file = "types-stripe-3.5.2.14.tar.gz", hash = "sha256:bcc020aa5ba9acd796b9f2ac21f044c8e377ce2c0f570057f0f64c4b4637bbe7"}, {file = "types_stripe-3.5.2.14-py3-none-any.whl", hash = "sha256:f5f1249f72a35ada1db95523edc7e8f7b543dc8434b2ff23eaa9ec2e251c2e59"}, @@ -5780,6 +6027,7 @@ version = "1.26.25.14" description = "Typing stubs for urllib3" optional = false python-versions = "*" +groups = ["dev"] files = [ {file = "types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"}, {file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"}, @@ -5791,6 +6039,7 @@ version = "4.8.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["main", "airtable", "dev", "dltpure", 
"filesystem", "pg_legacy_replication", "scrapy", "sql_database", "unstructured_data", "unstructured_data_lint"] files = [ {file = "typing_extensions-4.8.0-py3-none-any.whl", hash = "sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0"}, {file = "typing_extensions-4.8.0.tar.gz", hash = "sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef"}, @@ -5802,6 +6051,7 @@ version = "0.9.0" description = "Runtime inspection utilities for typing module." optional = false python-versions = "*" +groups = ["unstructured_data", "unstructured_data_lint"] files = [ {file = "typing_inspect-0.9.0-py3-none-any.whl", hash = "sha256:9ee6fc59062311ef8547596ab6b955e1b8aa46242d854bfc78f4f6b0eff35f9f"}, {file = "typing_inspect-0.9.0.tar.gz", hash = "sha256:b23fc42ff6f6ef6954e4852c1fb512cdd18dbea03134f91f856a95ccc9461f78"}, @@ -5817,6 +6067,7 @@ version = "2023.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" +groups = ["main", "dev", "dltpure", "mongodb", "salesforce", "stripe_analytics", "unstructured_data"] files = [ {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, @@ -5828,6 +6079,7 @@ version = "0.7.12" description = "A library that prepares raw documents for downstream ML tasks." optional = false python-versions = ">=3.7.0" +groups = ["unstructured_data"] files = [ {file = "unstructured-0.7.12-py3-none-any.whl", hash = "sha256:6dec4f23574e213f30bccb680a4fb84c95617092ce4abf5d8955cc71af402fef"}, {file = "unstructured-0.7.12.tar.gz", hash = "sha256:3dcddea34f52e1070f38fd10063b3b0f64bc4cbe5b778d6b86b5d33262d625cd"}, @@ -5875,6 +6127,7 @@ version = "4.1.1" description = "Implementation of RFC 6570 URI Templates" optional = false python-versions = ">=3.6" +groups = ["google_ads", "google_analytics", "google_sheets"] files = [ {file = "uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"}, {file = "uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0"}, @@ -5886,14 +6139,15 @@ version = "1.26.17" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +groups = ["main", "airtable", "asana_dlt", "dev", "dltpure", "facebook_ads", "filesystem", "google_ads", "google_analytics", "google_sheets", "salesforce", "scrapy", "stripe_analytics", "unstructured_data", "unstructured_data_lint"] files = [ {file = "urllib3-1.26.17-py2.py3-none-any.whl", hash = "sha256:94a757d178c9be92ef5539b8840d48dc9cf1b2709c9d6b588232a055c524458b"}, {file = "urllib3-1.26.17.tar.gz", hash = "sha256:24d6a242c28d29af46c3fae832c36db3bbebcc533dd1bb549172cd739c82df21"}, ] [package.extras] -brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +brotli = ["brotli (==1.0.9) ; os_name != \"nt\" and python_version < \"3\" and platform_python_implementation == \"CPython\"", "brotli (>=1.0.9) ; python_version >= \"3\" and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress ; python_version == \"2.7\"", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] @@ -5902,6 +6156,7 @@ version = "0.23.2" description = "The lightning-fast ASGI server." optional = false python-versions = ">=3.8" +groups = ["unstructured_data"] files = [ {file = "uvicorn-0.23.2-py3-none-any.whl", hash = "sha256:1f9be6558f01239d4fdf22ef8126c39cb1ad0addf76c40e760549d2c2f43ab53"}, {file = "uvicorn-0.23.2.tar.gz", hash = "sha256:4d3cc12d7727ba72b64d12d3cc7743124074c0a69f7b201512fc50c3e3f1569a"}, @@ -5915,12 +6170,12 @@ httptools = {version = ">=0.5.0", optional = true, markers = "extra == \"standar python-dotenv = {version = ">=0.13", optional = true, markers = "extra == \"standard\""} pyyaml = {version = ">=5.1", optional = true, markers = "extra == \"standard\""} typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} -uvloop = {version = ">=0.14.0,<0.15.0 || >0.15.0,<0.15.1 || >0.15.1", optional = true, markers = "(sys_platform != \"win32\" and sys_platform != \"cygwin\") and platform_python_implementation != \"PyPy\" and extra == \"standard\""} +uvloop = {version = ">=0.14.0,<0.15.0 || >0.15.0,<0.15.1 || >0.15.1", optional = true, markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\" and extra == \"standard\""} watchfiles = {version = ">=0.13", optional = true, markers = "extra == \"standard\""} websockets = {version = ">=10.4", optional = true, markers = "extra == \"standard\""} [package.extras] -standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] +standard = ["colorama (>=0.4) ; sys_platform == \"win32\"", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1) ; sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"", "watchfiles (>=0.13)", "websockets (>=10.4)"] [[package]] name = "uvloop" @@ -5928,6 +6183,8 @@ version = "0.18.0" description = "Fast implementation of asyncio event loop on top of libuv" optional = false 
python-versions = ">=3.7.0" +groups = ["unstructured_data"] +markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"" files = [ {file = "uvloop-0.18.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1f354d669586fca96a9a688c585b6257706d216177ac457c92e15709acaece10"}, {file = "uvloop-0.18.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:280904236a5b333a273292b3bcdcbfe173690f69901365b973fa35be302d7781"}, @@ -5969,7 +6226,7 @@ files = [ [package.extras] docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] -test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0)", "aiohttp (>=3.8.1)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] +test = ["Cython (>=0.29.36,<0.30.0)", "aiohttp (==3.9.0b0) ; python_version >= \"3.12\"", "aiohttp (>=3.8.1) ; python_version < \"3.12\"", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] [[package]] name = "w3lib" @@ -5977,6 +6234,7 @@ version = "2.1.2" description = "Library of web-related functions" optional = false python-versions = ">=3.7" +groups = ["scrapy"] files = [ {file = "w3lib-2.1.2-py3-none-any.whl", hash = "sha256:c4432926e739caa8e3f49f5de783f336df563d9490416aebd5d39fb896d264e7"}, {file = "w3lib-2.1.2.tar.gz", hash = "sha256:ed5b74e997eea2abe3c1321f916e344144ee8e9072a6f33463ee8e57f858a4b1"}, @@ -5988,6 +6246,7 @@ version = "0.21.0" description = "Simple, modern and high performance file watching and code reload in python." optional = false python-versions = ">=3.8" +groups = ["unstructured_data"] files = [ {file = "watchfiles-0.21.0-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:27b4035013f1ea49c6c0b42d983133b136637a527e48c132d368eb19bf1ac6aa"}, {file = "watchfiles-0.21.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c81818595eff6e92535ff32825f31c116f867f64ff8cdf6562cd1d6b2e1e8f3e"}, @@ -6075,6 +6334,7 @@ version = "11.0.3" description = "An implementation of the WebSocket Protocol (RFC 6455 & 7692)" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3ccc8a0c387629aec40f2fc9fdcb4b9d5431954f934da3eaf16cdc94f67dbfac"}, {file = "websockets-11.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d67ac60a307f760c6e65dad586f556dde58e683fab03323221a4e530ead6f74d"}, @@ -6148,26 +6408,14 @@ files = [ {file = "websockets-11.0.3.tar.gz", hash = "sha256:88fc51d9a26b10fc331be344f1781224a375b78488fc343620184e95a4b27016"}, ] -[[package]] -name = "wheel" -version = "0.41.2" -description = "A built-package format for Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "wheel-0.41.2-py3-none-any.whl", hash = "sha256:75909db2664838d015e3d9139004ee16711748a52c8f336b52882266540215d8"}, - {file = "wheel-0.41.2.tar.gz", hash = "sha256:0c5ac5ff2afb79ac23ab82bab027a0be7b5dbcf2e54dc50efe4bf507de1f7985"}, -] - -[package.extras] -test = ["pytest (>=6.0.0)", "setuptools (>=65)"] - [[package]] name = "win-precise-time" version = "1.4.2" description = "" optional = false python-versions = ">=3.7" +groups = ["main", "dltpure"] +markers = "os_name == \"nt\"" files = [ {file = "win-precise-time-1.4.2.tar.gz", hash = "sha256:89274785cbc5f2997e01675206da3203835a442c60fd97798415c6b3c179c0b9"}, {file = "win_precise_time-1.4.2-cp310-cp310-win32.whl", hash = 
"sha256:7fa13a2247c2ef41cd5e9b930f40716eacc7fc1f079ea72853bd5613fe087a1a"}, @@ -6190,6 +6438,7 @@ version = "1.15.0" description = "Module for decorators, wrappers and monkey patching." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +groups = ["filesystem"] files = [ {file = "wrapt-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1"}, {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29"}, @@ -6274,6 +6523,7 @@ version = "2.0.1" description = "Library for developers to extract data from Microsoft Excel (tm) .xls spreadsheet files" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +groups = ["unstructured_data"] files = [ {file = "xlrd-2.0.1-py2.py3-none-any.whl", hash = "sha256:6a33ee89877bd9abc1158129f6e94be74e2679636b8a205b43b85206c3f0bbdd"}, {file = "xlrd-2.0.1.tar.gz", hash = "sha256:f72f148f54442c6b056bf931dbc34f986fd0c3b0b6b5a58d013c9aef274d0c88"}, @@ -6290,6 +6540,7 @@ version = "3.1.7" description = "A Python module for creating Excel XLSX files." optional = false python-versions = ">=3.6" +groups = ["unstructured_data"] files = [ {file = "XlsxWriter-3.1.7-py3-none-any.whl", hash = "sha256:8c730c4beb468696c4160aa1d6d168fb4c1a20dd972b212cd8cc1e74ddeab1b6"}, {file = "XlsxWriter-3.1.7.tar.gz", hash = "sha256:353042efb0f8551ce72baa087e98228f3394fcb380e8b96313edf1eec8d50823"}, @@ -6301,6 +6552,7 @@ version = "1.9.2" description = "Yet another URL library" optional = false python-versions = ">=3.7" +groups = ["main", "facebook_ads", "filesystem", "unstructured_data", "unstructured_data_lint"] files = [ {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8c2ad583743d16ddbdf6bb14b5cd76bf43b0d0006e918809d5d4ddf7bde8dd82"}, {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:82aa6264b36c50acfb2424ad5ca537a2060ab6de158a5bd2a72a032cc75b9eb8"}, @@ -6388,6 +6640,7 @@ version = "4.2.1" description = "A Python SOAP client" optional = false python-versions = ">=3.7" +groups = ["salesforce"] files = [ {file = "zeep-4.2.1-py3-none-any.whl", hash = "sha256:6754feb4c34a4b6d65fbc359252bf6654dcce3937bf1d95aae4402a60a8f5939"}, {file = "zeep-4.2.1.tar.gz", hash = "sha256:72093acfdb1d8360ed400869b73fbf1882b95c4287f798084c42ee0c1ff0e425"}, @@ -6415,6 +6668,8 @@ version = "3.17.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.8" +groups = ["unstructured_data"] +markers = "python_version == \"3.9\"" files = [ {file = "zipp-3.17.0-py3-none-any.whl", hash = "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31"}, {file = "zipp-3.17.0.tar.gz", hash = "sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0"}, @@ -6422,7 +6677,7 @@ files = [ [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] +testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7) ; platform_python_implementation != \"PyPy\"", "pytest-checkdocs 
(>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1) ; platform_python_implementation != \"PyPy\"", "pytest-ruff"] [[package]] name = "zope-interface" @@ -6430,6 +6685,7 @@ version = "6.2" description = "Interfaces for Python" optional = false python-versions = ">=3.7" +groups = ["dev", "scrapy"] files = [ {file = "zope.interface-6.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:506f5410b36e5ba494136d9fa04c548eaf1a0d9c442b0b0e7a0944db7620e0ab"}, {file = "zope.interface-6.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b386b8b9d2b6a5e1e4eadd4e62335571244cb9193b7328c2b6e38b64cfda4f0e"}, @@ -6483,6 +6739,7 @@ version = "0.21.0" description = "Zstandard bindings for Python" optional = false python-versions = ">=3.7" +groups = ["unstructured_data"] files = [ {file = "zstandard-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:649a67643257e3b2cff1c0a73130609679a5673bf389564bc6d4b164d822a7ce"}, {file = "zstandard-0.21.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:144a4fe4be2e747bf9c646deab212666e39048faa4372abb6a250dab0f347a29"}, @@ -6536,6 +6793,6 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = ">=3.9,<3.13" -content-hash = "6a657c817cec2ef5e110c455fd86ec73ce82e1e97dea77613ba4400238608594" +content-hash = "57a99164550c77f5d400a1f134f72ea85b4a07128b28876923bd9b3deee5a94b" diff --git a/pyproject.toml b/pyproject.toml index 13beebf21..7b66c7057 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,10 +12,10 @@ packages = [{include = "sources"}] [tool.poetry.dependencies] python = ">=3.9,<3.13" -dlt = {version = "1.3.0", allow-prereleases = true, extras = ["redshift", "bigquery", "postgres", "duckdb"]} +dlt = {version = "1.8.1", allow-prereleases = true, extras = ["redshift", "bigquery", "postgres", "duckdb"]} [tool.poetry.group.dltpure.dependencies] -dlt = {version = "1.3.0", allow-prereleases = true} +dlt = {version = "1.8.1", allow-prereleases = true} [tool.poetry.group.pytest.dependencies] pytest = "^7.2.0" diff --git a/tests/pg_legacy_replication/test_pg_replication.py b/tests/pg_legacy_replication/test_pg_replication.py index 044ebbc7c..643cf2f1f 100644 --- a/tests/pg_legacy_replication/test_pg_replication.py +++ b/tests/pg_legacy_replication/test_pg_replication.py @@ -10,9 +10,8 @@ init_replication, cleanup_snapshot_resources, replication_source, - ReplicationOptions, ) -from sources.pg_legacy_replication.helpers import SqlTableOptions, TableBackend +from sources.pg_legacy_replication.helpers import TableBackend from tests.utils import ( ALL_DESTINATIONS, assert_load_info, diff --git a/tests/test_dlt_init.py b/tests/test_dlt_init.py index 3beb86357..9bbf901fa 100644 --- a/tests/test_dlt_init.py +++ b/tests/test_dlt_init.py @@ -1,13 +1,12 @@ import pytest import os -import sys from typing import Any, Iterator, List from dlt.common.configuration.providers import SecretsTomlProvider from dlt.common.storages.file_storage import FileStorage from dlt.common.utils import set_working_dir -from dlt.extract.source import SourceReference +from dlt.extract import SourceReference from dlt.cli import init_command, echo from dlt.cli.init_command import SOURCES_MODULE_NAME, utils as cli_utils, files_ops