diff --git a/pages/clustering.mdx b/pages/clustering.mdx
index 23c3f70ff..3fd82ca17 100644
--- a/pages/clustering.mdx
+++ b/pages/clustering.mdx
@@ -3,30 +3,59 @@ title: Clustering
description: Learn all about replication and high availability features in Memgraph.
---
-import {CommunityLinks} from '/components/social-card/CommunityLinks'
+import { Callout } from 'nextra/components'
+import { CommunityLinks } from '/components/social-card/CommunityLinks'
# Clustering
-To create a cluster, you can [replicate data](/clustering/replication) across
-several instances. One instance is the MAIN instance and others are either SYNC
-or ASYNC replicas. With Memgraph Community, to achieve high availability, you
-need to manage automatic failover. On the other hand, Memgraph Enterprise has
-[high availability](/clustering/high-availability) features included in the
+To ensure redundancy and increase uptime, you can set up a cluster of Memgraph instances that guarantees
+24/7 uptime and availability of your graph-dependent services.
+
+With Memgraph Community, you gain [replication](/clustering/replication) capabilities out of the box.
+You can set up a MAIN instance (writes and reads) with as many REPLICA instances (reads) as you want.
+However, to achieve high availability, you need to manage automatic failover.
+
+On the other hand, Memgraph Enterprise has [high availability](/clustering/high-availability) features included in the
offering to ease the management of Memgraph clusters. In such case, the cluster
-consists of MAIN instance, REPLICA instances and COORDINATOR instances which,
-backed up by Raft protocol, manage the cluster state.
+consists of:
+- MAIN instance
+- REPLICA instances
+- COORDINATOR instances (backed up by Raft protocol, manage the cluster state and perform leader election)
+
+
+
+**We strongly suggest reading the guide on [how replication works](/clustering/concepts/how-replication-works)
+in Memgraph on a logical level before moving on to setting up the cluster.**
+Choosing the appropriate number of Memgraph instances, as well as the replication mode for each
+of them, is crucial, as it impacts the performance and availability of the cluster based on your needs.
+
+
+
+
+
+Replication and high availability currently **work only in the [in-memory
+transactional storage mode](/fundamentals/storage-memory-usage#in-memory-transactional-storage-mode-default)**.
+
+
+
+## [How replication works](/clustering/concepts/how-replication-works)
+
+Learn about the underlying implementation and theoretical concepts behind Memgraph replication, including CAP theorem, replication modes, and synchronization mechanisms.
-Replication and high availability currently **work only in the in-memory
-transactional [storage mode](/fundamentals/storage-memory-usage)**.
+## [Replication guide (Community)](/clustering/replication)
+Learn how to set up a replication cluster with Memgraph.
+**Replication is included in Memgraph Community**, making it accessible to all users who want to create data replicas across multiple instances.
+Memgraph Community, however, does not ensure high availability itself, as **automatic failover is not included**. Community users are encouraged to
+perform the necessary steps themselves to keep the replication cluster up and running.
-## [High availability](/clustering/high-availability)
+## [High availability guide (Enterprise)](/clustering/high-availability)
-Learn how to utilize high availability features and all the important under the
-hood information.
+Learn how to set up and manage a high availability cluster with Memgraph.
+This guide is for users of **Memgraph Enterprise** who want to achieve clustering and 24/7 uptime.
-## [Replication](/clustering/replication)
+## [FAQ](/clustering/faq)
-Learn how replication is achieved in Memgraph and how to set it up.
+Frequently asked questions about clustering, replication, and high availability in Memgraph.
\ No newline at end of file
diff --git a/pages/clustering/_meta.ts b/pages/clustering/_meta.ts
index 51e038a9e..3d59275d6 100644
--- a/pages/clustering/_meta.ts
+++ b/pages/clustering/_meta.ts
@@ -1,5 +1,6 @@
export default {
+ "concepts": "Concepts",
"high-availability": "High availability",
- "replication": "Replication"
+ "replication": "Replication",
+ "faq": "FAQ"
}
-
\ No newline at end of file
diff --git a/pages/clustering/concepts/_meta.ts b/pages/clustering/concepts/_meta.ts
new file mode 100644
index 000000000..6e30e36bb
--- /dev/null
+++ b/pages/clustering/concepts/_meta.ts
@@ -0,0 +1,4 @@
+export default {
+ "how-replication-works": "How replication works",
+ "how-high-availability-works": "How high availability works",
+}
diff --git a/pages/clustering/concepts/how-high-availability-works.mdx b/pages/clustering/concepts/how-high-availability-works.mdx
new file mode 100644
index 000000000..efe9a8e7d
--- /dev/null
+++ b/pages/clustering/concepts/how-high-availability-works.mdx
@@ -0,0 +1,779 @@
+---
+title: How high availability works
+description: Learn about the underlying implementation and theoretical concepts behind Memgraph's high availability.
+---
+
+import { Callout } from 'nextra/components'
+import { Steps } from 'nextra/components'
+import {CommunityLinks} from '/components/social-card/CommunityLinks'
+
+
+# How high availability works (Enterprise)
+
+
+
+This guide is a continuation of [how replication works](/clustering/concepts/how-replication-works).
+We recommend reading it first, before moving to the high availability concepts.
+
+
+
+A cluster is considered highly available if, at any point, there is some instance that can respond to a user query.
+Our high availability relies on replication and automatic failover. The cluster consists of:
+- The MAIN instance on which the user can execute write queries
+- REPLICA instances that can only respond to read queries
+- COORDINATOR instances that manage the cluster state.
+
+MAIN and REPLICA instances can also be called **data instances** in this context, as each data instance can
+take on the role of MAIN or REPLICA over the course of the cluster's lifecycle. Data instances are the ones
+holding the graph data that is being replicated.
+
+**The coordinator instance is a new addition** that enables the high availability feature and orchestrates data
+instances to ensure that there is always exactly one main instance in the cluster. Coordinator instances are much smaller
+than the data instances, since their sole purpose is to manage the cluster.
+
+## High availability implementation
+
+To achieve high availability, Memgraph uses the Raft consensus protocol, which is very similar to Paxos in terms of performance and fault tolerance, but with
+the significant advantage of being much easier to understand. It is important to note that Raft isn't a
+Byzantine fault-tolerant algorithm. You can learn more about Raft in the paper [In Search of an Understandable Consensus Algorithm](https://raft.github.io/raft.pdf).
+**As a design decision, Memgraph uses an industry-proven library [NuRaft](https://github.com/eBay/NuRaft) for the implementation of the Raft
+protocol.**
+
+Typical Memgraph's highly available cluster consists of:
+- 3 data instances (1 MAIN and 2 REPLICAs)
+- 3 coordinator instances (1 leader and 2 followers)
+
+// typical memgraph HA setup
+
+Minimal setup for data instances is 1 MAIN and 1 REPLICA.
+
+// minimal memgraph HA setup
+
+The only constraint on the number of coordinators is that it must be an **odd number greater than 1** (3, 5, 7, ...).
+Users can create more than 3 coordinators, but a replication factor (RF) of 3 is the de facto standard in distributed databases.
+
+
+
+The Raft consensus algorithm ensures that all nodes in a distributed system
+agree on a single source of truth, even in the presence of failures, by electing
+a leader to manage a replicated log. It simplifies the management of the
+replicated log across the cluster, providing a way to achieve consistency and
+coordination in a fault-tolerant manner. **Users are advised to use an odd number of coordinator instances**
+since Raft, as a consensus algorithm, works by forming a majority in the decision making.
+
+
+
+One coordinator instance is the leader, whose job is to always ensure there is exactly one MAIN, that is, one writeable instance.
+The other coordinator instances, also called follower coordinators, replicate the changes the leader coordinator made in its own Raft log.
+Operations saved into the Raft log are those that are related to cluster management.
+
+You can start a coordinator instance by specifying the `--coordinator-id`,
+`--coordinator-port` and `--management-port` flags. Followers ping the leader on the `--management-port` to get the health state of the cluster. The coordinator instance only responds to
+queries related to high availability, so you cannot execute any data-oriented query on it. The coordinator port is used for the Raft protocol, which
+all coordinators use to ensure the consistency of the cluster's state. Data instances are distinguished from coordinator instances by
+specifying only the `--management-port` flag. This port is used for RPC network communication between the coordinator and data
+instances. When started, a data instance is MAIN by default. The coordinator will ensure that no data
+inconsistency can happen during and after the instance's restart. Once all instances are started, the user can start
+adding data instances to the cluster.
+
+## Observability
+
+Monitoring the cluster state is very important, and tracking various metrics can provide valuable information. Currently, we track
+metrics which reveal the p50, p90 and p99 latencies of RPC messages, the duration of the recovery process and the time needed to react to changes
+in the cluster. We also count the number of different RPC messages exchanged and the number of failed requests, since this can give
+us information about parts of the cluster that need further care. You can see the full list of metrics
+[on the system metrics monitoring page](/database-management/monitoring#system-metrics).
+
+## How to query the cluster? (Bolt+routing)
+
+When we talk about standalone instances, the most straightforward way to connect is by using the `bolt://` protocol.
+This is not optimal if you are running a cluster of Memgraph instances for multiple reasons:
+- you need to connect to each instance separately
+- you don't know which instance is MAIN due to automatic failovers which can happen at any time
+
+Because of that, users can use the **Bolt + routing (`neo4j://`)** protocol, which ensures that write queries are always sent to
+the current MAIN instance. This prevents split-brain scenarios, as clients never
+write to the old main but are automatically routed to the new main after a failover.
+
+The routing protocol works as follows: the client sends a `ROUTE` Bolt
+message to any coordinator instance. The coordinator responds with a **routing
+table** containing three entries:
+
+1. Instances from which data can be read (REPLICAs + optionally MAIN, depending on system configuration)
+2. The instance where data can be written (MAIN)
+3. Instances acting as routers (COORDINATORs)
+
+When a client connects directly to the cluster leader, the leader immediately
+returns the current routing table. Thanks to the Raft consensus protocol, the
+leader always has the most up-to-date cluster state. If a follower receives a
+routing request, it forwards the request to the current leader, ensuring the
+client always gets accurate routing information.
+
+This ensures:
+
+- **Consistency**: All clients receive the same routing information, regardless of
+their entry point.
+- **Reliability**: The Raft consensus protocol ensures data accuracy on the leader
+node.
+- **Transparency**: Client requests are handled seamlessly, whether connected to
+leaders or followers.
+
+// routing drawing
+
+**Bolt+routing is a client-side routing protocol**, meaning network endpoint
+resolution happens inside the database drivers.
+For more details about the Bolt messages involved in the communication, check [the following
+link](https://neo4j.com/docs/bolt/current/bolt/message/#messages-route).
+
+
+
+Memgraph currently does not implement server-side routing.
+
+
+
+Users only need to change the scheme they use for connecting to coordinators.
+This means instead of using `bolt://`, you should use
+`neo4j://` to get an active connection to the current
+main instance in the cluster. You can find examples of how to use bolt+routing
+in different programming languages
+[here](https://github.com/memgraph/memgraph/tree/master/tests/drivers).
+
+It is important to note that setting up the cluster on one coordinator
+(registration of data instances and coordinators, setting main) must be done
+using bolt connection since bolt+routing is only used for routing data-related
+queries, not coordinator-based queries.
+
+## System configuration
+
+
+When deploying coordinators to servers, you can use instances of almost any size. Instances of 4GiB or 8GiB will suffice, since the coordinators'
+job mainly involves network communication and storing Raft metadata. Coordinators and data instances can be deployed on the same servers (pairwise),
+but from the availability perspective, it is better to separate them physically.
+
+When setting up disk space, you should always make sure that there is at least enough space for `--snapshot-retention-count + 1` snapshots plus a few WAL files. That's
+because we first create the (N+1)th snapshot and then delete the oldest one, so we can guarantee that the creation of the new snapshot ended successfully. This is
+especially important when using Memgraph HA in Kubernetes, since there is usually a limit set on the disk space used.
+
+
+
+Important note if you're using a native Memgraph deployment on Red Hat.
+
+Red Hat uses SELinux to enforce security policies.
+SELinux (Security-Enhanced Linux) is a security mechanism for implementing mandatory access control (MAC) in the Linux kernel.
+It restricts programs, users, and processes to only the resources they require, following a least-privilege model.
+When deploying Memgraph with high availability (HA), consider checking this setting if instances cannot see each other, and
+setting the SELinux mode to permissive.
+
+This rule could also apply to CentOS and Fedora, but at the moment it's not tested and verified.
+
+
+## Authentication
+
+User accounts exist exclusively on data instances - coordinators do not manage user authentication. Therefore, coordinator instances prohibit:
+ - Environment variables `MEMGRAPH_USER` and `MEMGRAPH_PASSWORD`.
+ - Authentication queries such as `CREATE USER`.
+
+When using the **bolt+routing protocol**, provide credentials for users that exist on the data instances. The authentication flow works as follows:
+
+1. Client connects to a **coordinator**.
+2. Coordinator responds with the **routing table** (without authenticating).
+3. Client connects to the **designated data instance** using the **same credentials**.
+4. Data instance **authenticates the user and processes the request**.
+
+This architecture separates routing coordination from user management, ensuring that authentication occurs only where user data resides.
+
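+For example, a minimal sketch of this flow, assuming a hypothetical user named `admin`: connect with `bolt://` directly to the current MAIN data instance and create the user there, then reuse the same credentials when connecting through a coordinator with `neo4j://`.
+
+```
+CREATE USER admin IDENTIFIED BY 'admin_password';
+```
+
+Running the same query against a coordinator would be rejected, since coordinators do not manage user accounts.
+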
+
+## Starting instances
+
+You can start the data and coordinator instances using environment flags or configuration flags.
+The main difference between a data instance and a coordinator is that data instances specify only the `--management-port` flag,
+whereas coordinators must also have `--coordinator-id` and `--coordinator-port`.
+
+### Configuration Flags
+
+#### Data instance
+
+A Memgraph data instance must use the `--management-port` flag. This flag is tied to the high availability feature and enables the coordinator to connect to the data instance.
+The flag `--storage-wal-enabled` must be enabled, otherwise the data instance won't start.
+
+```
+docker run --name instance1 -p 7692:7692 -p 7444:7444 memgraph/memgraph-mage \
+--management-port=13011 \
+--bolt-port=7692
+```
+
+#### Coordinator instance
+
+```
+docker run --name coord1 -p 7691:7691 -p 7445:7444 memgraph/memgraph-mage \
+--coordinator-port=10111 \
+--bolt-port=7691 \
+--coordinator-id=1 \
+--coordinator-hostname=localhost \
+--management-port=12121
+```
+
+Coordinator IDs serve as identifiers, the coordinator port is used for synchronization and log replication between coordinators, and the management port is used to get the health state of the
+cluster from the leader coordinator. Coordinator IDs, coordinator ports and management ports must be different for all coordinators.
+
+The configuration option `--coordinator-hostname` must be set on all coordinator instances. It is used on followers to ping the leader coordinator on the correct IP address and return
+the health state of the cluster. You can set this configuration flag to an IP address, a fully qualified domain name (FQDN), or a DNS name.
+The suggested approach is to use DNS; otherwise, in case the IP address changes, network communication between instances in the cluster will stop working.
+
+When testing on a local setup, the flag `--coordinator-hostname` should be set to `localhost` for each instance.
+
+It is important that, on the host, you set a distinct Bolt port for every instance, regardless of whether it is a data instance or a coordinator instance.
+
+### Env flags
+
+There is an additional way to set up high availability instances, using environment variables. Note that for the following configuration options, you can use either
+environment variables or configuration flags:
+
+- bolt port
+- coordinator port
+- coordinator id
+- management port
+- path to nuraft log file
+- coordinator hostname
+
+#### Data instances
+
+Here are the environment variables you need to set up a data instance using only environment variables:
+
+```
+export MEMGRAPH_MANAGEMENT_PORT=13011
+export MEMGRAPH_BOLT_PORT=7692
+```
+
+When using any of these environment variables, flags `--bolt-port` and `--management-port` will be ignored.
+
+
+#### Coordinator instances
+
+```
+export MEMGRAPH_COORDINATOR_PORT=10111
+export MEMGRAPH_COORDINATOR_ID=1
+export MEMGRAPH_BOLT_PORT=7687
+export MEMGRAPH_NURAFT_LOG_FILE=""
+export MEMGRAPH_COORDINATOR_HOSTNAME="localhost"
+export MEMGRAPH_MANAGEMENT_PORT=12121
+```
+
+When using any of these environment variables, flags for `--bolt-port`, `--coordinator-port`, `--coordinator-id` and `--coordinator-hostname` will be ignored.
+
+
+There is an additional environment variable you can use to set the path to a file with Cypher queries used to start a high availability cluster.
+In this file, you can use the queries defined in the next chapter, User API.
+
+```
+export MEMGRAPH_HA_CLUSTER_INIT_QUERIES=
+```
+After the coordinator instance is started, Memgraph will run queries one by one from this file to set up a high availability cluster.
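+
+For illustration, a minimal sketch of such a file for a local cluster, with hypothetical instance names, addresses and ports (the individual queries are described in the User API section below):
+
+```
+ADD COORDINATOR 1 WITH CONFIG {"bolt_server": "127.0.0.1:7691", "coordinator_server": "127.0.0.1:10111", "management_server": "127.0.0.1:12111"};
+ADD COORDINATOR 2 WITH CONFIG {"bolt_server": "127.0.0.1:7692", "coordinator_server": "127.0.0.1:10112", "management_server": "127.0.0.1:12112"};
+ADD COORDINATOR 3 WITH CONFIG {"bolt_server": "127.0.0.1:7693", "coordinator_server": "127.0.0.1:10113", "management_server": "127.0.0.1:12113"};
+REGISTER INSTANCE instance_1 WITH CONFIG {"bolt_server": "127.0.0.1:7687", "management_server": "127.0.0.1:13011", "replication_server": "127.0.0.1:10001"};
+REGISTER INSTANCE instance_2 WITH CONFIG {"bolt_server": "127.0.0.1:7688", "management_server": "127.0.0.1:13012", "replication_server": "127.0.0.1:10002"};
+REGISTER INSTANCE instance_3 WITH CONFIG {"bolt_server": "127.0.0.1:7689", "management_server": "127.0.0.1:13013", "replication_server": "127.0.0.1:10003"};
+SET INSTANCE instance_1 TO MAIN;
+```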
+
+## User API
+
+### Register instance
+
+Registering instances should be done on a single coordinator. The chosen coordinator will become the cluster's leader.
+
+Register instance query will result in several actions:
+1. The coordinator instance will connect to the data instance on the `management_server` network address.
+2. The coordinator instance will start pinging the data instance every `--instance-health-check-frequency-sec` seconds to check its status.
+3. The data instance will be demoted from main to replica.
+4. The data instance will start the replication server on the `replication_server` network address.
+
+```plaintext
+REGISTER INSTANCE instanceName ( AS ASYNC | AS STRICT_SYNC ) ? WITH CONFIG {"bolt_server": boltServer, "management_server": managementServer, "replication_server": replicationServer};
+```
+
+This operation will result in writing to the Raft log.
+
+In case the main instance already exists in the cluster, the replica instance will automatically be connected to the main. The constructs `( AS ASYNC | AS STRICT_SYNC )` specify the
+instance's replication mode when the instance acts as a replica; if neither is given, the instance is registered as a SYNC replica. You can only have `STRICT_SYNC` and `ASYNC` or `SYNC` and `ASYNC` replicas together in the cluster. Combining `STRICT_SYNC`
+and `SYNC` replicas doesn't have proper semantic meaning, so it is forbidden.
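+
+For example, registering one default (SYNC) replica and one ASYNC replica, with hypothetical instance names and local endpoints:
+
+```
+REGISTER INSTANCE instance_2 WITH CONFIG {"bolt_server": "127.0.0.1:7688", "management_server": "127.0.0.1:13012", "replication_server": "127.0.0.1:10002"};
+REGISTER INSTANCE instance_3 AS ASYNC WITH CONFIG {"bolt_server": "127.0.0.1:7689", "management_server": "127.0.0.1:13013", "replication_server": "127.0.0.1:10003"};
+```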
+
+
+### Add coordinator instance
+
+The user can choose any coordinator instance to run cluster setup queries. This can be done before or after registering data instances,
+the order isn't important.
+
+```plaintext
+ADD COORDINATOR coordinatorId WITH CONFIG {"bolt_server": boltServer, "coordinator_server": coordinatorServer};
+```
+
+
+
+The `ADD COORDINATOR` query needs to be run for all coordinators in the cluster.
+
+```
+ADD COORDINATOR 1 WITH CONFIG {"bolt_server": "127.0.0.1:7691", "coordinator_server": "127.0.0.1:10111", "management_server": "127.0.0.1:12111"};
+ADD COORDINATOR 2 WITH CONFIG {"bolt_server": "127.0.0.1:7692", "coordinator_server": "127.0.0.1:10112", "management_server": "127.0.0.1:12112"};
+ADD COORDINATOR 3 WITH CONFIG {"bolt_server": "127.0.0.1:7693", "coordinator_server": "127.0.0.1:10113", "management_server": "127.0.0.1:12113"};
+```
+
+
+
+### Remove coordinator instance
+
+If, during cluster setup or at some later stage of the cluster's life, the user decides to remove a coordinator instance, the `REMOVE COORDINATOR` query can be used.
+This query can only be executed on the leader, and it can only remove follower coordinators. The current cluster's leader cannot be removed, since this is prohibited
+by NuRaft. In order to remove the current leader, you first need to trigger a leadership change.
+
+```plaintext
+REMOVE COORDINATOR coordinatorId;
+```
+
+
+### Set instance to main
+
+Once all data instances are registered, one data instance should be promoted to main. This can be achieved by using the following query:
+
+```plaintext
+SET INSTANCE instanceName TO MAIN;
+```
+
+This query will register all other instances as replicas to the new main. If one of the instances is unavailable, setting the instance to main will not succeed.
+If there is already a main instance in the cluster, this query will fail.
+
+This operation will result in writing to the Raft log.
+
+### Demote instance
+
+The `DEMOTE INSTANCE` query can be used by an admin to demote the current main to a replica. In this case, the leader coordinator won't perform a failover; as a user,
+you should promote one of the data instances to main using the `SET INSTANCE instanceName TO MAIN` query.
+
+```plaintext
+DEMOTE INSTANCE instanceName;
+```
+
+This operation will result in writing to the Raft log.
+
+
+
+By combining the `DEMOTE INSTANCE instanceName` and `SET INSTANCE instanceName TO MAIN` queries, you get manual failover capability, as sketched below. This can be useful,
+e.g., during maintenance work on the instance where the current main is deployed.
+
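+A minimal sketch of that manual failover, assuming hypothetical instance names `instance_1` (the current main) and `instance_2` (the replica you want to promote), executed on the leader coordinator:
+
+```
+DEMOTE INSTANCE instance_1;
+SET INSTANCE instance_2 TO MAIN;
+```
+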
+
+
+
+### Unregister instance
+
+There are various reasons which could lead to the decision that an instance needs to be removed from the cluster: the hardware can break,
+network communication could be set up incorrectly, etc. The user can remove an instance from the cluster using the following query:
+
+```plaintext
+UNREGISTER INSTANCE instanceName;
+```
+
+When unregistering an instance, ensure that the instance being unregistered is
+**not** the main instance. Unregistering main can lead to an inconsistent
+cluster state. Additionally, the cluster must have an **alive** main instance
+during the unregistration process. If no main instance is available, the
+operation cannot be guaranteed to succeed.
+
+The instance requested to be unregistered will also be unregistered from the current main's replica set.
+
+### Force reset cluster state
+
+In case the cluster gets stuck, there is an option to force reset the cluster state. You need to execute the command on the leader coordinator.
+This command will result in the following actions:
+
+1. The coordinator instance will demote each alive instance to replica.
+2. From the alive instances, it will choose a new main instance.
+3. Instances that are down will be demoted to replicas once they come back up.
+
+```plaintext
+FORCE RESET CLUSTER STATE;
+```
+
+This operation will result in writing to the Raft log.
+
+### Show instances
+
+You can check the state of the whole cluster using the `SHOW INSTANCES` query. The query will display all the Memgraph servers visible in the cluster. For
+each server you can see the following information:
+ 1. Network endpoints they are using for managing the cluster state
+ 2. Health state of the server
+ 3. Role - main, replica, LEADER, FOLLOWER or unknown if not alive
+ 4. The time passed since the last response to the leader's health ping
+
+This query can be run on either the leader or followers. Since only the leader knows the exact status of the health state and last response time,
+followers will execute actions in this exact order:
+ 1. Try contacting the leader to get the health state of the cluster, since the leader has all the information.
+ If the leader responds, the follower will return the result as if the `SHOW INSTANCES` query was run on the leader.
+ 2. When the leader doesn't respond or currently there is no leader, the follower will return all the Memgraph servers
+ with the health state set to "down".
+
+```plaintext
+SHOW INSTANCES;
+```
+
+
+### Show instance
+
+You can check the state of the current coordinator to which you are connected by running the following query:
+
+```plaintext
+SHOW INSTANCE;
+```
+
+This query will return information about:
+1. the instance name
+2. the external bolt server to which you can connect using Memgraph clients
+3. the coordinator server over which Raft communication is done
+4. the management server which is also used for inter-coordinator communication, and
+5. the cluster role: whether the coordinator is currently a leader or a follower.
+
+If the `ADD COORDINATOR` query wasn't run for the current instance, the value of the bolt server will be `""`.
+
+### Show replication lag
+
+The user can find the current replication lag on each instance by running `SHOW REPLICATION LAG` on the cluster's leader. The replication lag is expressed as
+the number of committed transactions. This information is made durable through snapshots and WALs, so restarts won't cause information loss. The information
+about the replication lag can be useful when manually performing a failover, to check whether there is a risk of data loss.
+
+```plaintext
+SHOW REPLICATION LAG;
+```
+
+
+## Setting config for highly-available cluster
+
+There are several flags that you can use for managing the cluster. The flag `--management-port` is used by both data instances
+and coordinators. The provided port needs to be unique. Setting the flag creates an RPC server on the instance, capable of
+responding to the coordinator's RPC messages.
+
+
+
+RPC (Remote Procedure Call) is a protocol for executing functions on a remote
+system. RPC enables direct communication in distributed systems and is crucial
+for replication and high availability tasks.
+
+
+
+The flags `--coordinator-id`, `--coordinator-port` and `--management-port` need to be unique and specified on coordinator instances. They will cause the creation of a Raft
+server that coordinator instances use for communication. The flag `--instance-health-check-frequency-sec` specifies how often the leader coordinator should
+check the status of the replication instance to update its status. The flag `--instance-down-timeout-sec` gives the user the ability to control how much time should
+pass before the coordinator starts considering the instance to be down.
+
+There is a configuration option for specifying whether reads from the main are enabled. The configuration value is `false` by default but can be changed at runtime
+using the following query:
+
+```
+SET COORDINATOR SETTING 'enabled_reads_on_main' TO 'true'/'false' ;
+```
+
+Users can also choose whether failover to the async replica is allowed by using the following query:
+
+```
+SET COORDINATOR SETTING 'sync_failover_only' TO 'true'/'false' ;
+```
+
+Users can control the maximum transaction lag allowed during failover through configuration. If a replica is behind the main instance by more than the configured threshold,
+that replica becomes ineligible for failover. This prevents data loss beyond the user's acceptable limits.
+
+To implement this functionality, we employ a caching mechanism on the cluster leader that tracks the replicas' lag. The cache gets updated with each StateCheckRpc response from
+replicas. During the brief failover window on the coordinators' side, the new cluster leader may not have the current lag information for all data instances, and in that case,
+any replica can become main. This trade-off is intentional: it avoids flooding Raft logs with frequently-changing lag data while maintaining failover safety guarantees
+in the large majority of situations.
+
+
+The configuration value can be controlled using the query:
+
+```
+SET COORDINATOR SETTING 'max_failover_replica_lag' TO '10' ;
+```
+
+
+
+
+By default, the value of `sync_failover_only` is `true`, which means that only SYNC replicas are candidates in the election. When the value is set to `false`, ASYNC replicas are also considered, but
+there is an additional risk of experiencing data loss. However, failover to an ASYNC replica may be necessary when other SYNC replicas are down and you want to
+manually perform a failover.
+
+
+Users can control the maximum allowed replica lag to maintain read consistency. When a replica falls behind the current main by more than `max_replica_read_lag_` transactions, the
+bolt+routing protocol will exclude that replica from read query routing to ensure data freshness.
+
+The configuration value can be controlled using the query:
+
+
+```
+SET COORDINATOR SETTING 'max_replica_read_lag_' TO '10' ;
+```
+
+All run-time configuration options can be retrieved using:
+
+```
+SHOW COORDINATOR SETTINGS ;
+```
+
+
+
+
+Consider an instance to be down only if several consecutive pings fail, since a single ping can fail for a large number of different reasons in distributed systems.
+
+
+
+### RPC timeouts
+
+For the majority of RPC messages, Memgraph uses a default timeout of 10s. This is to ensure that when sending an RPC request, the client
+will not block indefinitely before receiving a response if the communication between the client and the server is broken. The list of RPC messages
+for which the timeout is used is the following:
+
+- ShowInstancesReq -> coordinator sending to coordinator
+- DemoteMainToReplicaReq -> coordinator sending to data instances
+- PromoteToMainReq -> coordinator sending to data instances
+- RegisterReplicaOnMainReq -> coordinator sending to data instances
+- UnregisterReplicaReq -> coordinator sending to data instances
+- EnableWritingOnMainReq -> coordinator sending to data instances
+- GetInstanceUUIDReq -> coordinator sending to data instances
+- GetDatabaseHistoriesReq -> coordinator sending to data instances
+- StateCheckReq -> coordinator sending to data instances. The timeout is set to 5s.
+- SwapMainUUIDReq -> coordinator sending to data instances
+- FrequentHeartbeatReq -> main sending to replica. The timeout is set to 5s.
+- HeartbeatReq -> main sending to replica
+- TimestampReq -> main sending to replica
+- SystemHeartbeatReq -> main sending to replica
+- ForceResetStorageReq -> main sending to replica. The timeout is set to 60s.
+- SystemRecoveryReq -> main sending to replica. The timeout is set to 5s.
+- FinalizeCommitReq -> main sending to replica. The timeout is set to 10s.
+
+
+For RPC messages which send a variable number of storage deltas — PrepareCommitRpc, CurrentWalRpc, and
+WalFilesRpc — it is not practical to set a strict execution timeout. The
+processing time on the replica side is directly proportional to the number of
+deltas being transferred. To handle this, the replica sends periodic progress
+updates to the main instance after processing every 100,000 deltas. Since
+processing 100,000 deltas is expected to take a relatively consistent amount of
+time, we can enforce a timeout based on this interval. The default timeout for
+these RPC messages is 30 seconds, though in practice, processing 100,000 deltas
+typically takes less than 3 seconds.
+
+SnapshotRpc is also a replication-related RPC message, but its execution time
+is tracked a bit differently from RPC messages shipping deltas. The replica sends an update to the main instance after
+completing 1,000,000 units of work. The work units are assigned as follows:
+
+- Processing nodes, edges, or indexed entities (label index, label-property index,
+ edge type index, edge type property index) = 1 unit
+- Processing a node inside a point or text index = 10 units
+- Processing a node inside a vector index (most computationally expensive) =
+ 1,000 units
+
+With this unit-based tracking system, the replica is expected to report progress
+every 2–3 seconds. Given this, a timeout of 60 seconds is set to avoid
+unnecessary network instability while ensuring responsiveness.
+
+In addition to timeouts on read and write operations, Memgraph also has a timeout of 5s
+for sockets when establishing a connection. Such a timeout helps keep p99
+latencies low when using the RPC stack, which manifests for users as smooth and predictable
+network communication between instances.
+
+
+## Failover
+
+### Determining instance's health
+
+Every `--instance-health-check-frequency-sec` seconds, the coordinator contacts each instance.
+The instance is not considered to be down unless `--instance-down-timeout-sec` has passed and the instance hasn't responded to the coordinator in the meantime.
+Users must set `--instance-health-check-frequency-sec` to be less than or equal to `--instance-down-timeout-sec`, and we advise users to set `--instance-down-timeout-sec` to
+a multiple of `--instance-health-check-frequency-sec`, with a multiplier coefficient of N >= 2.
+For example, set `--instance-down-timeout-sec=5` and `--instance-health-check-frequency-sec=1`, which will result in the coordinator contacting each instance every second and
+considering the instance dead after it doesn't respond 5 times (5 seconds / 1 second).
+
+In case a replica doesn't respond to a health check, the leader coordinator will try to contact it again every `--instance-health-check-frequency-sec`.
+When the replica instance rejoins the cluster (comes back up), it always rejoins as a replica. For the main instance, there are two options.
+If it is down for less than `--instance-down-timeout-sec`, it will rejoin as main because it is still considered alive. If it is down for more than `--instance-down-timeout-sec`,
+the failover procedure is initiated. Whether the main will rejoin as main depends on the success of the failover procedure. If the failover procedure succeeds, the old main
+will rejoin as a replica. If the failover doesn't succeed, the main will rejoin as main once it comes back up.
+
+### Failover procedure - high level description
+
+From the alive replicas, the coordinator chooses a new potential main and writes a log entry about the new main to the Raft storage. On the leader's next ping to that instance,
+it sends an RPC request to the new main, which is still in the replica state, telling it to promote itself to the main instance, together with info
+about the other replicas to which it will replicate data. Once that request succeeds, the new main can start replicating to the other instances and accept write queries.
+
+### Choosing new main from available replicas
+
+
+During failover, the coordinator must select a new main instance from available replicas, as some may be offline. The leader coordinator queries each live replica to
+retrieve the committed transaction count for every database.
+
+The selection algorithm prioritizes data recency using a two-phase approach:
+
+1. **Database majority rule**: The coordinator identifies which replica has the highest committed transaction count for each database. The replica that leads in the most
+databases becomes the preferred candidate.
+2. **Total transaction tiebreaker**: If multiple replicas tie for leading the most databases, the coordinator sums each replica's committed transactions across all databases.
+The replica with the highest total becomes the new main.
+
+This approach ensures the new main instance has the most up-to-date data across the cluster while maintaining consistency guarantees.
+
+### Old main rejoining to the cluster
+
+Once the old main gets back up, the coordinator sends an RPC request to demote the old main to replica. The coordinator tracks at all times which instance was the last main.
+
+The leader coordinator sends two RPC requests, in the given order, to demote the old main to a replica:
+1. Demote main to replica RPC request
+2. A request to store the UUID of the current main, which the old main, now acting as a replica instance, must listen to.
+
+### How a replica knows which main to listen to
+
+Each replica stores the UUID of the main it listens to. If a network partition happens where the main can talk to a replica but the coordinator can't talk to the main, from the coordinator's
+point of view that main is down. From the replica's point of view, the main instance is still alive. The coordinator will start the failover procedure, and we could end up with multiple mains
+where replicas listen to both mains. To prevent such an issue, each replica gets a new UUID that no current main has. The coordinator generates the new UUID,
+which the new main will get to use upon its promotion to main.
+
+If a replica was down at some point, the main could have changed in the meantime. When the replica gets back up, it doesn't listen to any main until the coordinator sends it an RPC request to start
+listening to the main with the given UUID.
+
+### Replication concerns
+
+#### Force sync of data
+
+During a failover event, Memgraph selects the most up-to-date, alive instance to
+become the new main. The selection process works as follows:
+1. From the list of available replica instances, Memgraph chooses the one with
+the latest commit timestamp for the default database.
+2. If an instance that had more recent data was down during this selection
+process, it will not be considered for promotion to main.
+
+If a previously down instance had more up-to-date data but was unavailable
+during failover, it will go through a specific recovery process upon rejoining
+the cluster:
+- The replica will reset its storage.
+- The replica will receive all commits from the new main to
+ synchronize its state.
+- The replica's old durability files will be preserved in a `.old` directory in
+ `data_directory/snapshots` and `data_directory/wal` folders, allowing admins
+ to manually recover data if needed.
+
+Depending on the replication mode used, there are different levels of data loss
+that can happen upon the failover. With the default `SYNC` replication mode,
+Memgraph prioritizes availability over strict consistency and can result in
+a non-zero Recovery Point Objective (RPO), that is, the loss of committed data, because:
+- The promoted main might not have received all commits from the previous main
+ before the failure.
+- This design ensures that the main remains writable for the maximum possible
+ time.
+
+With the `ASYNC` replication mode, you also risk losing some data upon failover because
+the main can freely continue committing regardless of the status of ASYNC replicas.
+
+The `STRICT_SYNC` replication mode allows users to experience a failover without any data loss
+in all situations. It comes with reduced throughput because of the cost of running a two-phase commit protocol.
+
+
+## Actions on follower coordinators
+
+From follower coordinators you can only execute `SHOW INSTANCES`. Registering data instances, unregistering data instances, demoting an instance, setting an instance to main and
+force resetting the cluster state are all disabled.
+
+
+## Instances' restart
+
+### Data instances' restart
+
+Data instances can fail both as main and as replica. When an instance that was a replica comes back, it won't accept updates from any instance until the coordinator updates its
+responsible peer. This should happen automatically when the coordinator's ping to the instance succeeds. When the main instance comes back, any writing to the main instance will be
+forbidden until a ping from the coordinator succeeds.
+
+### Coordinator instances restart
+
+In case a coordinator instance dies and is restarted, it will not lose any data from the Raft log or Raft snapshots, since coordinator data is always backed up by durable storage.
+For more details, read about high availability durability in the durability chapter below.
+
+
+## Durability
+
+All NuRaft data is made durable by default. This includes all Raft logs, Raft snapshots and information about cluster connectivity.
+The details about the cluster connectivity are made durable since without that information, the coordinator can't rejoin the cluster on its restart.
+
+Information about logs and snapshots is stored under one RocksDB instance in the `high_availability/raft_data/logs` directory stored under
+the top-level `--data-directory` folder. All the data stored there is recovered in case the coordinator restarts.
+
+Data about other coordinators is recovered from the `high_availability/raft_data/network` directory stored under
+the top-level `--data-directory` folder. When the coordinator rejoins, it will reestablish the communication with other coordinators and receive updates from the current leader.
+
+### First start
+
+On the first start of the coordinators, each will store the current version of the `logs` and `network` durability stores. From that point on,
+each Raft log that is sent to the coordinator is also stored on disk. For every new coordinator instance, the server config is updated. Logs are created
+for each user action and failover action. Snapshots are created every N logs (N currently being 5).
+
+
+### Restart of coordinator
+
+In case of the coordinator's failure, on the restart, it will read information about other coordinators stored under `high_availability/raft_data/network` directory.
+
+From the `network` directory, we recover the server state from before the coordinator stopped, including the current term, whom the coordinator voted for, and whether
+the election timer is allowed.
+
+It will also recover the following server config information:
+- other servers, including their endpoints, id, and auxiliary data
+- ID of the previous log
+- ID of the current log
+- additional data needed by NuRaft
+
+The following information will be recovered from a common RocksDB `logs` instance:
+- current version of `logs` durability store
+- snapshots found with `snapshot_id_` prefix in database:
+ - coordinator cluster state - all data instances with their role (main or replica), all coordinator instances and UUID of main instance which replica is listening to
+ - last log idx
+ - last log term
+ - last cluster config
+- logs found in the interval between the start index and the last log index
+ - data - each log holds data on what has changed since the last state
+  - term - NuRaft term
+  - log type - NuRaft log type
+
+
+### Handling of durability errors
+
+If snapshots are not correctly stored, an exception is thrown and left for the NuRaft library to handle. Logs can be missed and not stored, since they are compacted and
+deleted every two snapshots and will be removed relatively fast.
+
+Memgraph throws an error when failing to store the cluster config, which is updated in the `high_availability/raft_data/network` folder.
+If this happens, it will happen only on the first cluster start, when coordinators are connecting, since
+coordinators are configured only once, at the start of the whole cluster. This is a non-recoverable error since, in case the coordinator rejoins the cluster and has
+the wrong state of the other coordinators, it can become a leader without being connected to the other coordinators.
+
+
+## Recovering from errors
+
+Distributed systems can fail in numerous ways. Memgraph processes are resilient to network
+failures, omission faults and independent machine failures. Byzantine failures aren't tolerated since the Raft consensus protocol cannot deal with them either.
+
+Recovery Time Objective (RTO) is an often used term for measuring the maximum tolerable length of time that an instance or cluster can be down.
+Since every highly available Memgraph cluster has two types of instances, we need to analyze the failures of each separately.
+
+Raft is a quorum-based protocol and it needs a majority of instances alive in order to stay functional. Hence, with just one coordinator instance down, RTO is 0 since
+the cluster stays available. With 2+ coordinator instances down
+(in a cluster with RF = 3), the RTO depends on the time needed for instances to come back.
+
+Depending on the replica's replication mode, its failure can lead to different situations. If the replica was registered in STRICT_SYNC mode, then on its failure, writing
+on the main will be disabled. On the other hand, if the replica was registered as ASYNC or SYNC, further writes on the main are still allowed. In both cases, reads are still allowed from the
+main and the other replicas.
+
+
+The most important thing to analyze is what happens when the main goes down. In that case, the leader coordinator uses
+user-controllable parameters related to the frequency of health checks from the leader to replication instances (`--instance-health-check-frequency-sec`)
+and the time needed to realize the instance is down (`--instance-down-timeout-sec`). After collecting enough evidence, the leader concludes the main is down and performs failover
+using just a handful of RPC messages (the exact time depends on the network distance between instances). It is important to mention that the whole failover is performed without the loss of committed data
+if the newly chosen main (previously a replica) had all the up-to-date data.
+
+## Raft configuration parameters
+
+Several Raft-related parameters are important for the correct functioning of the cluster. The leader coordinator sends a heartbeat
+message to other coordinators every second to determine their health. This configuration option is connected with the leader election timeout, which
+is a randomized value from the interval [2000ms, 4000ms] and which is used by followers to decide when to trigger a new election process. Leadership
+expiration is set to 2000ms so that the cluster can never get into a situation where multiple leaders exist. These specific values give the cluster
+the ability to survive occasional network hiccups without triggering leadership changes.
+
+
+## Data center failure
+
+The architecture we currently use allows us to deploy coordinators in 3 data centers and hence tolerate the failure of a whole data center. Data instances can be freely
+distributed in any way you want between data centers. The failover time will be slightly increased due to the network communication needed.
+
+
diff --git a/pages/clustering/concepts/how-replication-works.mdx b/pages/clustering/concepts/how-replication-works.mdx
new file mode 100644
index 000000000..2dfd35380
--- /dev/null
+++ b/pages/clustering/concepts/how-replication-works.mdx
@@ -0,0 +1,352 @@
+---
+title: How replication works
+description: Learn about the underlying implementation and theoretical concepts behind Memgraph replication, including CAP theorem, replication modes, and synchronization mechanisms.
+---
+
+import { Callout } from 'nextra/components'
+import {CommunityLinks} from '/components/social-card/CommunityLinks'
+
+# How replication works in Memgraph
+
+Uninterrupted data and operational availability in production systems are
+critical and can be achieved in many ways. In Memgraph we opted for replication.
+
+In distributed systems theory the CAP theorem, also named Brewer's theorem,
+states that any distributed system can simultaneously guarantee two out of the
+three properties:
+
+1. **Consistency** (C) - every node has the same view of data at a given point in
+ time
+2. **Availability** (A) - all clients can find a replica of the data, even in the
+ case of a partial node failure
+3. **Partition tolerance** (P) - the system continues to work as expected despite a
+ partial network failure
+
+
+
+Most of the Memgraph use cases do not benefit from well-known algorithms that
+strive to achieve all three CAP properties, such as Raft, because, due to their
+complexity, they introduce performance issues. Memgraph use cases are based on
+running analytical graph workloads on real-time data, demanding a simpler
+concept such as **replication**.
+
+Replication consists of replicating data from one storage to one or several
+other storages. The downside of its simplicity is that only two out of three CAP
+properties can be achieved.
+
+### Replication implementation in Memgraph
+
+To enable replication, there must be at least two instances of Memgraph in a
+cluster. Each instance has one of two roles: **MAIN** or **REPLICA**.
+
+
+
+The MAIN instance can accept read and write queries to the database, while the REPLICA instances accept only
+read queries.
+
+
+
+During the initial startup, all instances are MAIN by default. When creating a replication cluster,
+one instance has to be chosen as the MAIN instance.
+The rest of the instances have to be **demoted to the REPLICA role**. Replicas receive data by creating an RPC
+replication server which listens on a configurable port.
+
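+As an illustrative sketch of that setup with Memgraph Community (hypothetical replica name, address and port; the exact syntax and options are covered in the [replication guide](/clustering/replication)):
+
+```
+// On the instance that should become a REPLICA (the port is user-chosen):
+SET REPLICATION ROLE TO REPLICA WITH PORT 10000;
+
+// On the MAIN instance:
+REGISTER REPLICA REP1 SYNC TO "172.17.0.3:10000";
+```
+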
+The way MAIN instance replicates data to the REPLICA instances can be carried out in a
+**SYNC**, **ASYNC**, or **STRICT_SYNC** mode.
+The replication mode defines the terms by which the MAIN instance can commit the
+changes to the database, thus modifying the system to prioritize either
+consistency or availability.
+
+- **STRICT_SYNC mode** - Replication is implemented as a [two-phase commit protocol (2PC)](https://en.wikipedia.org/wiki/Two-phase_commit_protocol).
+After committing a transaction, the MAIN instance will communicate the changes to all REPLICA instances
+and wait until it receives a response or information that a timeout is reached.
+The STRICT_SYNC mode ensures consistency and partition tolerance (CP), but not availability for writes.
+If the MAIN instance has multiple replicas, the system is highly available for reads. But when a replica fails,
+the MAIN instance can't process writes, due to the nature of strictly synchronous replication.
+
+- **SYNC mode** - After committing a transaction, the MAIN instance will communicate the changes to all
+REPLICA instances and wait until it receives a response or information that a timeout is reached.
+It differs from the STRICT_SYNC mode because the MAIN can continue committing even in situations when
+a SYNC replica is down.
+
+- **ASYNC mode** - The MAIN instance will commit a transaction without receiving confirmation from
+REPLICA instances that they have received the same transaction. ASYNC mode ensures system availability
+and partition tolerance (AP), while data can only be eventually consistent.
+
+By using the timestamp, the MAIN instance knows the current state of the
+REPLICA. If the REPLICA is not synchronized (lagging behind) with the MAIN instance, the MAIN
+instance sends the correct data for synchronization as WAL files. When all the WAL files have been
+successfully transferred to the REPLICA instance, the system is then considered to be in-sync.
+This procedure is similar to [how PostgreSQL does replication](https://www.postgresql.org/docs/current/warm-standby.html#STREAMING-REPLICATION).
+
+If the REPLICA is so far behind the MAIN instance that the synchronization using
+WAL files is impossible, Memgraph will use snapshots.
+
+### Replication modes
+
+Replication mode defines the terms by which the MAIN instance can commit the
+changes to the database, thus modifying the system to prioritize either
+consistency or availability. There are three possible replication modes
+implemented in Memgraph replication:
+
+- SYNC
+- ASYNC
+- STRICT_SYNC
+
+
+
+When a REPLICA instance is registered and added to the cluster, it will start
+receiving data to catch up to the current state of the MAIN instance. The initial replication
+when a REPLICA instance is registered is handled in ASYNC mode as a design decision.
+
+When the REPLICA instance synchronizes with the MAIN
+instance, the replication mode will change according to the mode defined during
+registration.
+
+#### SYNC replication mode
+
+SYNC mode is the most straightforward replication mode in which the main storage
+thread waits for the response and cannot continue until the response is
+received or a timeout is reached. If the REPLICA fails, the MAIN instance will still commit
+the data and move forward. This behaviour does not block writes on REPLICA failure, and still
+ensures that the other REPLICAs receive new data.
+
+The following diagrams express the behavior of the MAIN instance in cases when
+SYNC REPLICA doesn't answer within the expected timeout.
+
+**SYNC replication ensures consistency and partition tolerance (CP).**
+However, there is still a minimal chance of data loss. For complete consistency without any data loss,
+Memgraph offers the STRICT_SYNC replication mode.
+
+#### STRICT_SYNC replication mode
+
+The STRICT_SYNC replication mode behaves very similarly to the
+SYNC mode, except that the MAIN **won't commit a transaction locally in a situation in
+which one of the STRICT_SYNC replicas is down**. To achieve that, all instances run
+a *two-phase commit* protocol together, which enables such synchronization. This
+reduces the throughput, but such a mode is very useful in a high-availability
+scenario in which a failover is the most critical operation to support. Such a mode then
+allows you to fail over **without the fear of experiencing data loss**.
+
+**STRICT_SYNC mode ensures consistency and partition tolerance (CP).**
+
+#### ASYNC replication mode
+
+In the ASYNC replication mode, the MAIN instance will commit a transaction
+without receiving confirmation from REPLICA instances that they have received
+the same transaction. This means that the **MAIN instance does not wait for the
+response from the REPLICA instances** in the main thread but in some other thread.
+
+Each REPLICA instance has one permanent thread connecting it with
+the MAIN instance for ASYNC replication. Using this background thread, the MAIN instance pushes
+replication tasks to the REPLICA instance, creating a custom thread pool pattern,
+and receives confirmations of successful replication from the REPLICA
+instance.
+
+**ASYNC mode ensures system availability and partition tolerance (AP).**
+
+
+
+### REPLICA state
+
+There are 5 states in which a replica can be at any point in time:
+- **READY** - replica is not lagging behind and all the data is replicated
+- **REPLICATING** - state the REPLICA is in when it's receiving transaction commits.
+If this action succeeds, the replica will again move to READY state. If it fails, it will move to INVALID.
+- **INVALID/BEHIND** - replica is behind, and needs to be synced with MAIN
+- **RECOVERY** - after MAIN detects that a REPLICA is invalid/behind, the REPLICA state is changed to RECOVERY.
+At this point, the transfer of durability files is performed in order for the REPLICA to catch up with MAIN
+- **DIVERGED** - this is a state in which REPLICA can be found if you're performing manual failover. Manual conflict
+resolution and recovery of the cluster is needed in order for this state to convert to READY.
+
+Based on RPC heartbeats, the MAIN decides which state the REPLICA is in at any point in time. The REPLICA doesn't know by itself
+which state it is in. It doesn't need to know that, as the MAIN is the sole initiator of the synchronization mechanisms when
+performing replication or recovery.
+
+
+
+### How are instances synchronized?
+
+To understand how individual instances are keeping the state of the data in sync, we need to
+understand the basic durability entities which are replicated from MAIN:
+- **Snapshots** - Point-in-time images of the full database state. Snapshots are the largest durability objects
+  that are replicated.
+- **WALs (write-ahead logs)** - Append-only durability files that store sequences of committed deltas.
+  Because WALs are much smaller than snapshots, Memgraph prefers them for recovery when possible.
+- **Delta objects** - The smallest atomic updates produced when MAIN commits a transaction
+  (e.g., create/update/delete of nodes/edges/properties). A single transaction can have multiple deltas that need to be replicated
+  on commit. If the REPLICA is fully in sync, only Delta objects are replicated at commit time.
+  For more information about delta objects, please refer to the
+  [in-memory transactional storage mode guide](/fundamentals/storage-memory-usage#in-memory-transactional-storage-mode-default).
+
+
+
+To learn more about durability in Memgraph, check out the [data durability fundamentals](/fundamentals/data-durability).
+
+
+
+Each transaction in Memgraph has an auto-incrementing timestamp which acts as a time variable.
+
+In the ideal scenario, MAIN just sends the Delta objects to the REPLICA. In that case, the REPLICA
+transitions from READY to REPLICATING and then returns to the READY state. Transferring delta objects
+in real time is the optimal approach because they're the smallest units transferred over the network.
+This happy flow ensures the REPLICA is always in sync with MAIN.
+
+There are a variety of scenarios in which that happy flow cannot be maintained, such as network issues or an
+instance failure. By comparing timestamps, the MAIN instance knows when a REPLICA instance is not
+synchronized and is missing some earlier transactions. **If the REPLICA is behind (INVALID replica state),
+it will have a lower timestamp than the MAIN instance.** The REPLICA instance is
+then set into the RECOVERY state, where it remains until it is fully synchronized
+with the MAIN instance.
+
+**The missing data changes can be sent as snapshots or WAL files, which are the main
+data durability files in Memgraph.** Because of the difference in file size,
+Memgraph favors sending WAL files rather than snapshots. After all the
+necessary durability files have been sent over, the REPLICA can move back to the READY state.
+
+While the REPLICA instance is in the RECOVERY state, the MAIN instance
+calculates the optimal synchronization path based on the REPLICA instance's
+timestamp and the current state of the durability files while keeping the
+overall size of the files necessary for synchronization to a minimum.
+
+
+
+Imagine there were 5 changes made to the database. Each change is saved in a WAL
+file, so there are 5 WAL files, and the snapshot was created after 3 changes.
+The REPLICA instance can be synchronized using a snapshot and the 2 latest WAL
+files or using 5 WAL files. Both options would correctly synchronize the
+instances, but 5 WAL files are much smaller.
+
+If RECOVERY does not succeed, the REPLICA moves back and forth between the INVALID and RECOVERY
+states. The reason for this can again be network issues, but also data corruption. If you believe the
+system is not able to recover at all, please contact our Enterprise support or reach out on our Discord channel.
+
+### Replication of multi-tenant data (Enterprise)
+
+
+
+Memgraph's multi-tenancy offers management of multiple logically isolated databases. The word *database* here is a synonym
+for a *tenant*. Learn more about this on our [multi-tenancy documentation page](/database-management/multi-tenancy).
+
+
+
+When running multi-tenancy, multiple durability files can be sent over the network for
+each database. To ensure a correct mapping between the MAIN and REPLICA databases, each database has its
+own database UUID. When a new database is created on MAIN, the database is replicated to the REPLICAs
+together with its UUID. This ensures there is a mapping between the set of MAIN
+databases and the set of REPLICA databases.
+
+When sending replication data over the network, durability files are also tagged with
+the database UUID. The UUID uniquely identifies which database the durability
+files should be applied to on the replicas.
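+
+As an illustration, once replication is set up, creating a tenant on MAIN is enough for it to be
+replicated, together with its UUID, to the REPLICAs. The database name below is an example value:
+
+```cypher
+// On MAIN: create a new tenant database; it is replicated together with its UUID.
+CREATE DATABASE tenant1;
+// On a REPLICA: the same database should now be visible.
+SHOW DATABASES;
+```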
+
+
+
+
+
+Set up replication / high availability at the very beginning to make sure the correct information is replicated
+from MAIN to REPLICA. If you first created databases separately on standalone
+instances and only then connected them into a cluster, it would not work because database UUIDs are generated randomly.
+The operation would end up with a mismatch of database UUIDs, and you would not be able to recover the cluster.
+
+
+
+### Advanced replication topics
+
+
+
+The following section covers highly technical topics, dedicated to those who would
+like to know more about the technical implementation of replication in Memgraph.
+
+
+
+The durability files are constantly being created, deleted, and appended to. Also,
+each replica could need a different set of files to sync. There are several ways
+to ensure that the necessary files persist and that instances can read the WAL
+files currently being updated without affecting the performance of the rest of
+the database.
+
+#### Locking durability files
+
+Durability files are also used for recovery and are periodically deleted to
+eliminate redundant data. The problem is that they can be deleted while they are
+being used to synchronize a REPLICA with the MAIN instance.
+
+To delay the file deletion, Memgraph uses a file retainer that consists of
+multiple lockers. Threads can store the files they find while searching for the
+optimal recovery path and lock them in a locker, thus ensuring the files
+will still exist once they are sent to the REPLICA instance as part of the
+synchronization process. If another part of the system sends a deletion
+request for a certain file, the file retainer first checks if that file is
+locked in a locker. If it is not, it is deleted immediately. If the file is
+locked, the file retainer adds the file to the deletion queue. The file retainer
+will periodically clean the queue by deleting the files that are no longer
+locked inside the locker.
+
+#### Writing and reading files simultaneously
+
+Memgraph's internal file buffer is used when writing deltas to WAL files, so
+mid-write, the content of one WAL file can be divided across two locations: the file on
+disk and the internal buffer. If at that point the WAL file is used to synchronize the
+REPLICA instance, the buffer can be flushed while its data is being read, and
+the REPLICA could receive an invalid WAL file because it is missing a chunk of
+data. It could also happen that the WAL file is sent before all the transactions
+are written to the internal buffer.
+
+To avoid these issues, flushing of that internal buffer is disabled while the
+current WAL is sent to a REPLICA instance. To get all the data necessary for the
+synchronization, the replication thread reads the content directly from the WAL
+file, then reads how many bytes are written in the buffer and copies the data to
+another location. Then the flushing is enabled again, and the transaction is
+replicated using the copied buffer. Because access to the internal buffer
+is not blocked, new data can be written in the meantime. The content of the buffer (including
+any new data) is then written to a new WAL file that will be sent in the next
+synchronization process.
+
+
+
+#### Fixing timestamp consistency
+
+Timestamps are used to compare the state of the REPLICA instance with that of the
+MAIN instance.
+
+At first, we used the current timestamp without increasing its value for global
+operations, like creating an index or creating a constraint. By using a single
+timestamp, it was impossible to know which operations the REPLICA had applied
+because sequential global operations had the same timestamp. To avoid this
+issue, a unique timestamp is assigned to each global operation.
+
+As replicas allow read queries, each of those queries used to be assigned its own
+timestamp. Those timestamps caused issues when the replicated write transactions
+were assigned an older timestamp: the same read query could return different data
+in two read transactions if a write transaction was replicated between them, which broke
+snapshot isolation. To avoid this problem, the
+**timestamp on REPLICA instances isn't increased**, because read transactions
+don't produce any changes, so no deltas need to be timestamped.
+
+#### Epoch ID as a complement to timestamp ID
+
+A unique ID `epoch_id` is also assigned each time an instance is run as the MAIN
+instance in the replication cluster to check if the data is compatible for
+replication. The `epoch_id` is necessary when the original MAIN instance fails,
+a REPLICA instance becomes a new MAIN, and after some time, the original MAIN
+instance is brought back online. If no transactions were run on the original
+MAIN instance, the difference in timestamps will indicate that it is behind the
+new MAIN, and it would be impossible to set the original MAIN-REPLICA
+relationship. But if the transactions were run on the original MAIN after it was
+brought back online, the timestamp would be of no help, but the `epoch_id` would
+indicate incomparability, thus preventing the original MAIN from reclaiming its
+original role.
+
+
+
+#### System data replication
+
+The main section of this guide outlined how graph data replication works.
+When we talk about data storage, we strictly mean the graph itself, along with the complementary
+performance and correctness data structures: nodes, relationships, properties, indices,
+constraints, triggers, and streams.
+For replication support of non-graph data, such as authentication configuration and multi-tenant data, please refer
+to the [system replication reference](/clustering/replication/system-replication).
+
+
diff --git a/pages/clustering/faq.mdx b/pages/clustering/faq.mdx
new file mode 100644
index 000000000..da8c5cad9
--- /dev/null
+++ b/pages/clustering/faq.mdx
@@ -0,0 +1,49 @@
+---
+title: Frequently asked questions about clustering
+description: Explore the documentation page for Memgraph and access the FAQ section to find solutions to common queries and concerns. Discover essential information and insights now.
+---
+import { CommunityLinks } from '/components/social-card/CommunityLinks'
+
+
+# Frequently asked questions
+
+## High availability (general)
+
+#### Does Memgraph support chaining REPLICA instances?
+Memgraph currently doesn't support chaining REPLICA instances, that is, a REPLICA
+instance cannot replicate its data to another REPLICA instance.
+
+#### Can a REPLICA listen to multiple MAIN instances?
+Memgraph enforces that a REPLICA can only listen to exactly one MAIN instance.
+When any Memgraph instance is started, it is assigned a unique instance UUID. This UUID is communicated
+when a replica is registered, to ensure the REPLICA does not receive replication data from another MAIN instance.
+A REPLICA stores the UUID of the MAIN instance it listens to.
+The instance UUID of each Memgraph instance is persisted on disk across restarts, so this behaviour is enforced throughout the
+cluster lifecycle.
+
+#### Can a REPLICA create snapshots by itself?
+No. REPLICA can only receive snapshots during the recovery phase.
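+
+For context, snapshots are created on MAIN, either periodically or on demand, and then shipped to
+replicas when needed. A minimal sketch of the manual trigger (a REPLICA is expected to reject this
+query, since it only receives snapshots from MAIN during recovery):
+
+```cypher
+// On MAIN: manually trigger a snapshot.
+CREATE SNAPSHOT;
+```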
+
+#### Can a REPLICA create WALs by itself?
+Yes, this already happens in the system. When MAIN commits, it sends the Delta objects to the REPLICA.
+The replica then does two things:
+- it applies the Delta objects from MAIN to catch up
+- it writes the Delta objects to its own WAL files
+
+Why is this important?
+Picture the following scenario. The REPLICA is up to date with MAIN, which is constantly sending delta objects to it.
+After a while, MAIN goes down and the REPLICA is promoted to be the new MAIN. The new MAIN would not have any durability
+files if it hadn't written WALs during its period of being a REPLICA. If the old MAIN comes back up as a REPLICA, the new MAIN
+might then have insufficient information to send to it. That's why a REPLICA always writes what it
+receives into its own WAL files.
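+
+This is what makes a manual promotion possible with Memgraph Community. A minimal sketch, with
+placeholder replica names and addresses (with Memgraph Enterprise, the coordinators perform this
+failover automatically):
+
+```cypher
+// On the surviving REPLICA: promote it to MAIN.
+SET REPLICATION ROLE TO MAIN;
+// On the new MAIN: re-register the remaining replicas.
+REGISTER REPLICA rep2 SYNC TO "10.0.0.13:10000";
+```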
+
+
+## High availability with Docker
+
+## High availability with Docker Compose
+
+## High availability with K8s
+
+
+
+
diff --git a/pages/clustering/high-availability.mdx b/pages/clustering/high-availability.mdx
index 92307249d..2feb69b53 100644
--- a/pages/clustering/high-availability.mdx
+++ b/pages/clustering/high-availability.mdx
@@ -1,1253 +1,36 @@
---
title: High availability
-description: Dive into the documentation page for Memgraph and learn how to configure a cluster of Memgraph instances to achieve high availability.
+description: Dive into the documentation page for Memgraph and learn how to configure high availability for data redundancy, DR and 24/7 uptime.
---
import { Callout } from 'nextra/components'
-import { Steps } from 'nextra/components'
-import {CommunityLinks} from '/components/social-card/CommunityLinks'
+import { CommunityLinks } from '/components/social-card/CommunityLinks'
+# How to set up high availability with Memgraph (Enterprise)
-# High availability (Enterprise)
-
-A cluster is considered highly available if, at any point, there is some instance that can respond to a user query. Our high availability
-relies on replication. The cluster consists of:
-- The main instance on which the user can execute write queries
-- replica instances that can only respond to read queries
-- COORDINATOR instances that manage the cluster state.
-
-Depending on how configuration flags are set, Memgraph can run as a data instance or coordinator instance.
-The coordinator instance is a new addition to enable the high availability feature and orchestrates data instances to ensure that there is always one main instance in the cluster.
-
-## Cluster management
-
-For achieving high availability, Memgraph uses Raft consensus protocol, which is very similar to Paxos in terms of performance and fault-tolerance but with
-a significant advantage that it is much easier to understand. It's important to say that Raft isn't a
-Byzantine fault-tolerant algorithm. You can learn more about Raft in the paper [In Search of an Understandable Consensus Algorithm](https://raft.github.io/raft.pdf).
-
-Typical Memgraph's highly available cluster consists of 3 data instances (1 main and 2 replicaS) and 3 coordinator instances backed up by Raft protocol.
-Users can create more than 3 coordinators, but the replication factor (RF) of 3 is a de facto standard in distributed databases.
-
-One coordinator instance is the leader whose job is to always ensure one writeable data instance (main). The other two coordinator instances replicate
-changes the leader coordinator did in its own Raft log. Operations saved into the Raft log are those that are related to cluster management. Memgraph doesn't have its
-implementation of the Raft protocol. For this task, Memgraph uses an industry-proven library [NuRaft](https://github.com/eBay/NuRaft).
-
-You can start the coordinator instance by specifying `--coordinator-id`,
-`--coordinator-port` and `--management-port` flags. Followers ping the leader on the `--management-port` to get health state of the cluster. The coordinator instance only responds to
-queries related to high availability, so you cannot execute any data-oriented query on it. The coordinator port is used for the Raft protocol, which
-all coordinators use to ensure the consistency of the cluster's state. Data instances are distinguished from coordinator instances by
-specifying only `--management-port` flag. This port is used for RPC network communication between the coordinator and data
-instances. When started by default, the data instance is main. The coordinator will ensure that no data inconsistency can happen during and after the instance's
-restart. Once all instances are started, the user can start adding data instances to the cluster.
-
-
-
-
-The Raft consensus algorithm ensures that all nodes in a distributed system
-agree on a single source of truth, even in the presence of failures, by electing
-a leader to manage a replicated log. It simplifies the management of the
-replicated log across the cluster, providing a way to achieve consistency and
-coordination in a fault-tolerant manner. Users are advised to use an odd number of instances
-since Raft, as a consensus algorithm, works by forming a majority in the decision making.
-
-
-
-## Observability
-
-Monitoring the cluster state is very important and tracking various metrics can provide us with a valuable information. Currently, we track
-metrics which reveal us p50, p90 and p99 latencies of RPC messages, the duration of recovery process and the time needed to react to changes
-in the cluster. We are also counting the number of different RPC messages exchanged and the number of failed requests since this can give
-us information about parts of the cluster that need further care. You can see the full list of metrics [here](/database-management/monitoring#system-metrics).
-
-## Bolt+routing
-
-Directly connecting to the main instance isn't preferred in the HA cluster since
-the main instance changes due to various failures. Because of that, users can
-use **Bolt with routing**, which ensures that write queries are always sent to
-the current main instance. This prevents split-brain scenarios, as clients never
-write to the old main but are automatically routed to the new main after a
-failover.
-The routing protocol works as follows: the client sends a `ROUTE` Bolt
-message to any coordinator instance. The coordinator responds with a **routing
-table** containing three entries:
-
-1. Instances from which data can be read
-2. The instance where data can be written
-3. Instances acting as routers
-
-When a client connects directly to the cluster leader, the leader immediately
-returns the current routing table. Thanks to the Raft consensus protocol, the
-leader always has the most up-to-date cluster state. If a follower receives a
-routing request, it forwards the request to the current leader, ensuring the
-client always gets accurate routing information.
-
-This ensures:
-
-- **Consistency**: All clients receive the same routing information, regardless of
-their entry point.
-- **Reliability**: The Raft consensus protocol ensures data accuracy on the leader
-node.
-- **Transparency**: Client requests are handled seamlessly, whether connected to
-leaders or followers.
-
-In the Memgraph HA cluster:
-
-- The **main instance** is the only writable instance
-- **Replicas** are readable instances
-- **Coordinators** act as routers
-
-However, the cluster can be configured in such a way that main can also be used
-for reading. Check this
-[paragraph](#setting-config-for-highly-available-cluster) for more info.
-Bolt+routing is the client-side routing protocol, meaning network endpoint
-resolution happens inside drivers. For more details about the Bolt messages
-involved in the communication, check [the following
-link](https://neo4j.com/docs/bolt/current/bolt/message/#messages-route).
-
-Users only need to change the scheme they use for connecting to coordinators.
-This means instead of using `bolt://,` you should use
-`neo4j://` to get an active connection to the current
-main instance in the cluster. You can find examples of how to use bolt+routing
-in different programming languages
-[here](https://github.com/memgraph/memgraph/tree/master/tests/drivers).
-
-It is important to note that setting up the cluster on one coordinator
-(registration of data instances and coordinators, setting main) must be done
-using bolt connection since bolt+routing is only used for routing data-related
-queries, not coordinator-based queries.
-
-## System configuration
-
-
-When deploying coordinators to servers, you can use the instance of almost any size. Instances of 4GiB or 8GiB will suffice since coordinators'
-job mainly involves network communication and storing Raft metadata. Coordinators and data instances can be deployed on same servers (pairwise)
-but from the availability perspective, it is better to separate them physically.
-
-When setting up disk space, you should always make sure that there is at least space for `--snapshot-retention-count+1` snapshots + few WAL files. That's
-because we first create (N+1)th snapshot and then delete the oldest one so we could guarantee that the creation of a new snapshot ended successfully. This is
-especially important when using Memgraph HA in K8s, since in K8s there is usually a limit set on the disk space used.
-
-
-
-Important note if you're using native Memgraph deployment with Red Hat.
-
-Red Hat uses SELinux to enforce security policies.
-SELinux (Security-Enhanced Linux) is a security mechanism for implementing mandatory access control (MAC) in the Linux kernel.
-It restricts programs, users, and processes to only the resources they require, following a least-privilege model.
-When deploying Memgraph with high availability (HA), consider checking out this attribute for instance visibility and
-setting the level of security mechanism to permissive.
-
-This rule could also apply to CentOS and Fedora, but at the moment it's not tested and verified.
-
-
-## Authentication
-
-User accounts exist exclusively on data instances - coordinators do not manage user authentication. Therefore, coordinator instances prohibit:
- - Environment variables `MEMGRAPH_USER` and `MEMGRAPH_PASSWORD`.
- - Authentication queries such as `CREATE USER`.
-
-When using the **bolt+routing protocol**, provide credentials for users that exist on the data instances. The authentication flow works as follows:
-
-1. Client connects to a **coordinator**.
-2. Coordinator responds with the **routing table** (without authenticating).
-3. Client connects to the **designated data instance** using the **same credentials**.
-4. Data instance **authenticates the user and processes the request**.
-
-This architecture separates routing coordination from the user management, ensuring that authentication occurs only where user data resides.
-
-
-## Starting instances
-
-You can start the data and coordinator instances using environment flags or configuration flags.
-The main difference between data instance and coordinator is that data instances have `--management-port`,
-whereas coordinators must have `--coordinator-id` and `--coordinator-port`.
-
-### Configuration Flags
-
-#### Data instance
-
-Memgraph data instance must use flag `--management-port=`. This flag is tied to the high availability feature, enables the coordinator to connect to the data instance,
-and allows the Memgraph data instance to use the high availability feature. The flag `--storage-wal-enabled` must be enabled, otherwise data instance won't be started.
-
-```
-docker run --name instance1 -p 7687:7687 -p 7444:7444 memgraph/memgraph-mage
---management-port=13011 \
---bolt-port=7692 \
-```
-
-#### Coordinator instance
-
-```
-docker run --name coord1 -p 7691:7691 -p 7445:7444 memgraph/memgraph-mage
---coordinator-port=10111
---bolt-port=7691
---coordinator-id=1
---coordinator-hostname=localhost
---management-port=12121
-```
-
-Coordinator IDs serve as identifiers, the coordinator port is used for synchronization and log replication between coordinators and management port is used to get health state of
-cluster from leader coordinator. Coordinator IDs, coordinator ports and management ports must be different for all coordinators.
-
-Configuration option `--coordinator-hostname` must be set on all coordinator instances. It is used on followers to ping the leader coordinator on the correct IP address and return
-the health state about the cluster. You can set this configuration flag to the IP address, the fully qualified domain name (FQDN), or even the DNS name.
-The suggested approach is to use DNS, otherwise, in case the IP address changes, network communication between instances in the cluster will stop working.
-
-When testing on a local setup, the flag `--coordinator-hostname` should be set to `localhost` for each instance.
-
-It is important that in the host you set the bolt ports distinct for every instance, regardless of them being a data instance, or a coordinator instance.
-
-### Env flags
-
-There is an additional way to set high availability instances using environment variables. It is important to say that for the following configuration options, you can either use
-environment variables or configuration flags:
-
-- bolt port
-- coordinator port
-- coordinator id
-- management port
-- path to nuraft log file
-- coordinator hostname
-
-#### Data instances
-
-Here are the environment variables you need to use to set data instance using only environment variables:
-
-```
-export MEMGRAPH_MANAGEMENT_PORT=13011
-export MEMGRAPH_BOLT_PORT=7692
-```
-
-When using any of these environment variables, flags `--bolt-port` and `--management-port` will be ignored.
-
-
-#### Coordinator instances
-
-```
-export MEMGRAPH_COORDINATOR_PORT=10111
-export MEMGRAPH_COORDINATOR_ID=1
-export MEMGRAPH_BOLT_PORT=7687
-export MEMGRAPH_NURAFT_LOG_FILE=""
-export MEMGRAPH_COORDINATOR_HOSTNAME="localhost"
-export MEMGRAPH_MANAGEMENT_PORT=12121
-```
-
-When using any of these environment variables, flags for `--bolt-port`, `--coordinator-port`, `--coordinator-id` and `--coordinator-hostname` will be ignored.
-
-
-There is an additional environment variable you can use to set the path to the file with cypher queries used to start a high availability cluster.
-Here, you can use queries we define in the next chapter called User API.
-
-```
-export MEMGRAPH_HA_CLUSTER_INIT_QUERIES=
-```
-After the coordinator instance is started, Memgraph will run queries one by one from this file to set up a high availability cluster.
-
-## User API
-
-### Register instance
-
-Registering instances should be done on a single coordinator. The chosen coordinator will become the cluster's leader.
-
-Register instance query will result in several actions:
-1. The coordinator instance will connect to the data instance on the `management_server` network address.
-2. The coordinator instance will start pinging the data instance every `--instance-health-check-frequency-sec` seconds to check its status.
-3. Data instance will be demoted from main to replica.
-4. Data instance will start the replication server on `replication_server`.
-
-```plaintext
-REGISTER INSTANCE instanceName ( AS ASYNC | AS STRICT_SYNC ) ? WITH CONFIG {"bolt_server": boltServer, "management_server": managementServer, "replication_server": replicationServer};
-```
-
-This operation will result in writing to the Raft log.
-
-In case the main instance already exists in the cluster, a replica instance will be automatically connected to the main. Constructs ( AS ASYNC | AS STRICT_SYNC ) serve to specify
-instance's replication mode when the instance behaves as replica. You can only have `STRICT_SYNC` and `ASYNC` or `SYNC` and `ASYNC` replicas together in the cluster. Combining `STRICT_SYNC`
-and `SYNC` replicas together doesn't have proper semantic meaning so it is forbidden.
-
-
-### Add coordinator instance
-
-The user can choose any coordinator instance to run cluster setup queries. This can be done before or after registering data instances,
-the order isn't important.
-
-```plaintext
-ADD COORDINATOR coordinatorId WITH CONFIG {"bolt_server": boltServer, "coordinator_server": coordinatorServer};
-```
-
-
-
-`ADD COORDINATOR` query needs to be run for all coordinators in the cluster.
-
-```
-ADD COORDINATOR 1 WITH CONFIG {"bolt_server": "127.0.0.1:7691", "coordinator_server": "127.0.0.1:10111", "management_server": "127.0.0.1:12111"};
-ADD COORDINATOR 2 WITH CONFIG {"bolt_server": "127.0.0.1:7692", "coordinator_server": "127.0.0.1:10112", "management_server": "127.0.0.1:12112"};
-ADD COORDINATOR 3 WITH CONFIG {"bolt_server": "127.0.0.1:7693", "coordinator_server": "127.0.0.1:10113", "management_server": "127.0.0.1:12113"};
-```
-
-
-
-### Remove coordinator instance
-
-If during cluster setup or at some later stage of cluster life, the user decides to remove some coordinator instance, `REMOVE COORDINATOR` query can be used.
-Only on leader can this query be executed in order to remove followers. Current cluster's leader cannot be removed since this is prohibited
-by NuRaft. In order to remove the current leader, you first need to trigger leadership change.
-
-```plaintext
-REMOVE COORDINATOR ;
-```
-
-
-### Set instance to main
-
-Once all data instances are registered, one data instance should be promoted to main. This can be achieved by using the following query:
-
-```plaintext
-SET INSTANCE instanceName to main;
-```
-
-This query will register all other instances as replicas to the new main. If one of the instances is unavailable, setting the instance to main will not succeed.
-If there is already a main instance in the cluster, this query will fail.
-
-This operation will result in writing to the Raft log.
-
-### Demote instance
-
-Demote instance query can be used by an admin to demote the current main to replica. In this case, the leader coordinator won't perform a failover, but as a user,
-you should choose promote one of the data instances to main using the `SET INSTANCE `instance` TO main` query.
-
-```plaintext
-DEMOTE INSTANCE instanceName;
-```
-
-This operation will result in writing to the Raft log.
-
-
-
-By combining the functionalities of queries `DEMOTE INSTANCE instanceName` and `SET INSTANCE instanceName TO main` you get the manual failover capability. This can be useful
-e.g during a maintenance work on the instance where the current main is deployed.
-
-
-
-
-### Unregister instance
-
-There are various reasons which could lead to the decision that an instance needs to be removed from the cluster. The hardware can be broken,
-network communication could be set up incorrectly, etc. The user can remove the instance from the cluster using the following query:
-
-```plaintext
-UNREGISTER INSTANCE instanceName;
-```
-
-When unregistering an instance, ensure that the instance being unregistered is
-**not** the main instance. Unregistering main can lead to an inconsistent
-cluster state. Additionally, the cluster must have an **alive** main instance
-during the unregistration process. If no main instance is available, the
-operation cannot be guaranteed to succeed.
-
-The instance requested to be unregistered will also be unregistered from the current main's replica set.
-
-### Force reset cluster state
-
-In case the cluster gets stuck there is an option to do the force reset of the cluster. You need to execute a command on the leader coordinator.
-This command will result in the following actions:
-
-1. The coordinator instance will demote each alive instance to replica.
-2. From the alive instance it will choose a new main instance.
-3. Instances that are down will be demoted to replicas once they come back up.
-
-```plaintext
-FORCE RESET CLUSTER STATE;
-```
-
-This operation will result in writing to the Raft log.
-
-### Show instances
-
-You can check the state of the whole cluster using the `SHOW INSTANCES` query. The query will display all the Memgraph servers visible in the cluster. With
-each server you can see the following information:
- 1. Network endpoints they are using for managing cluster state
- 2. Health state of server
- 3. Role - main, replica, LEADER, FOLLOWER or unknown if not alive
- 4. The time passed since the last response time to the leader's health ping
-
-This query can be run on either the leader or followers. Since only the leader knows the exact status of the health state and last response time,
-followers will execute actions in this exact order:
- 1. Try contacting the leader to get the health state of the cluster, since the leader has all the information.
- If the leader responds, the follower will return the result as if the `SHOW INSTANCES` query was run on the leader.
- 2. When the leader doesn't respond or currently there is no leader, the follower will return all the Memgraph servers
- with the health state set to "down".
-
-```plaintext
-SHOW INSTANCES;
-```
-
-
-### Show instance
-
-You can check the state of the current coordinator to which you are connected by running the following query:
-
-```plaintext
-SHOW INSTANCE;
-```
-
-This query will return the information about:
-1. instance name
-2. external bolt server to which you can connect using Memgraph clients
-3. coordinator server over which Raft communication is done
-4. management server which is also used for inter-coordinators communication and
-5. cluster role: whether the coordinator is currently a leader of the follower.
-
-If the query `ADD COORDINATOR` wasn't run for the current instance, the value of the bolt server will be "".
-
-### Show replication lag
-
-The user can find the current replication lag on each instance by running `SHOW REPLICATION LAG` on the cluster's leader. The replication lag is expressed with
-the number of committed transactions. Such an info is made durable through snapshots and WALs so restarts won't cause the information loss. The information
-about the replication lag can be useful when manually performing a failover to check whether there is a risk of a data loss.
-
-```plaintext
-SHOW REPLICATION LAG;
-```
-
-
-## Setting config for highly-available cluster
-
-There are several flags that you can use for managing the cluster. Flag `--management-port` is used by both data instances
-and coordinators. The provided flag needs to be unique. Setting a flag will create an RPC server on instances capable of
-responding to the coordinator's RPC messages.
-
-
-
-RPC (Remote Procedure Call) is a protocol for executing functions on a remote
-system. RPC enables direct communication in distributed systems and is crucial
-for replication and high availability tasks.
-
-
-
-Flags `--coordinator-id`, `--coordinator-port` and `--management-port` need to be unique and specified on coordinator instances. They will cause the creation of a Raft
-server that coordinator instances use for communication. Flag `--instance-health-check-frequency-sec` specifies how often should leader coordinator
-check the status of the replication instance to update its status. Flag `--instance-down-timeout-sec` gives the user the ability to control how much time should
-pass before the coordinator starts considering the instance to be down.
-
-There is a configuration option for specifying whether reads from the main are enabled. The configuration value is by default false but can be changed in run-time
-using the following query:
-
-```
-SET COORDINATOR SETTING 'enabled_reads_on_main' TO 'true'/'false' ;
-```
-
-Users can also choose whether failover to the async replica is allowed by using the following query:
-
-```
-SET COORDINATOR SETTING 'sync_failover_only' TO 'true'/'false' ;
-```
-
-Users can control the maximum transaction lag allowed during failover through configuration. If a replica is behind the main instance by more than the configured threshold,
-that replica becomes ineligible for failover. This prevents data loss beyond the user's acceptable limits.
-
-To implement this functionality, we employ a caching mechanism on the cluster leader that tracks replicas' lag. The cache gets updated with each StateCheckRpc response from
-replicas. During the brief failover window on the cooordinators' side, the new cluster leader may not have the current lag information for all data instances and in that case,
-any replica can become main. This trade-off is intentional and it avoids flooding Raft logs with frequently-changing lag data while maintaining failover safety guarantees
-in the large majority of situations.
-
-
-The configuration value can be controlled using the query:
-
-```
-SET COORDINATOR SETTING 'max_failover_replica_lag' TO '10' ;
-```
-
-
-
-
-By default, the value is `true`, which means that only sync replicas are candidates in the election. When the value is set to `false`, the async replica is also considered, but
-there is an additional risk of experiencing data loss. However, failover to an async replica may be necessary when other sync replicas are down and you want to
-manually perform a failover.
-
-
-Users can control the maximum allowed replica lag to maintain read consistency. When a replica falls behind the current main by more than `max_replica_read_lag_` transactions, the
-bolt+routing protocol will exclude that replica from read query routing to ensure data freshness.
-
-The configuration value can be controlled using the query:
-
-
-```
-SET COORDINATOR SETTING 'max_replica_read_lag_' TO '10' ;
-```
-
-All run-time configuration options can be retrieved using:
-
-```
-SHOW COORDINATOR SETTINGS ;
-```
-
-
-
-
-Consider the instance to be down only if several consecutive pings fail because a single ping can fail because of a large number of different reasons in distributed systems.
-
-
-
-### RPC timeouts
-
-For the majority of RPC messages, Memgraph uses a default timeout of 10s. This is to ensure that when sending a RPC request, the client
-will not block indefinitely before receiving a response if the communication between the client and the server is broken. The list of RPC messages
-for which the timeout is used is the following:
-
-- ShowInstancesReq -> coordinator sending to coordinator
-- DemoteMainToReplicaReq -> coordinator sending to data instances
-- PromoteToMainReq -> coordinator sending to data instances
-- RegisterReplicaOnMainReq -> coordinator sending to data instances
-- UnregisterReplicaReq -> coordinator sending to data instances
-- EnableWritingOnMainReq -> coordinator sending to data instances
-- GetInstanceUUIDReq -> coordinator sending to data instances
-- GetDatabaseHistoriesReq -> coordinator sending to data instances
-- StateCheckReq -> coordinator sending to data instances. The timeout is set to 5s.
-- SwapMainUUIDReq -> coordinator sending to data instances
-- FrequentHeartbeatReq -> main sending to replica. The timeout is set to 5s.
-- HeartbeatReq -> main sending to replica
-- TimestampReq -> main sending to replica
-- SystemHeartbeatReq -> main sending to replica
-- ForceResetStorageReq -> main sending to replica. The timeout is set to 60s.
-- SystemRecoveryReq -> main sending to replica. The timeout is set to 5s.
-- FinalizeCommitReq -> main sending to replica. The timeout is set to 10s.
-
-
-For RPC messages which are sending the variable number of storage deltas — PrepareCommitRpc, CurrentWalRpc, and
-WalFilesRpc — it is not practical to set a strict execution timeout. The
-processing time on the replica side is directly proportional to the number of
-deltas being transferred. To handle this, the replica sends periodic progress
-updates to the main instance after processing every 100,000 deltas. Since
-processing 100,000 deltas is expected to take a relatively consistent amount of
-time, we can enforce a timeout based on this interval. The default timeout for
-these RPC messages is 30 seconds, though in practice, processing 100,000 deltas
-typically takes less than 3 seconds.
-
-SnapshotRpc is also a replication-related RPC message, but its execution time
-is tracked a bit differently from RPC messages shipping deltas. The replica sends an update to the main instance after
-completing 1,000,000 units of work. The work units are assigned as follows:
-
-- Processing nodes, edges, or indexed entities (label index, label-property index,
- edge type index, edge type property index) = 1 unit
-- Processing a node inside a point or text index = 10 units
-- Processing a node inside a vector index (most computationally expensive) =
- 1,000 units
-
-With this unit-based tracking system, the replica is expected to report progress
-every 2–3 seconds. Given this, a timeout of 60 seconds is set to avoid
-unnecessary network instability while ensuring responsiveness.
-
-Except for timeouts on read and write operations, Memgraph also has a timeout of 5s
-for sockets when establishing a connection. Such a timeout helps in having a low p99
-latencies when using the RPC stack, which manifests for users as smooth and predictable
-network communication between instances.
-
-
-## Failover
-
-### Determining instance's health
-
-Every `--instance-health-check-frequency-sec` seconds, the coordinator contacts each instance.
-The instance is not considered to be down unless `--instance-down-timeout-sec` has passed and the instance hasn't responded to the coordinator in the meantime.
-Users must set `--instance-health-check-frequency-sec` to be less or equal to the `--instance-down-timeout-sec` but we advise users to set `--instance-down-timeout-sec` to
-a multiplier of `--instance-health-check-frequency-sec`. Set the multiplier coefficient to be N>=2.
-For example, set `--instance-down-timeout-sec=5` and `--instance-health-check-frequency-sec=1` which will result in coordinator contacting each instance every second and
-the instance is considered dead after it doesn't respond 5 times (5 seconds / 1 second).
-
-In case a replica doesn't respond to a health check, the leader coordinator will try to contact it again every `--instance-health-check-frequency-sec`.
-When the replica instance rejoins the cluster (comes back up), it always rejoins as replica. For main instance, there are two options.
-If it is down for less than `--instance-down-timeout-sec`, it will rejoin as main because it is still considered alive. If it is down for more than `--instance-down-timeout-sec`,
-the failover procedure is initiated. Whether main will rejoin as main depends on the success of the failover procedure. If the failover procedure succeeds, now old main
-will rejoin as replica. If failover doesn't succeed, main will rejoin as main once it comes back up.
-
-### Failover procedure - high level description
-
-From alive replicas coordinator chooses a new potential main and writes a log to the Raft storage about the new main. On the next leader's ping to the instance,
-it will send to the instance an RPC request to the new main, which is still in replica state, to promote itself to the main instance with info
-about other replicas to which it will replicate data. Once that request succeeds, the new main can start replication to the other instances and accept write queries.
-
-### Choosing new main from available replicas
-
-
-During failover, the coordinator must select a new main instance from available replicas, as some may be offline. The leader coordinator queries each live replica to
-retrieve the committed transaction count for every database.
-
-The selection algorithm prioritizes data recency using a two-phase approach:
-
-1. **Database majority rule**: The coordinator identifies which replica has the highest committed transaction count for each database. The replica that leads in the most
-databases becomes the preferred candidate.
-2. **Total transaction tiebreaker**: If multiple replicas tie for leading the most databases, the coordinator sums each replica's committed transactions across all databases.
-The replica with the highest total becomes the new main.
-
-This approach ensures the new main instance has the most up-to-date data across the cluster while maintaining consistency guarantees.
-
-### Old main rejoining to the cluster
-
-Once the old main gets back up, the coordinator sends an RPC request to demote the old main to replica. The coordinator tracks at all times which instance was the last main.
-
-The leader coordinator sends two RPC requests in the given order to demote old main to replica:
-1. Demote main to replica RPC request
-2. A request to store the UUID of the current main, which the old main, now acting as a replica instance, must listen to.
-
-### How replica knows which main to listen
-
-Each replica has a UUID of main it listens to. If a network partition happens where main can talk to a replica but the coordinator can't talk to the main, from the coordinator's
-point of view that main is down. From replica's point of view, the main instance is still alive. The coordinator will start the failover procedure, and we can end up with multiple mains
-where replicas can listen to both mains. To prevent such an issue, each replica gets a new UUID that no current main has. The coordinator generates the new UUID,
-which the new main will get to use on its promotion to main.
-
-If replica was down at one point, main could have changed. When replica gets back up, it doesn't listen to any main until the coordinator sends an RPC request to replica to start
-listening to main with the given UUID.
-
-### Replication concerns
-
-#### Force sync of data
-
-During a failover event, Memgraph selects the most up-to-date, alive instance to
-become the new main. The selection process works as follows:
-1. From the list of available replica instances, Memgraph chooses the one with
-the latest commit timestamp for the default database.
-2. If an instance that had more recent data was down during this selection
-process, it will not be considered for promotion to main.
-
-If a previously down instance had more up-to-date data but was unavailable
-during failover, it will go through a specific recovery process upon rejoining
-the cluster:
-- The replica will reset its storage.
-- The replica will receive all commits from the new main to
- synchronize its state.
-- The replica's old durability files will be preserved in a `.old` directory in
- `data_directory/snapshots` and `data_directory/wal` folders, allowing admins
- to manually recover data if needed.
-
-Depending on the replication mode used, there are different levels of data loss
-that can happen upon the failover. With the default `SYNC` replication mode,
-Memgraph prioritizes availability over strict consistency and can result in
-a non-zero Recovery Point Objective (RPO), that is, the loss of committed data, because:
-- The promoted main might not have received all commits from the previous main
- before the failure.
-- This design ensures that the main remains writable for the maximum possible
- time.
-
-With `ASYNC` replication mode, you also risk losing some data upon the failover because
-main can freely continue commiting no matter the status of ASYNC replicas.
-
-The `STRICT_SYNC` replication mode allows users experiencing a failover without any data loss
-in all situations. It comes with reduced throughput because of the cost of running two-phase commit protocol.
-
-
-## Actions on follower coordinators
-
-From follower coordinators you can only execute `SHOW INSTANCES`. Registration of data instance, unregistration of data instances, demoting instance, setting instance to main and
-force resetting cluster state are all disabled.
-
-
-## Instances' restart
-
-### Data instances' restart
-
-Data instances can fail both as main and as replica. When an instance that was replica comes back, it won't accept updates from any instance until the coordinator updates its
-responsible peer. This should happen automatically when the coordinator's ping to the instance passes. When the main instance comes back, any writing to the main instance will be
-forbidden until a ping from the coordinator passes.
-
-### Coordinator instances restart
-
-In case the coordinator instance dies and it is restarted, it will not lose any data from the RAFT log or RAFT snapshots, since coordinator data is always backed-up by a durable storage.
-For more details read about high availability durability in the durability chapter.
-
-
-## Durability
-
-All NuRaft data is made durable by default. This includes all Raft logs, Raft snapshots and information about cluster connectivity.
-The details about the cluster connectivity are made durable since without that information, the coordinator can't rejoin the cluster on its restart.
-
-Information about logs and snapshots is stored under one RocksDB instance in the `high_availability/raft_data/logs` directory stored under
-the top-level `--data-directory` folder. All the data stored there is recovered in case the coordinator restarts.
-
-Data about other coordinators is recovered from the `high_availability/raft_data/network` directory stored under
-the top-level `--data-directory` folder. When the coordinator rejoins, it will reestablish the communication with other coordinators and receive updates from the current leader.
-
-### First start
-
-On the first start of coordinators, each will store the current version of the `logs` and `network` durability store. From that point on,
-each RAFT log that is sent to the coordinator is also stored on disk. For every new coordinator instance, the server config is updated. Logs are created
-for each user action and failover action. Snapshots are created every N (N currently being 5) logs.
-
-
-### Restart of coordinator
-
-In case of the coordinator's failure, on the restart, it will read information about other coordinators stored under `high_availability/raft_data/network` directory.
-
-From the `network` directory we will recover the server state before the coordinator stopped, including the current term, for whom the coordinator voted, and whether
-election timer is allowed.
-
-It will also recover the following server config information:
-- other servers, including their endpoints, id, and auxiliary data
-- ID of the previous log
-- ID of the current log
-- additional data needed by nuRaft
-
-The following information will be recovered from a common RocksDB `logs` instance:
-- current version of `logs` durability store
-- snapshots found with `snapshot_id_` prefix in database:
- - coordinator cluster state - all data instances with their role (main or replica), all coordinator instances and UUID of main instance which replica is listening to
- - last log idx
- - last log term
- - last cluster config
-- logs found in the interval between the start index and the last log index
- - data - each log holds data on what has changed since the last state
- - term - nuRAFT term
- - log type - nuRAFT log type
-
-
-### Handling of durability errors
-
-If snapshots are not correctly stored, the exception is thrown and left for the nuRAFT library to handle the issue. Logs can be missed and not stored since they are compacted and
-deleted every two snapshots and will be removed relatively fast.
-
-Memgraph throws an error when failing to store cluster config, which is updated in the `high_availability/raft_data/network` folder.
-If this happens, it will happen only on the first cluster start when coordinators are connecting since
-coordinators are configured only once at the start of the whole cluster. This is a non-recoverable error since in case the coordinator rejoins the cluster and has
-the wrong state of other clusters, it can become a leader without being connected to other coordinators.
-
-
-## Recovering from errors
-
-Distributed systems can fail in numerous ways. Memgraph processes are resilient to network
-failures, omission faults and independent machine failures. Byzantine failures aren't tolerated since the Raft consensus protocol cannot deal with them either.
-
-Recovery Time Objective (RTO) is an often used term for measuring the maximum tolerable length of time that an instance or cluster can be down.
-Since every highly available Memgraph cluster has two types of instances, we need to analyze the failures of each separately.
-
-Raft is a quorum-based protocol and it needs a majority of instances alive in order to stay functional. Hence, with just one coordinator instance down, RTO is 0 since
-the cluster stays available. With 2+ coordinator instances down
-(in a cluster with RF = 3), the RTO depends on the time needed for instances to come back.
-
-Depending on the replica's replication mode, its failure can lead to different situations. If the replica was registered with STRICT_SYNC mode, then on its failure, writing
-on main will be disabled. On the other hand, if replica was registered as ASYNC or SYNC, further writes on main are still allowed. In both cases, reads are still allowed from
-main and other replicas.
-
-
-The most important thing to analyze is what happens when main gets down. In that case, the leader coordinator uses
-user-controllable parameters related to the frequency of health checks from the leader to replication instances (`--instance-health-check-frequency-sec`)
-and the time needed to realize the instance is down (`--instance-down-timeout-sec`). After collecting enough evidence, the leader concludes the main is down and performs failover
-using just a handful of RPC messages (correct time depends on the distance between instances). It is important to mention that the whole failover is performed without the loss of committed data
-if the newly chosen main (previously replica) had all up-to-date data.
-
-## Raft configuration parameters
-
-Several Raft-related parameters are important for the correct functioning of the cluster. The leader coordinator sends a heartbeat
-message to other coordinators every second to determine their health. This configuration option is connected with leader election timeout which
-is a randomized value from the interval [2000ms, 4000ms] and which is used by followers to decide when to trigger new election process. Leadership
-expiration is set to 2000ms so that cluster can never get into situation where multiple leaders exist. These specific values give a cluster
-the ability to survive occasional network hiccups without triggering leadership changes.
-
-
-## Data center failure
-
-The architecture we currently use allows us to deploy coordinators in 3 data centers and hence tolerate a failure of the whole data center. Data instances can be freely
-distributed in any way you want between data centers. The failover time will be slightly increased due to the network communication needed.
-
-## Kubernetes
-
-We support deploying Memgraph HA as part of the Kubernetes cluster through Helm charts.
-You can see example configurations [here](/getting-started/install-memgraph/kubernetes#memgraph-high-availability-helm-chart).
-
-## In-Service Software Upgrade (ISSU)
-
-Memgraph’s **High Availability** supports in-service software upgrades (ISSU).
-This guide explains the process when using [HA Helm
-charts]((/getting-started/install-memgraph/kubernetes#memgraph-high-availability-helm-chart)).
-The procedure is very similar for native deployments.
-
-
-
-**Important**: Although the upgrade process is designed to complete
-successfully, unexpected issues may occur. We strongly recommend doing a backup
-of your `lib` directory on all of your `StatefulSets` or native instances
-depending on the deployment type.
+
+**Users are advised to first read the guide on [how replication works](/clustering/concepts/how-replication-works), followed
+by the guide on [how high availability works](/clustering/concepts/how-high-availability-works).**
+
+## [Setup HA cluster (Docker)](/clustering/high-availability/setup-ha-cluster-docker)
+Learn how to set up a high availability cluster with Docker.
-{ Prerequisites
}
-
-If you are using **HA Helm charts**, set the following configuration before
-doing any upgrade.
-
- ```yaml
- updateStrategy.type: OnDelete
- ```
-
- Depending on the infrastructure on which you have your Memgraph cluster, the
-details will differ a bit, but the backbone is the same.
-
-Prepare a backup of all data from all instances. This ensures you can safely
-downgrade cluster to the last stable version you had.
-
- - For **native deployments**, tools like `cp` or `rsync` are sufficient.
- - For **Kubernetes**, create a `VolumeSnapshotClass`with the yaml file fimilar
- to this:
-
- ```yaml
- apiVersion: snapshot.storage.k8s.io/v1
- kind: VolumeSnapshotClass
- metadata:
- name: csi-azure-disk-snapclass
- driver: disk.csi.azure.com
- deletionPolicy: Delete
- ```
-
- Apply it:
-
- ```bash
- kubectl apply -f azure_class.yaml
- ```
-
- - On **Google Kubernetes Engine**, the default CSI driver is
- `pd.csi.storage.gke.io` so make sure to change the field `driver`.
- - On **AWS EKS**, refer to the [AWS snapshot controller
- docs](https://docs.aws.amazon.com/eks/latest/userguide/csi-snapshot-controller.html).
-
-
-{ Create snapshots
}
-
-Now you can create a `VolumeSnapshot` of the lib directory using the yaml file:
-
-```yaml
-apiVersion: snapshot.storage.k8s.io/v1
-kind: VolumeSnapshot
-metadata:
- name: coord-3-snap # Use a unique name for each instance
- namespace: default
-spec:
- volumeSnapshotClassName: csi-azure-disk-snapclass
- source:
- persistentVolumeClaimName: memgraph-coordinator-3-lib-storage-memgraph-coordinator-3-0
-```
-
-Apply it:
-
-```bash
-kubectl apply -f azure_snapshot.yaml
-```
-
-Repeat for every instance in the cluster.
-
-
-{ Update configuration
}
-
-Next you should update `image.tag` field in the `values.yaml` configuration file
-to the version to which you want to upgrade your cluster.
-
-1. In your `values.yaml`, update the image version:
-
- ```yaml
- image:
- tag:
- ```
-2. Apply the upgrade:
-
- ```bash
- helm upgrade -f
- ```
-
- Since we are using `updateStrategy.type=OnDelete`, this step will not restart
- any pod, rather it will just prepare pods for running the new version.
- - For **native deployments**, ensure the new binary is available.
-
-
-{ Upgrade procedure (zero downtime)
}
-
-Our procedure for achieving zero-downtime upgrades consists of restarting one
-instance at a time. Memgraph uses **primary–secondary replication**. To avoid
-downtime:
-
-1. Upgrade **replicas** first.
-2. Upgrade the **main** instance.
-3. Upgrade **coordinator followers**, then the **leader**.
-
-In order to find out on which pod/server the current main and the current
-cluster leader sits, run:
-
-```cypher
-SHOW INSTANCES;
-```
-
-
-{ Upgrade replicas
}
-
-If you are using K8s, the upgrade can be performed by deleting the pod. Start by
-deleting the replica pod (in this example replica is running on the pod
-`memgraph-data-1-0`):
-
-```bash
-kubectl delete pod memgraph-data-1-0
-```
-
-**Native deployment:** stop the old binary and start the new one.
-
-Before starting the upgrade of the next pod, it is important to wait until all
-pods are ready. Otherwise, you may end up with a data loss. On K8s you can
-easily achieve that by running:
-
-```bash
-kubectl wait --for=condition=ready pod --all
-```
-
-For the native deployment, check if all your instances are alived manually.
-
-This step should be repeated for all of your replicas in the cluster.
-
-
-{ Upgrade the main
}
-
-Before deleting the main pod, check replication lag to see whether replicas are
-behind MAIN:
-
-```cypher
-SHOW REPLICATION LAG;
-```
-
-If replicas are behind, your upgrade will be prone to a data loss. In order to
-achieve zero-downtime upgrade without any data loss, either:
-
- - Use `STRICT_SYNC` mode (writes will be blocked during upgrade), or
- - Wait until replicas are fully caught up, then pause writes. This way, you
-can use any replication mode. Read queries should however work without any
-issues independently from the replica type you are using.
-
-Upgrade the main pod:
-
-```bash
-kubectl delete pod memgraph-data-0-0
-kubectl wait --for=condition=ready pod --all
-```
-
-
-{ Upgrade coordinators
}
-
-The upgrade of coordinators is done in exactly the same way. Start by upgrading
-followers and finish with deleting the leader pod:
-
-```bash
-kubectl delete pod memgraph-coordinator-3-0
-kubectl wait --for=condition=ready pod --all
-
-kubectl delete pod memgraph-coordinator-2-0
-kubectl wait --for=condition=ready pod --all
-
-kubectl delete pod memgraph-coordinator-1-0
-kubectl wait --for=condition=ready pod --all
-```
-
-
-
-{ Verify upgrade
}
-
-Your upgrade should be finished now, to check that everything works, run:
-
-```cypher
-SHOW VERSION;
-```
-
-It should show you the new Memgraph version.
-
-
-{ Rollback
}
-
-If during the upgrade, you figured out that an error happened or even after
-upgrading all of your pods something doesn't work (e.g. write queries don't
-pass), you can safely downgrade your cluster to the previous version using
-`VolumeSnapshots` you took on K8s or file backups for native deployments.
-
-- **Kubernetes:**
-
- ```bash
- helm uninstall
- ```
-
- In `values.yaml`, for all instances set:
-
- ```yaml
- restoreDataFromSnapshot: true
- ```
-
- Make sure to set correct name of the snapshot you will use to recover your
-instances.
-
-- **Native deployments:** restore from your file backups.
-
-
-
-
-If you're doing an upgrade on `minikube`, it is important to make sure that the
-snapshot resides on the same node on which the `StatefulSet` is installed.
-Otherwise, it won't be able to restore `StatefulSet's` attached
-PersistentVolumeClaim from the `VolumeSnapshot`.
-
-
-
-## Docker Compose
-
-The following example shows you how to setup Memgraph cluster using Docker Compose. The cluster will use user-defined bridge network.
-
-License file `license.cypherl` should be in the format:
-
-```
-
-SET DATABASE SETTING 'organization.name' TO '';
-SET DATABASE SETTING 'enterprise.license' TO '';
-
-```
-
-You can directly use initialization file `HA_register.cypherl`:
-
-```
-
-ADD COORDINATOR 1 WITH CONFIG {"bolt_server": "localhost:7691", "coordinator_server": "coord1:10111", "management_server": "coord1:12121"};
-ADD COORDINATOR 2 WITH CONFIG {"bolt_server": "localhost:7692", "coordinator_server": "coord2:10112", "management_server": "coord2:12122"};
-ADD COORDINATOR 3 WITH CONFIG {"bolt_server": "localhost:7693", "coordinator_server": "coord3:10113", "management_server": "coord3:12123"};
-
-REGISTER INSTANCE instance_1 WITH CONFIG {"bolt_server": "localhost:7687", "management_server": "instance1:13011", "replication_server": "instance1:10001"};
-REGISTER INSTANCE instance_2 WITH CONFIG {"bolt_server": "localhost:7688", "management_server": "instance2:13012", "replication_server": "instance2:10002"};
-REGISTER INSTANCE instance_3 WITH CONFIG {"bolt_server": "localhost:7689", "management_server": "instance3:13013", "replication_server": "instance3:10003"};
-SET INSTANCE instance_3 TO main;
-
-```
-
-Since the host can't resolve the IP for coordinators and data instances, Bolt
-servers in Docker Compose setup require `bolt_server` set to `localhost:`.
-
-You can directly use the following `docker-compose.yml` to start the cluster using `docker compose up`:
-
-```
-volumes:
- mg_lib1:
- mg_lib2:
- mg_lib3:
- mg_lib4:
- mg_lib5:
- mg_lib6:
- mg_log1:
- mg_log2:
- mg_log3:
- mg_log4:
- mg_log5:
- mg_log6:
-
-networks:
- memgraph_ha:
- name: memgraph_ha
- driver: bridge
- ipam:
- driver: default
- config:
- - subnet: "172.21.0.0/16"
-
-services:
- coord1:
- image: "memgraph/memgraph"
- container_name: coord1
- volumes:
- - ./license.cypherl:/tmp/init/license.cypherl:ro
- - ./HA_register.cypherl:/tmp/init/HA_register.cypherl:ro
- - mg_lib1:/var/lib/memgraph
- - mg_log1:/var/log/memgraph
- environment:
- - MEMGRAPH_HA_CLUSTER_INIT_QUERIES=/tmp/init/HA_register.cypherl
- command: [ "--init-file=/tmp/init/license.cypherl", "--log-level=TRACE", "--also-log-to-stderr", "--bolt-port=7691", "--coordinator-id=1", "--coordinator-port=10111", "--management-port=12121", "--coordinator-hostname=coord1", "--nuraft-log-file=/var/log/memgraph/nuraft"]
- networks:
- memgraph_ha:
- ipv4_address: 172.21.0.4
- ports:
- - "7691:7691"
- depends_on:
- - instance1
- - instance2
- - instance3
-
- coord2:
- image: "memgraph/memgraph"
- container_name: coord2
- volumes:
- - ./license.cypherl:/tmp/init/license.cypherl:ro
- - mg_lib2:/var/lib/memgraph
- - mg_log2:/var/log/memgraph
- command: [ "--init-file=/tmp/init/license.cypherl", "--log-level=TRACE", "--also-log-to-stderr", "--bolt-port=7692", "--coordinator-id=2", "--coordinator-port=10112", "--management-port=12122", "--coordinator-hostname=coord2" , "--nuraft-log-file=/var/log/memgraph/nuraft"]
- networks:
- memgraph_ha:
- ipv4_address: 172.21.0.2
- ports:
- - "7692:7692"
- depends_on:
- - instance1
- - instance2
- - instance3
-
- coord3:
- image: "memgraph/memgraph"
- container_name: coord3
- volumes:
- - ./license.cypherl:/tmp/init/license.cypherl:ro
- - mg_lib3:/var/lib/memgraph
- - mg_log3:/var/log/memgraph
- command: [ "--init-file=/tmp/init/license.cypherl", "--log-level=TRACE", "--also-log-to-stderr", "--bolt-port=7693", "--coordinator-id=3", "--coordinator-port=10113", "--management-port=12123", "--coordinator-hostname=coord3" , "--nuraft-log-file=/var/log/memgraph/nuraft"]
-
- networks:
- memgraph_ha:
- ipv4_address: 172.21.0.3
- ports:
- - "7693:7693"
- depends_on:
- - instance1
- - instance2
- - instance3
-
- instance1:
- image: "memgraph/memgraph"
- container_name: instance1
- volumes:
- - ./license.cypherl:/tmp/init/license.cypherl:ro
- - mg_lib4:/var/lib/memgraph
- - mg_log4:/var/log/memgraph
- command: ["--init-file=/tmp/init/license.cypherl", "--log-level=TRACE", "--also-log-to-stderr", "--bolt-port=7687", "--management-port=13011"]
- networks:
- memgraph_ha:
- ipv4_address: 172.21.0.6
- ports:
- - "7687:7687"
-
- instance2:
- image: "memgraph/memgraph"
- container_name: instance2
- volumes:
- - ./license.cypherl:/tmp/init/license.cypherl:ro
- - mg_lib5:/var/lib/memgraph
- - mg_log5:/var/log/memgraph
- command: ["--init-file=/tmp/init/license.cypherl", "--log-level=TRACE", "--also-log-to-stderr", "--bolt-port=7688", "--management-port=13012"]
- networks:
- memgraph_ha:
- ipv4_address: 172.21.0.7
- ports:
- - "7688:7688"
-
- instance3:
- image: "memgraph/memgraph"
- container_name: instance3
- volumes:
- - ./license.cypherl:/tmp/init/license.cypherl:ro
- - mg_lib6:/var/lib/memgraph
- - mg_log6:/var/log/memgraph
- command: ["--init-file=/tmp/init/license.cypherl", "--log-level=TRACE", "--also-log-to-stderr", "--bolt-port=7689", "--management-port=13013"]
- networks:
- memgraph_ha:
- ipv4_address: 172.21.0.8
- ports:
- - "7689:7689"
-```
-
-Cluster can be shut-down using `docker compose down`.
-
-## Manual Docker setup
-
-This example will show how to set up a highly available cluster in Memgraph using three coordinators and 3 data instances.
-
-
-
-### Start all instances
-
-1. Start coordinator1:
-```plaintext
-docker run --name coord1 --network=host -p 7691:7691 -p 7444:7444 -v mg_lib1:/var/lib/memgraph -v mg_log1:/var/log/memgraph -e MEMGRAPH_ORGANIZATION_NAME= -e MEMGRAPH_ENTERPRISE_LICENSE="" memgraph/memgraph --bolt-port=7691 --log-level=TRACE --also-log-to-stderr --coordinator-id=1 --coordinator-port=10111 --management-port=12121--coordinator-hostname=localhost --nuraft-log-file=/var/log/memgraph/nuraft
-```
-
-2. Start coordinator2:
-```plaintext
-docker run --name coord2 --network=host -p 7692:7692 -p 7445:7444 -v mg_lib2:/var/lib/memgraph -v mg_log2:/var/log/memgraph -e MEMGRAPH_ORGANIZATION_NAME= -e MEMGRAPH_ENTERPRISE_LICENSE="" memgraph/memgraph --bolt-port=7692 --log-level=TRACE --also-log-to-stderr --coordinator-id=2 --coordinator-port=10112 --management-port=12122--coordinator-hostname=localhost --nuraft-log-file=/var/log/memgraph/nuraft
-```
-
-3. Start coordinator3:
-```plaintext
-docker run --name coord3 --network=host -p 7693:7693 -p 7446:7444 -v mg_lib3:/var/lib/memgraph -v mg_log3:/var/log/memgraph -e MEMGRAPH_ORGANIZATION_NAME= -e MEMGRAPH_ENTERPRISE_LICENSE="" memgraph/memgraph --bolt-port=7693 --log-level=TRACE --also-log-to-stderr --coordinator-id=3 --coordinator-port=10113 --management-port=12123--coordinator-hostname=localhost --nuraft-log-file=/var/log/memgraph/nuraft
-```
-
-4. Start instance1:
-```plaintext
-docker run --name instance1 --network=host -p 7687:7687 -p 7447:7444 -v mg_lib4:/var/lib/memgraph -v mg_log4:/var/log/memgraph -e MEMGRAPH_ORGANIZATION_NAME= -e MEMGRAPH_ENTERPRISE_LICENSE="" memgraph/memgraph --bolt-port=7687 --log-level=TRACE --also-log-to-stderr --management-port=13011
-```
-
-5. Start instance2:
-```plaintext
-docker run --name instance2 --network=host -p 7688:7688 -p 7448:7444 -v mg_lib5:/var/lib/memgraph -v mg_log5:/var/log/memgraph -e MEMGRAPH_ORGANIZATION_NAME= -e MEMGRAPH_ENTERPRISE_LICENSE="" memgraph/memgraph --bolt-port=7688 --log-level=TRACE --also-log-to-stderr --management-port=13012
-```
-
-6. Start instance3:
-```plaintext
-docker run --name instance3 --network=host -p 7689:7689 -p 7449:7444 -v mg_lib6:/var/lib/memgraph -v mg_log6:/var/log/memgraph -e MEMGRAPH_ORGANIZATION_NAME= -e MEMGRAPH_ENTERPRISE_LICENSE="" memgraph/memgraph --bolt-port=7689 --log-level=TRACE --also-log-to-stderr --management-port=13013
-```
-
-### Register instances
-
-1. Start communication with any Memgraph client on any coordinator. Here we chose coordinator 1.
-```plaintext
-mgconsole --port=7691
-```
-2. Add coordinator instances to the cluster.
-
-```plaintext
-ADD COORDINATOR 1 WITH CONFIG {"bolt_server": "localhost:7691", "coordinator_server": "localhost:10111", "management_server": "localhost:12121"};
-ADD COORDINATOR 2 WITH CONFIG {"bolt_server": "localhost:7692", "coordinator_server": "localhost:10112", "management_server": "localhost:12122"};
-ADD COORDINATOR 3 WITH CONFIG {"bolt_server": "localhost:7693", "coordinator_server": "localhost:10113", "management_server": "localhost:12123"};
-```
-
-3. Register 3 data instances as part of the cluster:
-
-
-
-Replace `` with the container's IP address. This is necessary for Docker deployments where instances are not on the local host.
-
-
-
-```plaintext
-REGISTER INSTANCE instance_1 WITH CONFIG {"bolt_server": "localhost:7687", "management_server": "localhost:13011", "replication_server": "localhost:10001"};
-REGISTER INSTANCE instance_2 WITH CONFIG {"bolt_server": "localhost:7688", "management_server": "localhost:13012", "replication_server": "localhost:10002"};
-REGISTER INSTANCE instance_3 WITH CONFIG {"bolt_server": "localhost:7689", "management_server": "localhost:13013", "replication_server": "localhost:10003"};
-```
-
-4. Set instance_3 as main:
-
-```plaintext
-SET INSTANCE instance_3 TO main;
-```
-
-5. Connect to the leader coordinator and check cluster state with `SHOW INSTANCES`;
-
-| name | bolt_server | coordinator_server | management_server | health | role | last_succ_resp_ms |
-| ------------- | -------------- | ------------------ | ----------------- | ------ | -------- | ---------------- |
-| coordinator_1 | localhost:7691 | localhost:10111 | localhost:12121 | up | leader | 0 |
-| coordinator_2 | localhost:7692 | localhost:10112 | localhost:12122 | up | follower | 16 |
-| coordinator_3 | localhost:7693 | localhost:10113 | localhost:12123 | up | follower | 25 |
-| instance_1 | localhost:7687 | "" | localhost:13011 | up | replica | 39 |
-| instance_2 | localhost:7688 | "" | localhost:13012 | up | replica | 21 |
-| instance_3 | localhost:7689 | "" | localhost:13013 | up | main | 91 |
+## [Setup HA cluster (Docker Compose)](/clustering/high-availability/setup-ha-cluster-docker-compose)
+Learn how to set up a high availability cluster with Docker Compose.
-### Check automatic failover
+## [Setup HA cluster (Kubernetes)](/clustering/high-availability/setup-ha-cluster-k8s)
+Learn how to set up a high availability cluster with K8s and Helm charts.
-Let's say that the current main instance is down for some reason. After `--instance-down-timeout-sec` seconds, the coordinator will realize
-that and automatically promote the first alive replica to become the new main. The output of running `SHOW INSTANCES` on the leader coordinator could then look like:
+## [Querying the cluster](/clustering/high-availability/querying-the-cluster)
+Learn how to query the cluster via the Bolt+routing protocol.
-| name | bolt_server | coordinator_server | management_server | health | role | last_succ_resp_ms |
-| ------------- | -------------- | ------------------ | ----------------- | ------ | -------- | ------------------|
-| coordinator_1 | localhost:7691 | localhost:10111 | localhost:12121 | up | leader | 0 |
-| coordinator_2 | localhost:7692 | localhost:10112 | localhost:12122 | up | follower | 34 |
-| coordinator_3 | localhost:7693 | localhost:10113 | localhost:12123 | up | follower | 28 |
-| instance_1 | localhost:7687 | "" | localhost:13011 | up | main | 61 |
-| instance_2 | localhost:7688 | "" | localhost:13012 | up | replica | 74 |
-| instance_3 | localhost:7689 | "" | localhost:13013 | down | unknown | 71222 |
+## [Best practices](/clustering/high-availability/best-practices)
+Familiarize yourself with the advice we give for running a reliable cluster with Memgraph.
-
+## [HA commands reference guide](/clustering/high-availability/ha-commands-reference)
+The complete list of HA queries at your disposal.
diff --git a/pages/clustering/high-availability/_meta.ts b/pages/clustering/high-availability/_meta.ts
new file mode 100644
index 000000000..47919c1c3
--- /dev/null
+++ b/pages/clustering/high-availability/_meta.ts
@@ -0,0 +1,8 @@
+export default {
+ "setup-ha-cluster-docker": "Setup HA cluster with Docker",
+ "setup-ha-cluster-docker-compose": "Setup HA cluster with Docker Compose",
+ "setup-ha-cluster-k8s": "Setup HA cluster with K8s",
+ "querying-the-cluster": "Querying the cluster",
+ "best-practices": "Best practices",
+ "ha-commands-reference": "Reference commands",
+}
diff --git a/pages/clustering/high-availability/best-practices.mdx b/pages/clustering/high-availability/best-practices.mdx
new file mode 100644
index 000000000..e3207e766
--- /dev/null
+++ b/pages/clustering/high-availability/best-practices.mdx
@@ -0,0 +1,12 @@
+---
+title: Best practices when setting up high availability
+description: Various things for database administrators to bear in mind when deploying high availability with Memgraph.
+---
+
+import { Callout } from 'nextra/components'
+
+# Best practices when setting up high availability
+
+## How can I run data instances or coordinator instances?
+
+Depending on how configuration flags are set, Memgraph can run as a data instance or coordinator instance.
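+
+As a minimal sketch (assuming a native Memgraph binary and using only flags that appear elsewhere in this
+documentation; the values are illustrative), the same binary takes on one role or the other depending on
+whether the coordinator flags are present:
+
+```plaintext
+# Coordinator instance: the coordinator-specific flags select the coordinator role
+memgraph --bolt-port=7691 --coordinator-id=1 --coordinator-port=10111 \
+         --management-port=12121 --coordinator-hostname=coord1
+
+# Data instance: no coordinator flags, only a management port for coordinator communication
+memgraph --bolt-port=7687 --management-port=13011
+```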
diff --git a/pages/clustering/high-availability/ha-commands-reference.mdx b/pages/clustering/high-availability/ha-commands-reference.mdx
new file mode 100644
index 000000000..84def6991
--- /dev/null
+++ b/pages/clustering/high-availability/ha-commands-reference.mdx
@@ -0,0 +1,346 @@
+---
+title: High availability reference queries
+description: Complete reference guide for all high availability commands in Memgraph, including cluster management, instance operations, and monitoring queries.
+---
+
+import { Callout } from 'nextra/components'
+import {CommunityLinks} from '/components/social-card/CommunityLinks'
+
+# High availability reference queries
+
+This page provides a comprehensive reference for all commands available in Memgraph's high availability cluster management.
+
+## Cluster setup commands
+
+### ADD COORDINATOR
+
+Adds a coordinator instance to the cluster.
+
+```cypher
+ADD COORDINATOR coordinatorId WITH CONFIG {"bolt_server": boltServer, "coordinator_server": coordinatorServer, "management_server": managementServer};
+```
+
+**Parameters:**
+- `coordinatorId` (integer): Unique identifier for the coordinator
+- `bolt_server` (string): External bolt server endpoint for client connections
+- `coordinator_server` (string): Server endpoint for Raft communication
+- `management_server` (string): Server endpoint for health checks and cluster management
+
+**Example:**
+```cypher
+ADD COORDINATOR 1 WITH CONFIG {"bolt_server": "127.0.0.1:7691", "coordinator_server": "127.0.0.1:10111", "management_server": "127.0.0.1:12111"};
+```
+
+
+This command needs to be run for all coordinators in the cluster. The order of adding coordinators and data instances doesn't matter.
+
+
+### REGISTER INSTANCE
+
+Registers a data instance to the cluster.
+
+```cypher
+REGISTER INSTANCE instanceName ( AS ASYNC | AS STRICT_SYNC ) ? WITH CONFIG {"bolt_server": boltServer, "management_server": managementServer, "replication_server": replicationServer};
+```
+
+**Parameters:**
+- `instanceName` (string): Unique name for the data instance
+- `AS ASYNC | AS STRICT_SYNC` (string, optional): Optional replication mode (defaults to SYNC)
+- `bolt_server` (string): External bolt server endpoint
+- `management_server` (string): Server endpoint for coordinator communication
+- `replication_server` (string): Server endpoint for data replication
+
+**Replication modes:**
+- `ASYNC`: Asynchronous replication (fastest, potential data loss)
+- `SYNC`: Synchronous replication (balanced, minimal data loss)
+- `STRICT_SYNC`: Strict synchronous replication (slowest, no data loss)
+
+**Example:**
+```cypher
+REGISTER INSTANCE instance_1 WITH CONFIG {"bolt_server": "localhost:7687", "management_server": "instance1:13011", "replication_server": "instance1:10001"};
+```
+
+## Instance management commands
+
+### SET INSTANCE TO MAIN
+
+Promotes a data instance to become the main (writable) instance.
+
+```cypher
+SET INSTANCE instanceName TO main;
+```
+
+**Parameters:**
+- `instanceName` (string): Name of the instance to promote
+
+**Behavior:**
+- Registers all other instances as replicas to the new main
+- Fails if the target instance is unavailable
+- Fails if there's already a main instance in the cluster
+- Results in writing to the Raft log
+
+**Example:**
+```cypher
+SET INSTANCE instance_3 TO main;
+```
+
+### DEMOTE INSTANCE
+
+Demotes the current main instance to replica status.
+
+```cypher
+DEMOTE INSTANCE instanceName;
+```
+
+**Parameters:**
+- `instanceName` (string): Name of the instance to demote
+
+**Behavior:**
+- Manually demotes the main instance to replica
+- Requires manual promotion of another instance to main
+- Results in writing to the Raft log
+
+
+Combining `DEMOTE INSTANCE` and `SET INSTANCE TO MAIN` provides manual failover capability, useful during maintenance work.
+
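+For example, a manual failover during maintenance could look like this (the instance names are illustrative):
+
+```cypher
+// Demote the current MAIN so it continues as a replica
+DEMOTE INSTANCE instance_3;
+
+// Promote another registered instance to take over as MAIN
+SET INSTANCE instance_1 TO main;
+```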
+
+### UNREGISTER INSTANCE
+
+Removes a data instance from the cluster.
+
+```cypher
+UNREGISTER INSTANCE instanceName;
+```
+
+**Parameters:**
+- `instanceName` (string): Name of the instance to remove
+
+**Requirements:**
+- The instance being unregistered must NOT be the main instance
+- The cluster must have an alive main instance during unregistration
+- The instance will be removed from the current main's replica set
+
+**Example:**
+```cypher
+UNREGISTER INSTANCE instance_2;
+```
+
+### REMOVE COORDINATOR
+
+Removes a coordinator instance from the cluster.
+
+```cypher
+REMOVE COORDINATOR coordinatorId;
+```
+
+**Parameters:**
+- `coordinatorId` (integer): ID of the coordinator to remove
+
+**Restrictions:**
+- Can only be executed on the leader coordinator
+- Cannot remove the current leader (prohibited by NuRaft)
+- To remove the current leader, first trigger a leadership change
+
+**Example:**
+```cypher
+REMOVE COORDINATOR 2;
+```
+
+## Cluster state management
+
+### FORCE RESET CLUSTER STATE
+
+Forces a reset of the cluster state when the cluster gets stuck.
+
+```cypher
+FORCE RESET CLUSTER STATE;
+```
+
+**Actions performed:**
+1. Demotes each alive instance to replica
+2. Chooses a new main instance from alive instances
+3. Instances that are down will be demoted to replicas when they come back up
+
+**Usage:**
+- Execute on the leader coordinator
+- Results in writing to the Raft log
+- Use only when the cluster is in an inconsistent state
+
+## Monitoring and information commands
+
+### SHOW INSTANCES
+
+Displays the state of all instances in the cluster.
+
+```cypher
+SHOW INSTANCES;
+```
+
+**Information displayed:**
+- Network endpoints for cluster management
+- Health state of each server
+- Role (main, replica, leader, follower, or unknown)
+- Time since last response to leader's health ping
+
+**Behavior:**
+- Can be run on leader or followers
+- Followers forward the request to the leader for accurate information
+- If the leader is unavailable, followers return instances with a "down" health state
+
+**Example output:**
+```
+| name | bolt_server | coordinator_server | management_server | health | role | last_succ_resp_ms |
+| ------------- | -------------- | ------------------ | ----------------- | ------ | -------- | ---------------- |
+| coordinator_1 | localhost:7691 | localhost:10111 | localhost:12121 | up | leader | 0 |
+| instance_1 | localhost:7687 | "" | localhost:13011 | up | replica | 39 |
+| instance_3 | localhost:7689 | "" | localhost:13013 | up | main | 91 |
+```
+
+### SHOW INSTANCE
+
+Shows information about the current coordinator instance.
+
+```cypher
+SHOW INSTANCE;
+```
+
+**Information returned:**
+- Instance name
+- External bolt server endpoint
+- Coordinator server for Raft communication
+- Management server for inter-coordinator communication
+- Cluster role (leader or follower)
+
+**Note:** If `ADD COORDINATOR` wasn't run for the current instance, the bolt server value will be empty.
+
+### SHOW REPLICATION LAG
+
+Displays the current replication lag for each instance.
+
+```cypher
+SHOW REPLICATION LAG;
+```
+
+**Information provided:**
+- Replication lag expressed as number of committed transactions
+- Made durable through snapshots and WALs
+- Useful for manual failover to check data loss risk
+
+**Usage:**
+- Run on the cluster's leader
+- Helps assess data consistency before failover operations
+
+## Configuration commands
+
+### SET COORDINATOR SETTING
+
+Configures various cluster settings at runtime.
+
+#### Enable reads on main
+
+```cypher
+SET COORDINATOR SETTING 'enabled_reads_on_main' TO 'true'/'false';
+```
+
+**Parameters:**
+- `'enabled_reads_on_main'` (string): Setting name
+- `'true'/'false'` (string): Boolean value as string
+
+Controls whether read queries are allowed on the main instance (default: false).
+
+#### Sync failover only
+
+```cypher
+SET COORDINATOR SETTING 'sync_failover_only' TO 'true'/'false';
+```
+
+**Parameters:**
+- `'sync_failover_only'` (string): Setting name
+- `'true'/'false'` (string): Boolean value as string
+
+Controls whether failover to async replicas is allowed (default: true).
+
+#### Maximum failover replica lag
+
+```cypher
+SET COORDINATOR SETTING 'max_failover_replica_lag' TO '10';
+```
+
+**Parameters:**
+- `'max_failover_replica_lag'` (string): Setting name
+- `'10'` (string): Numeric value as string representing transaction count
+
+Sets the maximum transaction lag allowed during failover. Replicas behind by more than this threshold become ineligible for failover.
+
+#### Maximum replica read lag
+
+```cypher
+SET COORDINATOR SETTING 'max_replica_read_lag_' TO '10';
+```
+
+**Parameters:**
+- `'max_replica_read_lag_'` (string): Setting name
+- `'10'` (string): Numeric value as string representing transaction count
+
+Controls the maximum allowed replica lag for read consistency. Replicas behind by more than this threshold are excluded from read query routing.
+
+### SHOW COORDINATOR SETTINGS
+
+Displays all current coordinator configuration settings.
+
+```cypher
+SHOW COORDINATOR SETTINGS;
+```
+
+Returns all runtime configuration options and their current values.
+
+## Connection and routing
+
+### Bolt+routing protocol
+
+When using high availability, connect to coordinators using the `neo4j://` scheme instead of `bolt://`:
+
+**Standard connection:**
+```
+bolt://<host>:<port>
+```
+
+**HA connection with routing:**
+```
+neo4j://<host>:<port>
+```
+
+**Benefits:**
+- Automatic routing to current main instance
+- Prevents split-brain scenarios
+- Seamless failover handling
+- Write queries always go to the current main
+- Read queries can be distributed across replicas
+
+
+Cluster setup commands (registration, coordinator management) must be done using standard `bolt://` connections, not `neo4j://` routing connections.
+
+
+## Best practices
+
+### Command execution order
+
+1. **Start all instances** (coordinators and data instances)
+2. **Add coordinators** to the cluster
+3. **Register data instances** to the cluster
+4. **Set one instance as main**
+5. **Verify cluster state** with `SHOW INSTANCES` (a consolidated sketch follows below)
+
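+Put together, a minimal registration sequence run on the leader coordinator could look like this
+(the endpoints and names are illustrative, taken from the setup guides):
+
+```cypher
+ADD COORDINATOR 1 WITH CONFIG {"bolt_server": "localhost:7691", "coordinator_server": "localhost:10111", "management_server": "localhost:12121"};
+ADD COORDINATOR 2 WITH CONFIG {"bolt_server": "localhost:7692", "coordinator_server": "localhost:10112", "management_server": "localhost:12122"};
+ADD COORDINATOR 3 WITH CONFIG {"bolt_server": "localhost:7693", "coordinator_server": "localhost:10113", "management_server": "localhost:12123"};
+REGISTER INSTANCE instance_1 WITH CONFIG {"bolt_server": "localhost:7687", "management_server": "localhost:13011", "replication_server": "localhost:10001"};
+REGISTER INSTANCE instance_2 WITH CONFIG {"bolt_server": "localhost:7688", "management_server": "localhost:13012", "replication_server": "localhost:10002"};
+SET INSTANCE instance_1 TO main;
+SHOW INSTANCES;
+```
+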
+### Health monitoring
+
+- Use `SHOW INSTANCES` regularly to monitor cluster health
+- Check `SHOW REPLICATION LAG` before manual failovers
+- Monitor coordinator settings with `SHOW COORDINATOR SETTINGS`
+
+### Failover considerations
+
+- Use `SHOW REPLICATION LAG` to assess data loss risk
+- Consider replication modes when planning failovers
+- Test failover procedures in non-production environments
+
+
diff --git a/pages/clustering/high-availability/querying-the-cluster.mdx b/pages/clustering/high-availability/querying-the-cluster.mdx
new file mode 100644
index 000000000..be0a8d546
--- /dev/null
+++ b/pages/clustering/high-availability/querying-the-cluster.mdx
@@ -0,0 +1,275 @@
+---
+title: Querying the cluster
+description: Learn how to query Memgraph high availability clusters using the bolt+routing protocol with automatic failover and load balancing.
+---
+
+import { Callout } from 'nextra/components'
+import {CommunityLinks} from '/components/social-card/CommunityLinks'
+
+# Querying the cluster
+
+The bolt+routing protocol is Memgraph's solution for connecting to high availability clusters. It automatically routes queries to the appropriate instances and handles failover seamlessly, ensuring your applications remain connected even when cluster topology changes.
+
+## Overview
+
+In a high availability cluster, directly connecting to the main instance isn't recommended because the main instance can change due to various failures. The bolt+routing protocol solves this by providing intelligent routing that:
+
+- **Automatically routes write queries** to the current main instance
+- **Distributes read queries** across available replicas
+- **Handles failover** without application changes
+- **Prevents split-brain scenarios** by ensuring clients never write to old main instances
+
+## How bolt+routing works
+
+### Connection flow
+
+1. **Client connects** to any coordinator instance using `neo4j://` scheme
+2. **Coordinator responds** with a routing table containing:
+ - Instances from which data can be read
+ - The instance where data can be written
+ - Instances acting as routers
+3. **Client uses routing table** to direct subsequent queries appropriately
+
+### Routing table structure
+
+The routing table contains three types of entries:
+
+| Entry Type | Description | Usage |
+|------------|-------------|-------|
+| **Read instances** | Replica instances available for read queries | Load balancing read operations |
+| **Write instance** | Current main instance for write queries | All write operations |
+| **Router instances** | Coordinator instances for routing requests | Future routing requests |
+
+## Connection strings
+
+### Standard vs HA connections
+
+**Standard connection (single instance):**
+```
+bolt://<host>:<port>
+```
+
+**HA connection with routing:**
+```
+neo4j://<host>:<port>
+```
+
+### Connection examples
+
+```javascript
+// Standard connection
+const driver = neo4j.driver("bolt://localhost:7687", auth);
+
+// HA connection with routing
+const driver = neo4j.driver("neo4j://localhost:7691", auth);
+```
+
+## Client-side routing
+
+Bolt+routing is a **client-side routing protocol**, meaning network endpoint resolution happens inside the drivers. This provides several benefits:
+
+- **Transparent failover**: Applications don't need to handle connection changes
+- **Automatic load balancing**: Read queries are distributed across replicas
+- **Consistent routing**: All clients receive the same routing information
+- **Reliability**: Raft consensus ensures accurate routing data
+
+## Cluster roles and routing
+
+### Instance roles in routing
+
+| Role | Function | Query Types |
+|------|----------|-------------|
+| **Main instance** | Primary writable instance | Write queries only (by default) |
+| **Replica instances** | Read-only instances | Read queries |
+| **Coordinator instances** | Routing and cluster management | Routing requests only |
+
+### Routing behavior
+
+- **Write queries**: Always routed to the current main instance
+- **Read queries**: Distributed across available replica instances
+- **Routing requests**: Handled by coordinator instances
+- **Failover**: Automatic promotion of replica to main when needed
+
+## Configuration options
+
+### Enable reads on main
+
+By default, the main instance only handles write queries. You can enable read queries on the main instance:
+
+```cypher
+SET COORDINATOR SETTING 'enabled_reads_on_main' TO 'true';
+```
+
+**Parameters:**
+- `'enabled_reads_on_main'` (string): Setting name
+- `'true'` (string): Enable reads on main instance
+
+### Replica read lag control
+
+Control the maximum allowed replica lag for read consistency:
+
+```cypher
+SET COORDINATOR SETTING 'max_replica_read_lag_' TO '10';
+```
+
+**Parameters:**
+- `'max_replica_read_lag_'` (string): Setting name
+- `'10'` (string): Maximum transaction lag as string
+
+**Behavior:**
+- Replicas behind by more than this threshold are excluded from read routing
+- Ensures data freshness for read operations
+
+## Connection examples by language
+
+### Python
+
+```python
+from neo4j import GraphDatabase
+
+# HA connection
+driver = GraphDatabase.driver("neo4j://localhost:7691", auth=("username", "password"))
+
+# Use the driver normally - routing is handled automatically
+with driver.session() as session:
+ result = session.run("MATCH (n) RETURN count(n)")
+ print(result.single()[0])
+```
+
+### JavaScript/Node.js
+
+```javascript
+const neo4j = require('neo4j-driver');
+
+// HA connection
+const driver = neo4j.driver("neo4j://localhost:7691",
+ neo4j.auth.basic("username", "password"));
+
+// Use the driver normally
+const session = driver.session();
+session.run("MATCH (n) RETURN count(n)")
+ .then(result => console.log(result.records[0].get(0)));
+```
+
+### Java
+
+```java
+import org.neo4j.driver.*;
+
+// HA connection
+Driver driver = GraphDatabase.driver("neo4j://localhost:7691",
+ AuthTokens.basic("username", "password"));
+
+// Use the driver normally
+try (Session session = driver.session()) {
+ Result result = session.run("MATCH (n) RETURN count(n)");
+ System.out.println(result.single().get(0));
+}
+```
+
+### C#
+
+```csharp
+using Neo4j.Driver;
+
+// HA connection
+using var driver = GraphDatabase.Driver("neo4j://localhost:7691",
+ AuthTokens.Basic("username", "password"));
+
+// Use the driver normally
+using var session = driver.Session();
+var result = session.Run("MATCH (n) RETURN count(n)");
+Console.WriteLine(result.Single()[0]);
+```
+
+## Best practices
+
+### Connection management
+
+1. **Use connection pooling**: Most drivers provide built-in connection pooling
+2. **Handle connection failures**: Implement retry logic for transient failures
+3. **Monitor connection health**: Use driver health check features when available
+
+### Query optimization
+
+1. **Use read replicas**: Distribute read queries across replicas for better performance
+2. **Minimize cross-database queries**: Keep queries within the same database when possible
+3. **Use appropriate transaction types**: Read transactions for queries, write transactions for modifications (see the sketch below)
+
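+A sketch of the last point, assuming the official Neo4j Python driver (version 5 or newer): managed
+transaction functions let the driver route read work to replicas and write work to the current main.
+
+```python
+from neo4j import GraphDatabase
+
+driver = GraphDatabase.driver("neo4j://localhost:7691", auth=("username", "password"))
+
+def create_person(tx, name):
+    # Write work: always routed to the current main instance
+    tx.run("CREATE (:Person {name: $name})", name=name)
+
+def count_nodes(tx):
+    # Read work: may be routed to a replica
+    return tx.run("MATCH (n) RETURN count(n) AS cnt").single()["cnt"]
+
+with driver.session() as session:
+    session.execute_write(create_person, "Alice")
+    print(session.execute_read(count_nodes))
+
+driver.close()
+```
+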
+### Error handling
+
+```python
+import time
+
+from neo4j import GraphDatabase
+from neo4j.exceptions import ServiceUnavailable
+
+def execute_with_retry(driver, query, max_retries=3):
+ for attempt in range(max_retries):
+ try:
+ with driver.session() as session:
+                # Materialize the result before the session closes
+                return list(session.run(query))
+ except ServiceUnavailable as e:
+ if attempt == max_retries - 1:
+ raise e
+ time.sleep(2 ** attempt) # Exponential backoff
+```
+
+## Troubleshooting
+
+### Common issues
+
+**Connection refused errors:**
+- Verify coordinator instances are running
+- Check firewall settings for coordinator ports
+- Ensure proper network connectivity
+
+**Routing table errors:**
+- Verify cluster state with `SHOW INSTANCES`
+- Check coordinator health and leadership
+- Ensure proper cluster configuration
+
+**Query routing issues:**
+- Verify instance roles (main/replica)
+- Check replication lag settings
+- Monitor cluster health metrics
+
+### Debugging connections
+
+1. **Check cluster state:**
+ ```cypher
+ SHOW INSTANCES;
+ ```
+
+2. **Verify coordinator settings:**
+ ```cypher
+ SHOW COORDINATOR SETTINGS;
+ ```
+
+3. **Monitor replication lag:**
+ ```cypher
+ SHOW REPLICATION LAG;
+ ```
+
+## Limitations and considerations
+
+### Cluster setup restrictions
+
+- **Setup commands must use bolt://**: Cluster management commands (registration, coordinator setup) require direct `bolt://` connections
+- **Routing only for data queries**: `neo4j://` connections only handle data-oriented queries, not cluster management
+
+### Network requirements
+
+- **Stable network**: Requires reliable network connectivity between instances
+- **Port accessibility**: All coordinator and data instance ports must be accessible
+- **DNS resolution**: Use DNS names instead of IP addresses for better reliability
+
+### Performance considerations
+
+- **Routing overhead**: Small additional latency for routing table requests
+- **Connection pooling**: May need to adjust pool sizes for HA workloads
+- **Load balancing**: Read distribution depends on replica availability
+
+
+For detailed examples of bolt+routing usage in different programming languages, check the [Memgraph drivers repository](https://github.com/memgraph/memgraph/tree/master/tests/drivers).
+
+
+
diff --git a/pages/clustering/high-availability/setup-ha-cluster-docker-compose.mdx b/pages/clustering/high-availability/setup-ha-cluster-docker-compose.mdx
new file mode 100644
index 000000000..3397a1b1c
--- /dev/null
+++ b/pages/clustering/high-availability/setup-ha-cluster-docker-compose.mdx
@@ -0,0 +1,306 @@
+---
+title: Setup high availability cluster with Docker Compose
+description: See how one can set up a high availability cluster with Docker Compose.
+---
+
+import { Callout } from 'nextra/components'
+import { Steps } from 'nextra/components'
+
+# Setup high availability cluster with Docker Compose (Enterprise)
+
+## Docker Compose
+
+The following example shows you how to set up a Memgraph cluster using Docker Compose.
+The cluster will use a user-defined bridge network.
+
+
+
+**In production, it is always advised to run each instance on its own dedicated server.** The setup, however, is
+very similar. Make sure that the IPs of the machines are reachable from inside the cluster so that the instances
+can be registered properly.
+
+
+
+
+
+### Start the Docker Compose
+
+You can directly use the following `docker-compose.yml` to start the cluster using `docker compose up`,
+as shown right after the file below. The only things you need to modify are your license organization name
+and your Enterprise license key.
+
+```
+services:
+ coord1:
+ image: "memgraph/memgraph"
+ container_name: coord1
+ volumes:
+ - mg_lib1:/var/lib/memgraph
+ - mg_log1:/var/log/memgraph
+ command: [
+ "--log-level=TRACE",
+ "--also-log-to-stderr=true",
+ "--bolt-port=7691",
+ "--coordinator-id=1",
+ "--coordinator-port=10111",
+ "--management-port=12121",
+ "--coordinator-hostname=coord1",
+ "--nuraft-log-file=/var/log/memgraph/nuraft"
+ ]
+ networks:
+ memgraph_ha:
+ ipv4_address: 172.21.0.4
+ ports:
+ - "7691:7691"
+ environment:
+ - MEMGRAPH_ORGANIZATION_NAME=
+ - MEMGRAPH_ENTERPRISE_LICENSE=
+ depends_on:
+ - instance1
+ - instance2
+ - instance3
+
+ coord2:
+ image: "memgraph/memgraph"
+ container_name: coord2
+ volumes:
+ - mg_lib2:/var/lib/memgraph
+ - mg_log2:/var/log/memgraph
+ command: [
+ "--log-level=TRACE",
+ "--also-log-to-stderr=true",
+ "--bolt-port=7692",
+ "--coordinator-id=2",
+ "--coordinator-port=10112",
+ "--management-port=12122",
+ "--coordinator-hostname=coord2",
+ "--nuraft-log-file=/var/log/memgraph/nuraft"
+ ]
+ networks:
+ memgraph_ha:
+ ipv4_address: 172.21.0.2
+ ports:
+ - "7692:7692"
+ environment:
+ - MEMGRAPH_ORGANIZATION_NAME=
+ - MEMGRAPH_ENTERPRISE_LICENSE=
+ depends_on:
+ - instance1
+ - instance2
+ - instance3
+
+ coord3:
+ image: "memgraph/memgraph"
+ container_name: coord3
+ volumes:
+ - mg_lib3:/var/lib/memgraph
+ - mg_log3:/var/log/memgraph
+ command: [
+ "--log-level=TRACE",
+ "--also-log-to-stderr=true",
+ "--bolt-port=7693",
+ "--coordinator-id=3",
+ "--coordinator-port=10113",
+ "--management-port=12123",
+ "--coordinator-hostname=coord3",
+ "--nuraft-log-file=/var/log/memgraph/nuraft"
+ ]
+
+ networks:
+ memgraph_ha:
+ ipv4_address: 172.21.0.3
+ ports:
+ - "7693:7693"
+ environment:
+ - MEMGRAPH_ORGANIZATION_NAME=
+ - MEMGRAPH_ENTERPRISE_LICENSE=
+ depends_on:
+ - instance1
+ - instance2
+ - instance3
+
+ instance1:
+ image: "memgraph/memgraph"
+ container_name: instance1
+ volumes:
+ - mg_lib4:/var/lib/memgraph
+ - mg_log4:/var/log/memgraph
+ command: [
+ "--log-level=TRACE",
+ "--also-log-to-stderr=true",
+ "--bolt-port=7687",
+ "--management-port=13011"
+ ]
+ networks:
+ memgraph_ha:
+ ipv4_address: 172.21.0.6
+ ports:
+ - "7687:7687"
+ environment:
+ - MEMGRAPH_ORGANIZATION_NAME=
+ - MEMGRAPH_ENTERPRISE_LICENSE=
+
+ instance2:
+ image: "memgraph/memgraph"
+ container_name: instance2
+ volumes:
+ - mg_lib5:/var/lib/memgraph
+ - mg_log5:/var/log/memgraph
+ command: [
+ "--log-level=TRACE",
+ "--also-log-to-stderr=true",
+ "--bolt-port=7688",
+ "--management-port=13012"
+ ]
+ networks:
+ memgraph_ha:
+ ipv4_address: 172.21.0.7
+ ports:
+ - "7688:7688"
+ environment:
+ - MEMGRAPH_ORGANIZATION_NAME=
+ - MEMGRAPH_ENTERPRISE_LICENSE=
+
+ instance3:
+ image: "memgraph/memgraph"
+ container_name: instance3
+ volumes:
+ - mg_lib6:/var/lib/memgraph
+ - mg_log6:/var/log/memgraph
+ command: [
+ "--log-level=TRACE",
+ "--also-log-to-stderr=true",
+ "--bolt-port=7689",
+ "--management-port=13013"
+ ]
+ networks:
+ memgraph_ha:
+ ipv4_address: 172.21.0.8
+ ports:
+ - "7689:7689"
+ environment:
+ - MEMGRAPH_ORGANIZATION_NAME=
+ - MEMGRAPH_ENTERPRISE_LICENSE=
+
+volumes:
+ mg_lib1:
+ mg_lib2:
+ mg_lib3:
+ mg_lib4:
+ mg_lib5:
+ mg_lib6:
+ mg_log1:
+ mg_log2:
+ mg_log3:
+ mg_log4:
+ mg_log5:
+ mg_log6:
+
+networks:
+ memgraph_ha:
+ name: memgraph_ha
+ driver: bridge
+ ipam:
+ driver: default
+ config:
+ - subnet: "172.21.0.0/16"
+```
+
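+With the file saved as `docker-compose.yml`, the cluster can be started in the background (the `-d`
+flag is optional and simply detaches the containers):
+
+```bash
+docker compose up -d
+```
+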
+### Validate license is correctly set
+
+All of the following queries can be run directly on coordinator 1, which we can assume to be the
+leader of the cluster.
+
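+For example, assuming `mgconsole` is installed on the host, you can open a client session against
+coordinator 1 with:
+
+```
+mgconsole --host 127.0.0.1 --port 7691
+```
+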
+First, let's validate that the license has been correctly set by executing the following query:
+
+```
+SHOW LICENSE INFO;
+```
+
+```nocopy
+```
+
+### Register the coordinator instances
+
+Next, we register all the coordinators in the cluster.
+
+The following query self-registers coordinator 1 and also needs to be executed:
+```
+ADD COORDINATOR 1 WITH CONFIG {
+ "bolt_server": "localhost:7691",
+ "coordinator_server": "coord1:10111",
+ "management_server": "coord1:12121"
+};
+```
+
+Next up, we register the two other coordinators in the cluster:
+```
+ADD COORDINATOR 2 WITH CONFIG {
+ "bolt_server": "localhost:7692",
+ "coordinator_server": "coord2:10112",
+ "management_server": "coord2:12122"
+};
+ADD COORDINATOR 3 WITH CONFIG {
+ "bolt_server": "localhost:7693",
+ "coordinator_server": "coord3:10113",
+ "management_server": "coord3:12123"
+};
+```
+
+We can see the state of the cluster by executing the following query:
+```
+SHOW INSTANCES;
+```
+
+```nocopy
+```
+
+We observe that there is indeed one leader and two followers in the cluster.
+
+### Register the data instances
+
+We continue by registering the 3 data instances:
+```
+REGISTER INSTANCE instance_1 WITH CONFIG {
+ "bolt_server": "localhost:7687",
+ "management_server": "instance1:13011",
+ "replication_server": "instance1:10001"
+};
+REGISTER INSTANCE instance_2 WITH CONFIG {
+ "bolt_server": "localhost:7688",
+ "management_server": "instance2:13012",
+ "replication_server": "instance2:10002"
+};
+REGISTER INSTANCE instance_3 WITH CONFIG {
+ "bolt_server": "localhost:7689",
+ "management_server": "instance3:13013",
+ "replication_server": "instance3:10003"
+};
+```
+
+### Set one of the instances as MAIN
+
+We promote one of the instances to MAIN. The rest will serve as REPLICAs.
+
+```
+SET INSTANCE instance_3 TO main;
+```
+
+### Observe the state of the cluster
+
+By issuing the command again:
+
+```
+SHOW INSTANCES;
+```
+
+We can observe that there is indeed one MAIN instance and 2 REPLICA instances:
+
+```nocopy
+```
+
+Since the host can't resolve the IPs of the coordinators and data instances, the Bolt
+servers in the Docker Compose setup require `bolt_server` to be set to `localhost:<bolt_port>`.
+
+The cluster can be shut down using `docker compose down`.
+
+
\ No newline at end of file
diff --git a/pages/clustering/high-availability/setup-ha-cluster-docker.mdx b/pages/clustering/high-availability/setup-ha-cluster-docker.mdx
new file mode 100644
index 000000000..f1a4ec9b7
--- /dev/null
+++ b/pages/clustering/high-availability/setup-ha-cluster-docker.mdx
@@ -0,0 +1,205 @@
+---
+title: Setup high availability cluster with Docker
+description: See how one can set up a high availability cluster with Docker.
+---
+
+import { Callout } from 'nextra/components'
+import { Steps } from 'nextra/components'
+
+# Setup high availability cluster with Docker (Enterprise)
+
+This example shows how to set up a highly available cluster in Memgraph using
+3 coordinators and 3 data instances. This example was created locally on a single server.
+
+
+
+**In production, it is always advised to run each instance on its own dedicated server.** The setup, however, is
+very similar. Make sure that the IPs of the machines are reachable from inside the cluster so that the instances
+can be registered properly.
+
+
+
+
+
+### Start all instances
+
+1. Start coordinator1:
+```plaintext
+docker run --name coord1 --network=host -p 7691:7691 -p 7444:7444 \
+  -v mg_lib1:/var/lib/memgraph \
+  -v mg_log1:/var/log/memgraph \
+  -e MEMGRAPH_ORGANIZATION_NAME= \
+  -e MEMGRAPH_ENTERPRISE_LICENSE="" \
+  memgraph/memgraph-mage \
+  --bolt-port=7691 \
+  --log-level=TRACE \
+  --also-log-to-stderr \
+  --coordinator-id=1 \
+  --coordinator-port=10111 \
+  --management-port=12121 \
+  --coordinator-hostname=localhost \
+  --nuraft-log-file=/var/log/memgraph/nuraft
+```
+
+2. Start coordinator2:
+```plaintext
+docker run --name coord2 --network=host -p 7692:7692 -p 7445:7444 \
+  -v mg_lib2:/var/lib/memgraph \
+  -v mg_log2:/var/log/memgraph \
+  -e MEMGRAPH_ORGANIZATION_NAME= \
+  -e MEMGRAPH_ENTERPRISE_LICENSE="" \
+  memgraph/memgraph-mage \
+  --bolt-port=7692 \
+  --log-level=TRACE \
+  --also-log-to-stderr \
+  --coordinator-id=2 \
+  --coordinator-port=10112 \
+  --management-port=12122 \
+  --coordinator-hostname=localhost \
+  --nuraft-log-file=/var/log/memgraph/nuraft
+```
+
+3. Start coordinator3:
+```plaintext
+docker run --name coord3 --network=host -p 7693:7693 -p 7446:7444 \
+  -v mg_lib3:/var/lib/memgraph \
+  -v mg_log3:/var/log/memgraph \
+  -e MEMGRAPH_ORGANIZATION_NAME= \
+  -e MEMGRAPH_ENTERPRISE_LICENSE="" \
+  memgraph/memgraph-mage \
+  --bolt-port=7693 \
+  --log-level=TRACE \
+  --also-log-to-stderr \
+  --coordinator-id=3 \
+  --coordinator-port=10113 \
+  --management-port=12123 \
+  --coordinator-hostname=localhost \
+  --nuraft-log-file=/var/log/memgraph/nuraft
+```
+
+4. Start instance1:
+```plaintext
+docker run --name instance1 --network=host -p 7687:7687 -p 7447:7444 \
+  -v mg_lib4:/var/lib/memgraph \
+  -v mg_log4:/var/log/memgraph \
+  -e MEMGRAPH_ORGANIZATION_NAME= \
+  -e MEMGRAPH_ENTERPRISE_LICENSE="" \
+  memgraph/memgraph-mage \
+  --bolt-port=7687 \
+  --log-level=TRACE \
+  --also-log-to-stderr \
+  --management-port=13011
+```
+
+5. Start instance2:
+```plaintext
+docker run --name instance2 --network=host -p 7688:7688 -p 7448:7444 \
+  -v mg_lib5:/var/lib/memgraph \
+  -v mg_log5:/var/log/memgraph \
+  -e MEMGRAPH_ORGANIZATION_NAME= \
+  -e MEMGRAPH_ENTERPRISE_LICENSE="" \
+  memgraph/memgraph-mage \
+  --bolt-port=7688 \
+  --log-level=TRACE \
+  --also-log-to-stderr \
+  --management-port=13012
+```
+
+6. Start instance3:
+```plaintext
+docker run --name instance3 --network=host -p 7689:7689 -p 7449:7444 \
+  -v mg_lib6:/var/lib/memgraph \
+  -v mg_log6:/var/log/memgraph \
+  -e MEMGRAPH_ORGANIZATION_NAME= \
+  -e MEMGRAPH_ENTERPRISE_LICENSE="" \
+  memgraph/memgraph-mage \
+  --bolt-port=7689 \
+  --log-level=TRACE \
+  --also-log-to-stderr \
+  --management-port=13013
+```
+
+### Register instances
+
+1. Connect with any Memgraph client to any coordinator. Here we chose coordinator 1.
+```plaintext
+mgconsole --port=7691
+```
+2. Add coordinator instances to the cluster.
+
+```plaintext
+ADD COORDINATOR 1 WITH CONFIG {
+ "bolt_server": "localhost:7691",
+ "coordinator_server": "localhost:10111",
+ "management_server": "localhost:12121"
+};
+ADD COORDINATOR 2 WITH CONFIG {
+ "bolt_server": "localhost:7692",
+ "coordinator_server": "localhost:10112",
+ "management_server": "localhost:12122"
+};
+ADD COORDINATOR 3 WITH CONFIG {
+ "bolt_server": "localhost:7693",
+ "coordinator_server": "localhost:10113",
+ "management_server": "localhost:12123"
+};
+```
+
+3. Register 3 data instances as part of the cluster:
+
+
+
+Replace `` with the container's IP address. This is necessary for Docker deployments where instances are not on the local host.
+
+
+
+```plaintext
+REGISTER INSTANCE instance_1 WITH CONFIG {
+ "bolt_server": "localhost:7687",
+ "management_server": "localhost:13011",
+ "replication_server": "localhost:10001"
+};
+REGISTER INSTANCE instance_2 WITH CONFIG {
+ "bolt_server": "localhost:7688",
+ "management_server": "localhost:13012",
+ "replication_server": "localhost:10002"
+};
+REGISTER INSTANCE instance_3 WITH CONFIG {
+ "bolt_server": "localhost:7689",
+ "management_server": "localhost:13013",
+ "replication_server": "localhost:10003"
+};
+```
+
+4. Set instance_3 as MAIN:
+
+```plaintext
+SET INSTANCE instance_3 TO MAIN;
+```
+
+5. Connect to the leader coordinator and check the cluster state with `SHOW INSTANCES;`
+
+| name | bolt_server | coordinator_server | management_server | health | role | last_succ_resp_ms |
+| ------------- | -------------- | ------------------ | ----------------- | ------ | -------- | ---------------- |
+| coordinator_1 | localhost:7691 | localhost:10111 | localhost:12121 | up | leader | 0 |
+| coordinator_2 | localhost:7692 | localhost:10112 | localhost:12122 | up | follower | 16 |
+| coordinator_3 | localhost:7693 | localhost:10113 | localhost:12123 | up | follower | 25 |
+| instance_1 | localhost:7687 | "" | localhost:13011 | up | replica | 39 |
+| instance_2 | localhost:7688 | "" | localhost:13012 | up | replica | 21 |
+| instance_3 | localhost:7689 | "" | localhost:13013 | up | main | 91 |
+
+### Check automatic failover
+
+Let's say that the current main instance is down for some reason. After `--instance-down-timeout-sec` seconds, the coordinator will realize
+that and automatically promote the first alive replica to become the new main. The output of running `SHOW INSTANCES` on the leader coordinator could then look like:
+
+| name | bolt_server | coordinator_server | management_server | health | role | last_succ_resp_ms |
+| ------------- | -------------- | ------------------ | ----------------- | ------ | -------- | ------------------|
+| coordinator_1 | localhost:7691 | localhost:10111 | localhost:12121 | up | leader | 0 |
+| coordinator_2 | localhost:7692 | localhost:10112 | localhost:12122 | up | follower | 34 |
+| coordinator_3 | localhost:7693 | localhost:10113 | localhost:12123 | up | follower | 28 |
+| instance_1 | localhost:7687 | "" | localhost:13011 | up | main | 61 |
+| instance_2 | localhost:7688 | "" | localhost:13012 | up | replica | 74 |
+| instance_3 | localhost:7689 | "" | localhost:13013 | down | unknown | 71222 |
+
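+The failover detection window is controlled by the coordinators' `--instance-down-timeout-sec` flag.
+As a sketch (the value is only an example, not a recommendation), it can be appended to the coordinator
+startup commands shown above:
+
+```plaintext
+docker run --name coord1 --network=host ... memgraph/memgraph-mage \
+  --bolt-port=7691 --coordinator-id=1 --coordinator-port=10111 \
+  --management-port=12121 --coordinator-hostname=localhost \
+  --instance-down-timeout-sec=5
+```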
+
diff --git a/pages/clustering/high-availability/setup-ha-cluster-k8s.mdx b/pages/clustering/high-availability/setup-ha-cluster-k8s.mdx
new file mode 100644
index 000000000..ab99b6431
--- /dev/null
+++ b/pages/clustering/high-availability/setup-ha-cluster-k8s.mdx
@@ -0,0 +1,251 @@
+---
+title: Setup high availability cluster with Kubernetes
+description: See how one can set up a high availability cluster with Kubernetes.
+---
+
+import { Callout } from 'nextra/components'
+import { Steps } from 'nextra/components'
+
+# Setup high availability cluster with K8s (Enterprise)
+
+
+## Kubernetes
+
+We support deploying Memgraph HA as part of the Kubernetes cluster through Helm charts.
+You can see example configurations [here](/getting-started/install-memgraph/kubernetes#memgraph-high-availability-helm-chart).
+
+## In-Service Software Upgrade (ISSU)
+
+Memgraph’s **High Availability** supports in-service software upgrades (ISSU).
+This guide explains the process when using [HA Helm
+charts](/getting-started/install-memgraph/kubernetes#memgraph-high-availability-helm-chart).
+The procedure is very similar for native deployments.
+
+
+
+**Important**: Although the upgrade process is designed to complete
+successfully, unexpected issues may occur. We strongly recommend doing a backup
+of your `lib` directory on all of your `StatefulSets` or native instances
+depending on the deployment type.
+
+
+
+
+
+### Prerequisites
+
+If you are using **HA Helm charts**, set the following configuration before
+doing any upgrade.
+
+ ```yaml
+ updateStrategy.type: OnDelete
+ ```
+
+Depending on the infrastructure on which your Memgraph cluster runs, the
+details will differ a bit, but the backbone is the same.
+
+Prepare a backup of all data from all instances. This ensures you can safely
+downgrade the cluster to the last stable version you had.
+
+ - For **native deployments**, tools like `cp` or `rsync` are sufficient.
+ - For **Kubernetes**, create a `VolumeSnapshotClass` with a YAML file similar
+   to this:
+
+ ```yaml
+ apiVersion: snapshot.storage.k8s.io/v1
+ kind: VolumeSnapshotClass
+ metadata:
+ name: csi-azure-disk-snapclass
+ driver: disk.csi.azure.com
+ deletionPolicy: Delete
+ ```
+
+ Apply it:
+
+ ```bash
+ kubectl apply -f azure_class.yaml
+ ```
+
+ - On **Google Kubernetes Engine**, the default CSI driver is
+ `pd.csi.storage.gke.io` so make sure to change the field `driver`.
+ - On **AWS EKS**, refer to the [AWS snapshot controller
+ docs](https://docs.aws.amazon.com/eks/latest/userguide/csi-snapshot-controller.html).
+
+
+### Create snapshots
+
+Now you can create a `VolumeSnapshot` of the lib directory using the yaml file:
+
+```yaml
+apiVersion: snapshot.storage.k8s.io/v1
+kind: VolumeSnapshot
+metadata:
+ name: coord-3-snap # Use a unique name for each instance
+ namespace: default
+spec:
+ volumeSnapshotClassName: csi-azure-disk-snapclass
+ source:
+ persistentVolumeClaimName: memgraph-coordinator-3-lib-storage-memgraph-coordinator-3-0
+```
+
+Apply it:
+
+```bash
+kubectl apply -f azure_snapshot.yaml
+```
+
+Repeat for every instance in the cluster.
+
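+To confirm the snapshots were created and are ready to use (assuming the snapshot CRDs are installed
+in your cluster), you can list them:
+
+```bash
+kubectl get volumesnapshot -n default
+```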
+
+### Update configuration
+
+Next, you should update the `image.tag` field in the `values.yaml` configuration file
+to the version to which you want to upgrade your cluster.
+
+1. In your `values.yaml`, update the image version:
+
+ ```yaml
+ image:
+ tag:
+ ```
+2. Apply the upgrade (an illustrative invocation follows after this list):
+
+ ```bash
+ helm upgrade -f
+ ```
+
+   Since we are using `updateStrategy.type=OnDelete`, this step will not restart
+   any pod; rather, it will just prepare the pods for running the new version.
+ - For **native deployments**, ensure the new binary is available.
+
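+For illustration only, a concrete Helm invocation for step 2 might look like this (the release name and
+chart reference below are hypothetical; use the ones from your own installation):
+
+```bash
+# Hypothetical release and chart names; replace with your own
+helm upgrade memgraph-ha memgraph/memgraph-high-availability -f values.yaml
+```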
+
+### Upgrade procedure (zero downtime)
+
+Our procedure for achieving zero-downtime upgrades consists of restarting one
+instance at a time. Memgraph uses **primary–secondary replication**. To avoid
+downtime:
+
+1. Upgrade **replicas** first.
+2. Upgrade the **main** instance.
+3. Upgrade **coordinator followers**, then the **leader**.
+
+To find out on which pod/server the current main and the current
+cluster leader sit, run:
+
+```cypher
+SHOW INSTANCES;
+```
+
+
+### Upgrade replicas
+
+If you are using K8s, the upgrade can be performed by deleting the pod. Start by
+deleting the replica pod (in this example, the replica is running on the pod
+`memgraph-data-1-0`):
+
+```bash
+kubectl delete pod memgraph-data-1-0
+```
+
+**Native deployment:** stop the old binary and start the new one.
+
+Before starting the upgrade of the next pod, it is important to wait until all
+pods are ready. Otherwise, you may end up with data loss. On K8s you can
+easily achieve that by running:
+
+```bash
+kubectl wait --for=condition=ready pod --all
+```
+
+For the native deployment, manually check that all your instances are alive.
+
+This step should be repeated for all of your replicas in the cluster.
+
+
+### Upgrade the main
+
+Before deleting the main pod, check replication lag to see whether replicas are
+behind MAIN:
+
+```cypher
+SHOW REPLICATION LAG;
+```
+
+If replicas are behind, your upgrade will be prone to data loss. In order to
+achieve a zero-downtime upgrade without any data loss, either:
+
+ - Use `STRICT_SYNC` mode (writes will be blocked during upgrade), or
+ - Wait until replicas are fully caught up, then pause writes. This way, you
+can use any replication mode. Read queries should, however, work without any
+issues regardless of the replication mode you are using.
+
+Upgrade the main pod:
+
+```bash
+kubectl delete pod memgraph-data-0-0
+kubectl wait --for=condition=ready pod --all
+```
+
+
+### Upgrade coordinators
+
+The upgrade of coordinators is done in exactly the same way. Start by upgrading
+the followers and finish by deleting the leader pod:
+
+```bash
+kubectl delete pod memgraph-coordinator-3-0
+kubectl wait --for=condition=ready pod --all
+
+kubectl delete pod memgraph-coordinator-2-0
+kubectl wait --for=condition=ready pod --all
+
+kubectl delete pod memgraph-coordinator-1-0
+kubectl wait --for=condition=ready pod --all
+```
+
+
+
+### Verify upgrade
+
+Your upgrade should be finished now. To check that everything works, run:
+
+```cypher
+SHOW VERSION;
+```
+
+It should show you the new Memgraph version.
+
+
+### Rollback
+
+If an error happens during the upgrade, or something doesn't work even after
+upgrading all of your pods (e.g., write queries don't pass), you can safely
+downgrade your cluster to the previous version using the `VolumeSnapshots` you
+took on K8s or the file backups for native deployments.
+
+- **Kubernetes** (see the consolidated sketch after this list):
+
+ ```bash
+ helm uninstall
+ ```
+
+ In `values.yaml`, for all instances set:
+
+ ```yaml
+ restoreDataFromSnapshot: true
+ ```
+
+  Make sure to set the correct name of the snapshot you will use to recover your
+  instances.
+
+- **Native deployments:** restore from your file backups.
+
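+Putting the Kubernetes steps together, a rollback sketch could look like this (the release name and chart
+reference are hypothetical, and reinstalling with the updated `values.yaml` is assumed to be the final step):
+
+```bash
+# Hypothetical release and chart names; replace with your own
+helm uninstall memgraph-ha
+# After setting restoreDataFromSnapshot: true and the snapshot names in values.yaml:
+helm install memgraph-ha memgraph/memgraph-high-availability -f values.yaml
+```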
+
+
+
+If you're doing an upgrade on `minikube`, it is important to make sure that the
+snapshot resides on the same node on which the `StatefulSet` is installed.
+Otherwise, it won't be able to restore the `StatefulSet`'s attached
+`PersistentVolumeClaim` from the `VolumeSnapshot`.
+
+
diff --git a/pages/clustering/replication.mdx b/pages/clustering/replication.mdx
index d4c66ee81..b36eb3e50 100644
--- a/pages/clustering/replication.mdx
+++ b/pages/clustering/replication.mdx
@@ -4,764 +4,34 @@ description: Dive into the documentation page for Memgraph and learn how to conf
---
import { Callout } from 'nextra/components'
+import { CommunityLinks } from '/components/social-card/CommunityLinks'
-# Replication
-
-
-
-Instances need to remember their role and configuration details in a replication
-cluster upon restart, and the `--replication-restore-state-on-startup` needs to
-be set to `true` when first initializing the instances and remain `true`
-throughout the instances' lifetime for replication to work correctly. If the
-flag is set to `false`, MAIN can't communicate with instance, because each
-REPLICA has a UUID of MAIN which can communicate with it, and it is set up only
-on instance registration. In case the flag is set to `false`, the way to go
-forward is first to unregister the instance on MAIN and register it again.
-
-When reinstating a cluster, it is advised first to initialize the MAIN instance,
-then the REPLICA instances.
-
-Data replication currently **works only in the in-memory transactional [storage
-mode](/fundamentals/storage-memory-usage)**.
-
-If you're using in-memory analytical storage mode for the fast import, please first import your data, then set up the replication.
-
-
-
-When distributing data across several instances, Memgraph uses replication to
-provide a satisfying ratio of the following properties, known from the CAP
-theorem:
-
-1. **Consistency** (C) - every node has the same view of data at a given point
- in time
-2. **Availability** (A) - all clients can find a replica of the data, even in
- the case of a partial node failure
-3. **Partition tolerance** (P) - the system continues to work as expected
- despite a partial network failure
-
-In the replication process, the data is replicated from one storage (MAIN
-instance) to another (REPLICA instances).
+# How to set up replication with Memgraph (Community)
-From version 2.4 it is no longer possible to specify a timeout when registering
-a sync replica. To mimic this behavior in higher releases, please use ASYNC
-replication instead.
+This guide is for **Memgraph Community** users who want to set up data replication across multiple instances.
+If you have a **Memgraph Enterprise** license, we recommend using the [high availability features](/clustering/high-availability) instead, which provide automatic failover,
+load balancing, and comprehensive cluster management capabilities.
-## Data replication implementation basics
-
-In Memgraph, all instances are MAIN upon starting. When creating a replication
-cluster, one instance has to be chosen as the MAIN instance. The rest of the
-instances have to be demoted to REPLICA roles and have a port defined using a
-Cypher query.
-
-
-
-
-For replication, ensure all machines (Main and Replica instances) have exactly
-the same amount of RAM and the same CPU. This uniformity is crucial for
-consistent performance and reliability.
-
-
-
-If you want instances to remember their role and configuration in a replication
-cluster upon restart, they need to be initialized with the
-`--replication-restore-state-on-startup` set to `true` and remain `true`
-throughout the instances' lifetime. Otherwise and by default, restarted
-instances will start as MAIN instances disconnected from any replication
-cluster.
-
-Once demoted to REPLICA instances, they will no longer accept write queries. In
-order to start the replication, each REPLICA instance needs to be registered
-from the MAIN instance by setting a replication mode (SYNC, ASYNC or
-STRICT_SYNC) and specifying the REPLICA instance's socket address.
-
-The replication mode defines the terms by which the MAIN instance can commit the
-changes to the database, thus modifying the system to prioritize either
-consistency or availability:
-
-
-- **STRICT_SYNC** - After committing a transaction, the MAIN instance will
-communicate the changes to all REPLICA instances and wait until it receives a
-response or information that a timeout is reached. The STRICT_SYNC mode ensures
-consistency and partition tolerance (CP), but not availability for writes. If
-the primary database has multiple replicas, the system is highly available for
-reads. But, when a replica fails, the MAIN instance can't process the write due
-to the nature of synchronous replication. It is implemented as two-phase commit
-protocol.
-
-
-- **SYNC** - After committing a transaction, the MAIN instance will communicate
-the changes to all REPLICA instances and wait until it receives a response or
-information that a timeout is reached. It is different from **STRICT_SYNC** mode
-because it the MAIN can continue committing even in situations when **SYNC**
-replica is down.
-
-
-- **ASYNC** - The MAIN instance will commit a transaction without receiving
- confirmation from REPLICA instances that they have received the same
- transaction. ASYNC mode ensures system availability and partition tolerance
- (AP), while data can only be eventually consistent.
-
-
-
-
-Users are advised to use the same value for configuration flag
-`--storage-wal-file-flush-every-n-txn` on MAIN and SYNC REPLICAs. Otherwise, the
-situation could occur in which there is a data which is fsynced on REPLICA and
-not on MAIN. In the case MAIN crashes, this could leave to conflicts in system
-that would need to be manually resolved by users.
-
-
-
-Once the REPLICA instances are registered, data storage of the MAIN instance is
-replicated and synchronized using transaction timestamps and durability files
-(snapshot files and WALs). Memgraph does not support replication of
-authentication configurations, query and authentication modules, and audit logs.
-
-By using the timestamp, the MAIN instance knows the current state of the
-REPLICA. If the REPLICA is not synchronized with the MAIN instance, the MAIN
-instance sends the correct data for synchronization kept as deltas within WAL
-files. Deltas are the smallest possible updates of the database, but they carry
-enough information to synchronize the data on a REPLICA. Memgraph stores only
-`remove` actions as deltas, for example, `REMOVE key:value ON node_id`.
-
-If the REPLICA is so far behind the MAIN instance that the synchronization using
-WAL files and deltas within it is impossible, Memgraph will use snapshots to
-synchronize the REPLICA to the state of the MAIN instance.
-
-From Memgraph version 2.15, a REPLICA instance has integrated support to only
-listen to one MAIN. This part is introduced to support the high availability but
-also reflects on the replication. The mechanism that is used is a unique
-identifier that which MAIN instance sends to all REPLICAs when REPLICA is first
-registered on a MAIN. A REPLICA stores the UUID of the MAIN instance it listens
-to. The MAIN's UUID is also stored on a disk, in case of restart of an instance
-to continue listening to the correct MAIN instance. When REPLICA restarts,
-`--replication-restore-state-on-startup` must be set to `true` to continue
-getting updates from the MAIN.
-
-## Auth data replication (Enterprise)
-
-If you are using a Memgraph Enterprise license, all authentication/authorization
-data, including users, roles, and associated permissions, will be replicated.
-
-## Auth modules replication (Enterprise)
-
-Authentication modules are not replicated and must be configured manually by the
-administrator.
-
-## Multi-tenant data replication (Enterprise)
-
-When you are using a Memgraph Enterprise license, multi-tenant commands are
-replicated as any other data command. Database manipulation is allowed only on
-MAIN. However, REPLICAs have the ability to use databases and read data
-contained in them.
-
-When dropping a database used on a REPLICA, the REPLICA will receive the command
-and will partially drop the database. It will hide the database and prevent any
-new usage. Once all clients have released the database, it will be deleted
-entirely.
-
-
-
-As of Memgraph v3.5 replication queries (such as `REGISTER REPLICA`, `SHOW
-REPLICAS`, `DROP REPLICA`, etc.) target the default "memgraph" database and
-require access to it. The recommendation is to use the default "memgraph"
-database as an admin/system database and store graphs under other databases.
-
-
-
-### Requirements for replication queries
-
-To execute replication queries, users must have:
-1. The `REPLICATION` privilege
-2. **AND** access to the default "memgraph" database
-
-### Impact on multi-tenant environments
-
-In multi-tenant environments where users might not have access to the "memgraph"
-database, replication management operations will fail. This reinforces the
-recommendation to treat the "memgraph" database as an administrative/system
-database.
-
-{Example: Admin user with replication privileges
}
-
-```cypher
--- Create admin role with replication privileges
-CREATE ROLE replication_admin;
-GRANT REPLICATION TO replication_admin;
-GRANT DATABASE memgraph TO replication_admin;
-
--- Create user with replication admin role
-CREATE USER repl_admin IDENTIFIED BY 'admin_password';
-SET ROLE FOR repl_admin TO replication_admin;
-```
-
-In this setup, `repl_admin` can:
-- Execute all replication queries (`REGISTER REPLICA`, `SHOW REPLICAS`, etc.)
-- Access the "memgraph" database for administrative operations
-- Manage the replication cluster configuration
-
-
-## Running multiple instances
-
-When running multiple instances, each on its own machine, run Memgraph as you
-usually would.
-
-If you are exploring replication and running multiple instances on one machine,
-you can run Memgraph with Docker, but if you are using volumes, they need to be
-called differently and each instance needs to be exposed via a different port.
-
-Check the example of creating [a replication
-cluster](#set-up-a-replication-cluster).
-
-## Assigning roles
-
-Each Memgraph instance has the role of the MAIN instance when it is first
-started.
-
-Also, by default, each crashed instance restarts with its previous role (MAIN as
-MAIN, REPLICA as REPLICA). To change this behavior, set the
-`--replication-restore-state-on-startup` to `false` when first initializing the
-instance. In this way, all instances will get restarted as MAIN.
-
-### Assigning the REPLICA role
-
-Once you decide what instance will be the MAIN instance, all the other instances
-that will serve as REPLICA instances need to be demoted and have the port set
-using the following query:
-
-```plaintext
-SET REPLICATION ROLE TO REPLICA WITH PORT ;
-```
-
-If you set the port of each REPLICA instance to `10000`, it will be easier to
-register replicas later on because the query for registering replicas uses a
-port 10000 as the default one.
-
-Otherwise, you can use any unassigned port between 1000 and 10000.
-
-### Assigning the MAIN role
-
-The replication cluster should only have one MAIN instance in order to avoid
-errors in the replication system. If the original MAIN instance fails, you can
-promote a REPLICA instance to be the new MAIN instance by running the following
-query:
-
-```plaintext
-SET REPLICATION ROLE TO MAIN;
-```
-
-If the original instance was still alive when you promoted a new MAIN, you need
-to resolve any conflicts and manage replication manually.
-
-If you demote the new MAIN instance back to the REPLICA role, it will not
-retrieve its original function. You need to [drop
-it](#dropping-a-replica-instance) from the MAIN and register it again.
-
-If the crashed MAIN instance goes back online once a new MAIN is already
-assigned, it cannot reclaim its previous role. It can be cleaned and demoted to
-become a REPLICA instance of the new MAIN instance.
-
-### Checking the assigned role
-
-To check the replication role of an instance, run the following query:
-
-```plaintext
-SHOW REPLICATION ROLE;
-```
-
-## Registering REPLICA instances
-
-Once all the nodes in the cluster are assigned with appropriate roles, you can
-enable replication in the MAIN instance by registering REPLICA instances,
-setting a replication mode (SYNC and ASYNC), and specifying the REPLICA
-instance's socket address. Memgraph doesn't support chaining REPLICA instances,
-that is, a REPLICA instance cannot be replicated on another REPLICA instance.
-
-If you want to register a REPLICA instance with a SYNC replication mode, run the
-following query:
-
-```plaintext
-REGISTER REPLICA name SYNC TO ;
-```
-
-If you want to register a REPLICA instance with an ASYNC replication mode, run
-the following query:
-
-```plaintext
-REGISTER REPLICA name ASYNC TO ;
-```
-
-
-If you want to register a REPLICA instance with an STRICT_SYNC replication mode,
-run the following query:
-
-```plaintext
-REGISTER REPLICA name STRICT_SYNC TO ;
-```
-
-The socket address must be a string value as follows:
-
-```plaintext
-"IP_ADDRESS:PORT_NUMBER"
-```
-
-where `IP_ADDRESS` is a valid IP address, and `PORT_NUMBER` is a valid port
-number, for example:
-
-```plaintext
-"172.17.0.4:10050"
-```
-
-The default value of the `PORT_NUMBER` is `10000`, so if you set REPLICA roles
-using that port, you can define the socket address specifying only the valid IP
-address:
-
-```plaintext
-"IP_ADDRESS"
-```
-
-Example of a `` using only `IP_ADDRESS`:
-
-```plaintext
-"172.17.0.5"
-```
-
-Also, you can register REPLICA instances using DNS names. In that case, the
-socket address must be a string value as follows:
-
-```plaintext
-"DOMAIN_NAME:PORT_NUMBER"
-```
-
-where `DOMAIN_NAME` is a valid domain name, and `PORT_NUMBER` is a valid port
-number, for example:
-
-```plaintext
-"memgraph-replica.memgraph.net:10050"
-```
-
-If you set REPLICA roles using port `10000`, you can define the socket address
-specifying only the valid domain name, for example:
-
-```plaintext
-"memgraph-replica.memgraph.net"
-```
-
-When a REPLICA instance is registered, it will start replication in ASYNC mode
-until it synchronizes to the current state of the database. Upon
-synchronization, REPLICA instances will either continue working in the ASYNC,
-STRICT_SYNC or SYNC mode.
-
-### Listing all registered REPLICA instances
-
-You can check all the registered REPLICA instances and their details by running
-the following query:
-
-```plaintext
-SHOW REPLICAS;
-```
-
-### Dropping a REPLICA instance
-
-To drop a replica, run the following query:
-
-```plaintext
-DROP REPLICA ;
-```
-
-## MAIN and REPLICA synchronization
-
-By comparing timestamps, the MAIN instance knows when a REPLICA instance is not
-synchronized and is missing some earlier transactions. The REPLICA instance is
-then set into a RECOVERY state, where it remains until it is fully synchronized
-with the MAIN instance#synchronizing-instances.
-
-The missing data changes can be sent as snapshots or WAL files. Snapshot files
-represent an image of the current state of the database and are much larger than
-the WAL files, which only contain the changes, deltas. Because of the difference
-in file size, Memgraph favors the WAL files. It is important to note that
-replicas receive only changes which are made durable on the MAIN instance, in
-other words changes which are already fsynced.
-
-While the REPLICA instance is in the RECOVERY state, the MAIN instance
-calculates the optimal synchronization path based on the REPLICA instance's
-timestamp and the current state of the durability files while keeping the
-overall size of the files necessary for synchronization to a minimum.
-
-## Set up a replication cluster
-
-In the replication process, the data is replicated from one storage (MAIN
-instance) to another (REPLICA instances), thus providing a combination of
-consistency, availability and partition tolerance when distributing data over
-several instances.
-
-This example demonstrates how to create a simple cluster of nodes running
-Memgraph instances, and set up replication using various replication modes.
-
-### Cluster topology
-
-The cluster will consist of three nodes, one MAIN instance and two REPLICA
-instances. In order to showcase the creation of REPLICA instances with different
-replication modes, we will create:
-
-- The MAIN instance - contains the original data that will be replicated to
- REPLICA instances
-- REPLICA instance 1 - replication in the SYNC mode
-- REPLICA instance 2 - replication in the ASYNC mode
-
-### Run multiple instances
-
-If you are running multiple instances, each on its own machine, run Memgraph as
-you usually would.
-
-If you are exploring replication and running multiple instances on one machine,
-you need expose different ports for each instance.
-
-The MAIN instance:
-
-```
-docker run -p 7687:7687 memgraph/memgraph-mage --data-recovery-on-startup=true
-```
-
-REPLICA instance 1:
-
-```
-docker run -p 7688:7687 memgraph/memgraph-mage --data-recovery-on-startup=true
-```
-
-REPLICA instance 2:
-
-```
-docker run -p 7689:7687 memgraph/memgraph-mage --data-recovery-on-startup=true
-```
-
-You can connect to each instance using the Memgraph Lab desktop application, or any
-other external application by changing the port:
-
-- the MAIN instance - `localhost:7687`
-- REPLICA instance 1 - `localhost:7688`
-- REPLICA instance 2 - `localhost:7689`
-
-If you need to define volumes, each volume needs to be called differently.
-
-### Demote an instance to a REPLICA role
-
-Run the following query in both REPLICA instances to demote them to the
-REPLICA role:
-
-```
-SET REPLICATION ROLE TO REPLICA WITH PORT 10000;
-```
-
-If you set the port of each REPLICA instance to `10000`, it will be easier to
-register replicas later on because the query for registering replicas uses port
-`10000` as the default one.
-
-Otherwise, you can use any unassigned port between 1000 and 10000.
-
-### Register REPLICA instances
-
-To register a REPLICA instance, you need to find out the IP address of each
-instance.
-
-The IP addresses will probably be:
-
-- the MAIN instance - `172.17.0.2`
-- REPLICA instance 1 - `172.17.0.3`
-- REPLICA instance 2 - `172.17.0.4`
-
-If they are not, please change the IP addresses in the following queries to
-match the [IP addresses on your cluster](/getting-started/install-memgraph/docker#issues-with-the-ip-address).
-
-Then, run the following queries from the MAIN instance to register REPLICA
-instances:
-
-1. REPLICA instance 1 at `172.17.0.3`
-
- ```
- REGISTER REPLICA REP1 SYNC TO "172.17.0.3";
- ```
-
- REPLICA instance 1 is called REP1, its replication mode is SYNC, and it is
- located at IP address `172.17.0.3.` with port `10000`.
-
- Once the MAIN instance commits a transaction, it will communicate the changes
- to all REPLICA instances running in SYNC mode and wait until it receives a response that the changes have been applied to the REPLICAs or that a timeout has been reached.
-
- If you used any port other than `10000` while demoting a REPLICA instance,
- you will need to specify it like this: "172.17.0.3:5000"
-
-2. REPLICA instance 2 at `172.17.0.4`
-
- ```
- REGISTER REPLICA REP2 ASYNC TO "172.17.0.4";
- ```
-
- REPLICA instance 2 is called REP2, its replication mode is ASYNC, and it is
- located at IP address `172.17.0.4.` with port `10000`.
-
- When the REPLICA instance is running in ASYNC mode, the MAIN instance will
- commit a transaction without receiving confirmation from REPLICA instances
- that they have received the same transaction. ASYNC mode ensures system
- availability and partition tolerance.
-
- If you used any port other than `10000` while demoting a REPLICA instance,
- you will need to specify it like this: "172.17.0.4:5000"
-
-### Check info about registered REPLICA instances
-
-Check REPLICA instances by running the following query from the MAIN
-instance:
-
-```
-SHOW REPLICAS;
-```
-
-The result has information regarding each individual replica:
-1. replica's name
-2. address
-3. type (sync/async)
-4. system information
-5. multi-tenant information (for each database, we provide the current timestamp, how many tick is the replica's version behind and the current status)
-
-## Underlying implementation
-
-Uninterrupted data and operational availability in production systems are
-critical and can be achieved in many ways. In Memgraph we opted for replication.
-
-In distributed systems theory the CAP theorem, also named Brewer's theorem,
-states that any distributed system can simultaneously guarantee two out of the
-three properties:
-
-1. **Consistency** (C) - every node has the same view of data at a given point in
- time
-2. **Availability** (A) - all clients can find a replica of the data, even in the
- case of a partial node failure
-3. **Partition tolerance** (P) - the system continues to work as expected despite a
- partial network failure
-
-
-
-Most of the Memgraph use cases do not benefit from well-known algorithms that
-strive to achieve all three CAP properties, such as Raft, because due to their
-complexity, they produce performance issues. Memgraph use-cases are based on
-running analytical graph workloads on real-time data, demanding a simpler
-concept such as **replication**.
-
-Replication consists of replicating data from one storage to one or several
-other storages. The downside of its simplicity is that only two out of three CAP
-properties can be achieved.
-
-### Replication implementation in Memgraph
-
-To enable replication, there must be at least two instances of Memgraph in a
-cluster. Each instance has one of two roles: MAIN or REPLICA. The MAIN instance
-accepts read and write queries to the database and REPLICA instances accept only
-read queries.
-
-The changes or state of the MAIN instance are replicated to the REPLICA
-instances in a SYNC, STRICT_SYNC or ASYNC mode. The STRICT_SYNC mode ensures consistency and
-partition tolerance (CP), but not availability for writes. The ASYNC mode
-ensures system availability and partition tolerance (AP), while data can only be
-eventually consistent. The SYNC mode is something in between because it waits
-for writes to be accepted on replicas but MAIN can still commit even in situations
-when one of REPLICAs is down.
-
-By using the timestamp, the MAIN instance knows the current state of the
-REPLICA. If the REPLICA is not synchronized with the MAIN instance, the MAIN
-instance sends the correct data for synchronization as WAL files.
-
-If the REPLICA is so far behind the MAIN instance that the synchronization using
-WAL files is impossible, Memgraph will use snapshots.
-
-### Replication modes
-
-
+
-From version 2.4 it is no longer possible to specify a timeout when registering
-a SYNC replica. To mimic this behavior in higher releases, please use ASYNC
-replication instead.
+**Users are advised to first read the guide on [how replication works](/clustering/concepts/how-replication-works).**
-Replication mode defines the terms by which the MAIN instance can commit the
-changes to the database, thus modifying the system to prioritize either
-consistency or availability. There are two possible replication modes
-implemented in Memgraph replication:
-
-- SYNC
-- ASYNC
-
-
-
-When a REPLICA instance is registered and added to the cluster, it will start
-replicating in ASYNC mode. That will allow it to catch up to the current state
-of the MAIN instance. When the REPLICA instance synchronizes with the MAIN
-instance, the replication mode will change according to the mode defined during
-registration.
-
-#### SYNC replication mode
-
-SYNC mode is the most straightforward replication mode in which the main storage
-thread waits for the response and cannot continue until the response is
-received or a timeout is reached.
-
-The following diagrams express the behavior of the MAIN instance in cases when
-SYNC REPLICA doesn't answer within the expected timeout.
-
-**SYNC REPLICA going down when creating index, uniqueness constraint or existence constraint**
-
-
-
-**SYNC REPLICA going down when dropping index, uniqueness constraint or existence constraint**
-
-
-
-**SYNC REPLICA going down adding/updating/deleting data**
-
-
-
-
-#### STRICT_SYNC replication mode
-
-The STRICT_SYNC replication mode behaves very similarly to a
-SYNC mode except that MAIN won't commit a transaction locally in a situation in
-which one of STRICT_SYNC replicas is down. To achieve that, all instances run
-together a two-commit protocol which allows you such a synchronization. This
-reduces the throughout but such a mode is super useful in a high-availability
-scenario in which a failover is the most operation to support. Such a mode then
-allows you a failover without the fear of experiencing a data loss.
-
-#### ASYN replication mode
-
-In the ASYNC replication mode, the MAIN instance will commit a transaction
-without receiving confirmation from REPLICA instances that they have received
-the same transaction. This means that the MAIN instance does not wait for the
-response from the REPLICA instances in the main thread but in some other thread.
-
-A new thread can be created every time a transaction needs to be replicated to
-the REPLICA instance, but because transactions are committed often and use a lot
-of resources, each REPLICA instance has one permanent thread connecting it with
-the MAIN instance. Using this background thread, the MAIN instance pushes
-replication tasks to the REPLICA instance, creates a custom thread pool pattern,
-and receives confirmations of successful replication from the REPLICATION
-instance.
-
-
-
-ASYNC mode ensures system availability and partition tolerance.
-
-
-### Synchronizing instances
-
-By comparing timestamps, the MAIN instance knows when a REPLICA instance is not
-synchronized and is missing some earlier transactions. The REPLICA instance is
-then set into a RECOVERY state, where it remains until it is fully synchronized
-with the MAIN instance.
-
-The missing data changes can be sent as snapshots or WAL files. Snapshot files
-represent an image of the current state of the database and are much larger than
-the WAL files, which only contain the changes, deltas. Because of the difference
-in file size, Memgraph favors the WAL files.
-
-While the REPLICA instance is in the RECOVERY state, the MAIN instance
-calculates the optimal synchronization path based on the REPLICA instance's
-timestamp and the current state of the durability files while keeping the
-overall size of the files necessary for synchronization to a minimum.
-
-
-
-Imagine there were 5 changes made to the database. Each change is saved in a WAL
-file, so there are 5 WAL files, and the snapshot was created after 2 changes.
-The REPLICA instance can be synchronized using a snapshot and the 3 latest WAL
-files or using 5 WAL files. Both options would correctly synchronize the
-instances, but 5 WAL files are much smaller.
-
-The durability files are constantly being created, deleted, and updated. Also,
-each replica could need a different set of files to sync. There are several ways
-to ensure that the necessary files persist and that instances can read the WAL
-files currently being updated without affecting the performance of the rest of
-the database.
-
-#### Locking durability files
-
-Durability files are also used for recovery and are periodically deleted to
-eliminate redundant data. The problem is that they can be deleted while they are
-being used to synchronize a REPLICA with the MAIN instance.
-
-To delay the file deletion, Memgraph uses a file retainer that consists of
-multiple lockers. Threads can store and lock the files they found while
-searching for the optimal recovery path in the lockers, thus ensuring the files
-will still exist once they are sent to the REPLICA instance as a part of the
-synchronization process. If another part of the system sends a deletion
-request for a certain file, the file retainer first checks if that file is
-locked in a locker. If it is not, it is deleted immediately. If the file is
-locked, the file retainer adds the file to the deletion queue. The file retainer
-will periodically clean the queue by deleting the files that are no longer
-locked inside the locker.
-
-#### Writing and reading files simultaneously
-
-Memgraph internal file buffer is used when writing deltas to WAL files, and
-mid-writing, the content of one WAL file can be divided across two locations. If
-at that point that WAL file is used to synchronize the REPLICA instance, once
-the data is being read from the internal buffer, the buffer can be flushed, and
-the REPLICA could receive an invalid WAL file because it is missing a chunk of
-data. It could also happen that the WAL file is sent before all the transactions
-are written to the internal buffer.
-
-To avoid these issues, flushing of that internal buffer is disabled while the
-current WAL is sent to a REPLICA instance. To get all the data necessary for the
-synchronization, the replication thread reads the content directly from the WAL
-file, then reads how many bytes are written in the buffer and copies the data to
-another location. Then the flushing is enabled again, and the transaction is
-replicated using the copied buffer. Because the access to the internal buffer
-was not blocked, new data can be written. The content of the buffer (including
-any new data) is then written in a new WAL file that will be sent in the next
-synchronization process.
-
-
-
-#### Fixing timestamp consistency
-
-Timestamps are used to compare the state of the REPLICA instance in comparison
-to the MAIN instance.
-
-At first, we used the current timestamp without increasing its value for global
-operations, like creating an index or creating a constraint. By using a single
-timestamp, it was impossible to know which operations the REPLICA had applied
-because sequential global operations had the same timestamp. To avoid this
-issue, a unique timestamp is assigned to each global operation.
-
-As replicas allow read queries, each of those queries was assigned with its own
-timestamp. Those timestamps caused issues when the replicated write transactions
-were assigned an older timestamp. A read transaction would return different data
-from the same read query if a transaction was replicated between those two read
-transactions which obstructed the snapshot isolation. To avoid this problem, the
-timestamp on REPLICA instances isn't increased because the read transactions
-don't produce any changes, so no deltas need to be timestamped.
+## [Setup replication cluster (Docker)](/clustering/replication/setup-replication-cluster-docker)
+Learn how to set up a Memgraph replication cluster using Docker images.
-#### Incompatible instances
+## [Setup replication cluster (K8s)](/clustering/replication/setup-replication-cluster-k8s)
+Memgraph currently does not support Helm charts for Community edition.
-To avoid issues when the durability files of two different database instances
-are stored in the same folder, a unique ID is assigned to each storage instance.
-The same ID is then assigned to the durability files. Replication uses the
-instance ID to validate that the files and the database are compatible.
+## [Replication best practices](/clustering/replication/best-practices)
+Learn what to watch out for when creating a Memgraph replication cluster.
-A unique ID `epoch_id` is also assigned each time an instance is run as the MAIN
-instance in the replication cluster to check if the data is compatible for
-replication. The `epoch_id` is necessary when the original MAIN instance fails,
-a REPLICA instance becomes a new MAIN, and after some time, the original MAIN
-instance is brought back online. If no transactions were run on the original
-MAIN instance, the difference in timestamps will indicate that it is behind the
-new MAIN, and it would be impossible to set the original MAIN-REPLICA
-relationship. But if the transactions were run on the original MAIN after it was
-brought back online, the timestamp would be of no help, but the `epoch_id` would
-indicate incomparability, thus preventing the original MAIN from reclaiming its
-original role.
+## [Replication commands reference guide](/clustering/replication/replication-commands-reference)
+Queries at your disposal for managing a Memgraph replication cluster.
-
+
\ No newline at end of file
diff --git a/pages/clustering/replication/_meta.ts b/pages/clustering/replication/_meta.ts
new file mode 100644
index 000000000..3c251fb7c
--- /dev/null
+++ b/pages/clustering/replication/_meta.ts
@@ -0,0 +1,7 @@
+export default {
+ "setup-replication-cluster-docker": "Setup replication cluster with Docker",
+ "setup-replication-cluster-k8s": "Setup replication cluster with K8s",
+ "best-practices": "Best practices",
+ "replication-commands-reference": "Reference commands",
+ "system-replication": "System replication"
+}
diff --git a/pages/clustering/replication/best-practices.mdx b/pages/clustering/replication/best-practices.mdx
new file mode 100644
index 000000000..9aa682a1b
--- /dev/null
+++ b/pages/clustering/replication/best-practices.mdx
@@ -0,0 +1,171 @@
+---
+title: Best practices when setting up replication
+description: Various things for database administrators to bear in mind when deploying replication with Memgraph.
+---
+
+import { Callout } from 'nextra/components'
+
+# Best practices when setting up replication
+
+
+
+This guide is for **Memgraph Community** users who want to set up data replication across multiple instances.
+If you have a **Memgraph Enterprise** license, we recommend using the [high availability features](/clustering/high-availability) instead, which provide automatic failover,
+load balancing, and comprehensive cluster management capabilities.
+
+
+
+## Which replication mode should I use?
+If your critical requirement is **performance** and you're okay with being **eventually consistent**, you should
+**use the ASYNC replication mode**, as it replicates using a background thread and doesn't wait
+for responses or timeouts when committing.
+
+If your critical requirement is **no data loss**, **use the STRICT_SYNC replication mode**,
+as it implements a two-phase commit protocol to ensure replication without data loss.
+
+The most commonly used replication mode is SYNC. It ensures decent performance along with consistency.
+There is, however, a minimal probability of data loss.
+
+For **cross data center deployments, it is best to use the ASYNC replication mode**, as the replication
+latency requirements are usually relaxed in that case.
+
+### Combining different replication modes
+You're welcome to combine different replication modes for respective replicas. However, you cannot pair SYNC with
+STRICT_SYNC replicas, because SYNC allows MAIN to commit even when a replica is down,
+while STRICT_SYNC doesn't allow that.
+Possible pairs are:
+- SYNC + ASYNC replicas
+- STRICT_SYNC + ASYNC replicas
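+
+For example, a minimal sketch of a mixed cluster registered from the MAIN
+instance (the replica names and addresses are placeholders):
+
+```cypher
+-- One SYNC replica for consistency, one ASYNC replica for cheap read scaling.
+REGISTER REPLICA REP1 SYNC TO "172.17.0.3:10000";
+REGISTER REPLICA REP2 ASYNC TO "172.17.0.4:10000";
+```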
+
+## Which storage mode to use?
+
+Data replication currently **works only in the in-memory transactional [storage
+mode](/fundamentals/storage-memory-usage)**.
+
+If you're using in-memory analytical storage mode for fast import:
+1. import your data
+2. switch to in-memory transactional storage mode
+3. then and only then set up replication
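+
+A sketch of that sequence on the MAIN instance (the import step is whatever
+import method you normally use):
+
+```cypher
+STORAGE MODE IN_MEMORY_ANALYTICAL;
+-- ... run your fast import here ...
+STORAGE MODE IN_MEMORY_TRANSACTIONAL;
+-- Only now register the replicas.
+REGISTER REPLICA REP1 SYNC TO "172.17.0.3";
+```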
+
+## Hardware requirements
+For replication, ensure all machines (MAIN and REPLICA instances) have exactly the same amount of
+RAM and the same CPU. This uniformity is crucial for consistent performance and reliability.
+
+## Deployment requirements
+When running multiple instances, each on its own machine, run Memgraph as you
+usually would.
+
+If you are exploring replication and running multiple instances on one machine,
+you can run Memgraph with Docker, but if you are using volumes, they need to be
+called differently and each instance needs to be exposed via a different port.
+
+Check the example of creating [a replication
+cluster](/clustering/replication/setup-replication-cluster-docker).
+
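+For example, a sketch of two instances sharing one machine, each with its own
+named volume and host port (image and volume names are placeholders):
+
+```bash
+docker run -p 7687:7687 -v mg_main_data:/var/lib/memgraph memgraph/memgraph-mage
+docker run -p 7688:7687 -v mg_replica1_data:/var/lib/memgraph memgraph/memgraph-mage
+```
+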
+## Which command line flags should I use?
+
+#### Data recovery on startup
+
+**By default, Memgraph sets the data recovery on startup to true:**
+
+```bash
+--data-recovery-on-startup=true
+```
+The flag controls whether Memgraph will recover the persisted data during startup. It's necessary
+to keep this value set to `true` so that instances which have temporarily shut down can recover their data when
+they come back up.
+
+**Advice:** Do nothing since this is enforced by default.
+
+#### Restoring replication state on startup
+Instances need to remember their role and configuration details in a replication
+cluster upon restart, and that is **by default enforced** with the flag:
+
+```bash
+--replication-restore-state-on-startup=true
+```
+The flag should remain `true` throughout the instances' lifetime for replication to work correctly.
+If the flag is set to `false`, MAIN can't communicate with the instance, because each
+REPLICA stores the UUID of the MAIN instance that is allowed to communicate with it, and that UUID is
+set only during instance registration. If the flag was set to `false`, the way to go
+forward is to first unregister the instance on MAIN and then register it again.
+
+**Advice:** Do nothing since this is enforced by default.
+
+#### Storage WAL file flush
+Users are advised to use the same value for configuration flag
+```bash
+--storage-wal-file-flush-every-n-txn
+```
+on MAIN and SYNC REPLICAs. Otherwise, a situation could occur in which there is data which is
+*fsynced* on REPLICA but not on MAIN. In case MAIN crashes, this could lead to conflicts in the system
+that would need to be manually resolved by users.
+
+**Advice:** Do nothing since the value is identical for all instances by default. If you change the value
+of the flag, change it for all the respective instances accordingly.
+
+## Permissions to run replication queries
+
+As of Memgraph v3.5 replication queries (such as `REGISTER REPLICA`, `SHOW
+REPLICAS`, `DROP REPLICA`, etc.) target the default "memgraph" database and
+require access to it. The recommendation is to use the default "memgraph"
+database as an admin/system database and store graphs under other databases.
+
+**In Memgraph community, every user is an admin user and there are no roles or privileges, so
+users will be able to execute any replication query.**
+
+### Requirements for replication queries (Enterprise)
+
+To execute replication queries, users must have:
+1. The `REPLICATION` privilege
+2. **AND** access to the default "memgraph" database
+
+**In Memgraph Enterprise edition, the very first created user is an admin user, which will be able to execute
+any replication query.**
+
+{<h4 className="custom-header">Example: Admin user with replication privileges</h4>}
+
+```cypher
+-- Create admin role with replication privileges
+CREATE ROLE replication_admin;
+GRANT REPLICATION TO replication_admin;
+GRANT DATABASE memgraph TO replication_admin;
+
+-- Create user with replication admin role
+CREATE USER repl_admin IDENTIFIED BY 'admin_password';
+SET ROLE FOR repl_admin TO replication_admin;
+```
+
+In this setup, `repl_admin` can:
+- Execute all replication queries (`REGISTER REPLICA`, `SHOW REPLICAS`, etc.)
+- Access the "memgraph" database for administrative operations
+- Manage the replication cluster configuration
+
+## How to manage replication with Memgraph Community Edition?
+
+### Manual failover
+
+**Leader election / automatic failover** is a part of Memgraph Enterprise Edition. For Memgraph
+Community edition, users need to perform manual failover routines.
+
+The replication cluster should only have one MAIN instance in order to avoid
+errors in the replication system. If the original MAIN instance fails, you can
+promote a REPLICA instance to be the new MAIN instance by running the following
+query:
+
+```plaintext
+SET REPLICATION ROLE TO MAIN;
+```
+
+If the original instance was still alive when you promoted a new MAIN, you need
+to resolve any conflicts and manage replication manually.
+
+If you demote the new MAIN instance back to the REPLICA role, it will not
+retrieve its original function. You need to [drop
+it](/clustering/replication/replication-commands-reference#drop-replica) from the MAIN and
+[register it](/clustering/replication/replication-commands-reference#replica-registration-commands) again.
+
+If the crashed MAIN instance goes back online once a new MAIN is already
+assigned, it cannot reclaim its previous role. It needs to be cleaned and demoted to
+become a REPLICA instance of the new MAIN instance. In the worst case, the instance needs to be
+restarted with fresh, empty storage in order for the REPLICA registration to pass successfully.
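+
+A sketch of a minimal manual failover, assuming the old MAIN is down and REP2
+(at a placeholder address) should become a replica of the new MAIN:
+
+```cypher
+-- On the surviving replica you want to promote:
+SET REPLICATION ROLE TO MAIN;
+
+-- Still on the new MAIN, re-register the remaining replicas;
+-- if registration is rejected, drop them and register them again.
+REGISTER REPLICA REP2 ASYNC TO "172.17.0.4:10000";
+
+-- Once the old MAIN comes back (cleaned), demote it and register it as well.
+```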
diff --git a/pages/clustering/replication/replication-commands-reference.mdx b/pages/clustering/replication/replication-commands-reference.mdx
new file mode 100644
index 000000000..8184d8fc3
--- /dev/null
+++ b/pages/clustering/replication/replication-commands-reference.mdx
@@ -0,0 +1,272 @@
+---
+title: Replication commands reference
+description: Complete reference guide for all replication commands in Memgraph Community Edition, including role management, replica registration, and cluster monitoring.
+---
+
+import { Callout } from 'nextra/components'
+import {CommunityLinks} from '/components/social-card/CommunityLinks'
+
+# Replication commands reference
+
+This page provides a comprehensive reference for all commands available in Memgraph Community
+Edition replication.
+
+
+
+This reference is for **Memgraph Community** users. If you have a **Memgraph Enterprise**
+license, we recommend using the [high availability features](/clustering/high-availability)
+instead, which provide automatic failover, load balancing, and comprehensive cluster management
+capabilities.
+
+
+
+## Role management commands
+
+### SET REPLICATION ROLE TO REPLICA
+
+Demotes an instance to the REPLICA role and sets the replication port.
+
+```cypher
+SET REPLICATION ROLE TO REPLICA WITH PORT <port_number>;
+```
+
+**Parameters:**
+- `port_number` (integer): Port number for replication communication (1000-10000)
+
+**Behavior:**
+- Demotes the current instance from MAIN to REPLICA
+- Sets the replication port for communication with the MAIN instance
+- REPLICA instances can only accept read queries
+- Default port 10000 is recommended for easier registration (used implicitly in other commands)
+
+**Example:**
+```cypher
+SET REPLICATION ROLE TO REPLICA WITH PORT 10000;
+```
+
+**When to use:**
+Use this as a first command on the instances you have chosen to be replicas.
+
+### SET REPLICATION ROLE TO MAIN
+
+Promotes a REPLICA instance to become the MAIN instance.
+
+```cypher
+SET REPLICATION ROLE TO MAIN;
+```
+
+**Behavior:**
+- Promotes the current REPLICA instance to MAIN
+- If the instance is already MAIN, the command will throw an exception
+- Only one MAIN instance should exist in a replication cluster (pay close attention when managing this)
+- If the original MAIN is still alive, conflicts may occur
+- Requires manual conflict resolution if multiple MAINs exist
+
+**When to use:**
+When Memgraph is run for the first time, it always considers itself MAIN, so there's no real need to run this query at the start.
+The only time you would need this query is to perform a manual failover.
+
+### SHOW REPLICATION ROLE
+
+Displays the current replication role of the instance.
+
+```cypher
+SHOW REPLICATION ROLE;
+```
+
+**Returns:**
+- Current replication role: "main" or "replica"
+
+**Example output:**
+```nocopy
++------------------+
+| replication role |
++------------------+
+| "replica" |
++------------------+
+```
+
+**When to use:**
+Since there is no out-of-the-box routing mechanism in Memgraph Community, you can use this command to determine which instance is the MAIN instance.
+It should give you enough information to decide which instance to use for writes and which for reads.
+
+## Replica registration commands
+
+### REGISTER REPLICA (SYNC)
+
+Registers a REPLICA instance with synchronous replication mode.
+
+```cypher
+REGISTER REPLICA name SYNC TO <socket_address>;
+```
+
+**Parameters:**
+- `name` (string): Unique name for the replica
+- `socket_address` (string): Network address in format `"IP_ADDRESS|DNS_NAME:PORT_NUMBER"` or `"IP_ADDRESS|DNS_NAME"` (default port `10000`)
+
+**Behavior:**
+- MAIN will register a REPLICA instance under SYNC replication mode and sync its own data with the REPLICA
+
+**Implications:**
+- MAIN waits for confirmation from SYNC replicas before committing transactions
+- if a SYNC replica is down, MAIN can still commit and move on (unlike STRICT_SYNC),
+ while the REPLICAs are expected to recover and catch up eventually
+- ensures data consistency, but with slower write performance
+- there is a minimal chance for data loss, in which case manual intervention is needed to recover the data
+
+**Example:**
+```cypher
+REGISTER REPLICA REP1 SYNC TO "172.17.0.3:10000";
+```
+
+### REGISTER REPLICA (ASYNC)
+
+Registers a REPLICA instance with asynchronous replication mode.
+
+```cypher
+REGISTER REPLICA name ASYNC TO <socket_address>;
+```
+
+**Parameters:**
+- `name` (string): Unique name for the replica
+- `socket_address` (string): Network address in format "IP_ADDRESS|DNS_NAME:PORT_NUMBER" or "IP_ADDRESS|DNS_NAME" (default port 10000)
+
+**Behavior:**
+- MAIN will register a REPLICA instance under ASYNC replication mode and sync its own data with the REPLICA
+
+**Implications:**
+- MAIN commits transactions without waiting for replica confirmation, as data is replicated using a background thread
+- ensures high availability and partition tolerance
+- data is eventually consistent
+- best performance but potential data loss on failover
+
+**Example:**
+```cypher
+REGISTER REPLICA REP2 ASYNC TO "172.17.0.4";
+```
+
+### REGISTER REPLICA (STRICT_SYNC)
+
+Registers a REPLICA instance with strict synchronous replication mode.
+
+```cypher
+REGISTER REPLICA name STRICT_SYNC TO <socket_address>;
+```
+
+**Parameters:**
+- `name` (string): Unique name for the replica
+- `socket_address` (string): Network address in format "IP_ADDRESS|DNS_NAME:PORT_NUMBER" or "IP_ADDRESS|DNS_NAME" (default port 10000)
+
+**Behavior:**
+- MAIN will register a REPLICA instance under STRICT_SYNC replication mode and sync its own data with the REPLICA
+
+**Implications:**
+- uses two-phase commit protocol (2PC)
+- MAIN cannot commit if STRICT_SYNC replica is down
+- ensures no data loss but reduces throughput because of 2PC
+- best for high-availability scenarios requiring zero data loss
+
+**Example:**
+```cypher
+REGISTER REPLICA REP3 STRICT_SYNC TO "172.17.0.5:10000";
+```
+
+### DROP REPLICA
+
+Removes a REPLICA instance from the replication cluster.
+
+```cypher
+DROP REPLICA <name>;
+```
+
+**Parameters:**
+- `name` (string): Name of the replica to remove
+
+**Behavior:**
+- unregisters the replica from the MAIN instance
+- stops replication to the specified replica
+- replica instance continues running as REPLICA but is no longer part of the cluster
+
+**Example:**
+```cypher
+DROP REPLICA REP1;
+```
+
+**When to use:**
+If all else fails and the REPLICA cannot recover, you can always drop and re-register the REPLICA to start fresh. We advise `DROP REPLICA`
+followed by `REGISTER REPLICA`, after which the recovery process should start and the REPLICA should again catch up with the MAIN.
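+
+For example, using the replica name and address from the examples above:
+
+```cypher
+DROP REPLICA REP1;
+REGISTER REPLICA REP1 SYNC TO "172.17.0.3:10000";
+```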
+
+## Monitoring commands
+
+### SHOW REPLICAS
+
+Lists all registered REPLICA instances and their details.
+
+```cypher
+SHOW REPLICAS;
+```
+
+**Returns:**
+- replica name
+- network address of the REPLICA + port of the replication server
+- replication mode (SYNC/ASYNC/STRICT_SYNC)
+- system information
+- tenant information (lag, status, timestamp)
+
+**Example output:**
+```nocopy
++--------+--------------------+-----------+-------------+-------------------------------------------------+
+| name | socket_address | sync_mode | system_info | data_info |
++--------+--------------------+-----------+-------------+-------------------------------------------------+
+| "REP1" | "172.17.0.3:10000" | "sync" | Null | {memgraph: {behind: 0, status: "ready", ts: 0}} |
+| "REP2" | "172.17.0.4:10000" | "async" | Null | {memgraph: {behind: 0, status: "ready", ts: 0}} |
++--------+--------------------+-----------+-------------+-------------------------------------------------+
+```
+
+## Socket address formats
+
+### IP address format
+
+```cypher
+"IP_ADDRESS|DNS_NAME:PORT_NUMBER"
+```
+
+**Example:**
+```cypher
+"172.17.0.4:10050"
+```
+
+### IP address with default port
+
+```cypher
+"IP_ADDRESS"
+```
+
+**Example:**
+```cypher
+"172.17.0.5"
+```
+
+### DNS name format
+
+```cypher
+"DNS_NAME:PORT_NUMBER"
+```
+
+**Example:**
+```cypher
+"memgraph-replica.memgraph.net:10050"
+```
+
+### DNS name with default port
+
+```cypher
+"DNS_NAME"
+```
+
+**Example:**
+```cypher
+"memgraph-replica.memgraph.net"
+```
+
+
diff --git a/pages/clustering/replication/setup-replication-cluster-docker.mdx b/pages/clustering/replication/setup-replication-cluster-docker.mdx
new file mode 100644
index 000000000..87c83effc
--- /dev/null
+++ b/pages/clustering/replication/setup-replication-cluster-docker.mdx
@@ -0,0 +1,226 @@
+---
+title: Setup replication cluster with Docker
+description: See how one can setup a replication cluster with Docker inside the Memgraph Community Edition.
+---
+
+import { Callout } from 'nextra/components'
+import { Steps } from 'nextra/components'
+
+# Set up a replication cluster with Docker (Community)
+
+
+
+This guide is for **Memgraph Community** users who want to set up data replication across multiple instances.
+If you have a **Memgraph Enterprise** license, we recommend using the [high availability features](/clustering/high-availability) instead, which provide automatic failover,
+load balancing, and comprehensive cluster management capabilities.
+
+
+
+This example demonstrates how to create a simple cluster of nodes running
+Memgraph Community instances, and set up replication using various replication modes.
+
+### Cluster topology
+
+The cluster will consist of three nodes, one MAIN instance and two REPLICA
+instances. In order to showcase the creation of REPLICA instances with different
+replication modes, we will create:
+
+- The MAIN instance - contains the original data that will be replicated to
+ REPLICA instances
+- REPLICA instance 1 - replication in the SYNC mode
+- REPLICA instance 2 - replication in the ASYNC mode
+
+
+
+This example runs all instances on a single local server, which poses a single point of failure if the server goes down. In production,
+it is best to deploy one Memgraph instance per server to ensure robustness.
+
+
+
+
+
+### Run multiple instances
+
+This example sets up replication on a single host, so every Bolt port for receiving queries needs to be
+distinct on the host.
+
+The MAIN instance:
+
+```
+docker run -p 7687:7687 memgraph/memgraph-mage --also-log-to-stderr=true
+```
+
+REPLICA instance 1:
+
+```
+docker run -p 7688:7687 memgraph/memgraph-mage --also-log-to-stderr=true
+```
+
+REPLICA instance 2:
+
+```
+docker run -p 7689:7687 memgraph/memgraph-mage --also-log-to-stderr=true
+```
+
+Memgraph by default [sets all the required flags](/clustering/replication/best-practices#which-command-line-flags-should-i-use)
+to start replication without changing any configuration.
+
+You can connect to each instance using Memgraph Lab, mgconsole, or a database driver, by changing the port:
+
+- MAIN instance - `localhost:7687`
+- REPLICA instance 1 - `localhost:7688`
+- REPLICA instance 2 - `localhost:7689`
+
+If you need to define volumes, each volume needs to be called differently.
+
+### Demote an instance to a REPLICA role
+
+Run the following query in both REPLICA instances to demote them to the
+REPLICA role:
+
+```
+SET REPLICATION ROLE TO REPLICA WITH PORT 10000;
+```
+
+This command does 2 things:
+- the instance is now aware that it is a REPLICA instance, and is no longer allowed to execute write queries
+- a replication server is started at port 10000, which receives data for the REPLICA
+
+Port 10000 is the default in some of the replication commands, so the best practice is to use exactly
+that port when setting up the replication server.
+
+Otherwise, you can use any unassigned port between 1000 and 10000.
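+
+You can verify that the demotion succeeded by checking the instance's role:
+
+```cypher
+SHOW REPLICATION ROLE;
+```
+
+It should return `"replica"` on both REPLICA instances.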
+
+### Register REPLICA instances
+
+To register a REPLICA instance, you need to find out the IP address of each
+instance. The container's IP address can be read by using the following command:
+
+```bash
+docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' <container_name>
+```
+
+The IP addresses will probably be:
+
+- MAIN instance - `172.17.0.2`
+- REPLICA instance 1 - `172.17.0.3`
+- REPLICA instance 2 - `172.17.0.4`
+
+If they are not, please change the IP addresses in the following queries to
+match the [IP addresses on your cluster](/getting-started/install-memgraph/docker#issues-with-the-ip-address).
+
+Then, run the following queries from the MAIN instance to register REPLICA
+instances:
+
+1. REPLICA instance 1 at `172.17.0.3`
+
+ ```
+ REGISTER REPLICA REP1 SYNC TO "172.17.0.3";
+ ```
+
+ The command can be interpreted as follows:
+ - REPLICA instance 1 is called REP1
+ - REP1 is registered using the SYNC replication mode
+ - REP1 is located at the IP address `172.17.0.3`
+ - REP1 has the replication server port `10000` open (the default one)
+
+ Once the MAIN instance commits a transaction, it will communicate the changes
+ to all REPLICA instances running in SYNC mode and wait until it receives a response that the changes have been applied to the REPLICAs or that a timeout has been reached.
+
+ If you used any port other than `10000` while demoting a REPLICA instance,
+ you will need to specify it like this: "172.17.0.3:5000"
+
+2. REPLICA instance 2 at `172.17.0.4`
+
+ ```
+ REGISTER REPLICA REP2 ASYNC TO "172.17.0.4";
+ ```
+
+ REPLICA instance 2 is called REP2, its replication mode is ASYNC, and it is
+ located at IP address `172.17.0.4` with port `10000`.
+
+ When the REPLICA instance is running in ASYNC mode, the MAIN instance will
+ commit a transaction without receiving confirmation from REPLICA instances
+ that they have received the same transaction. ASYNC mode ensures system
+ availability and partition tolerance.
+
+ If you used any port other than `10000` while demoting a REPLICA instance (e.g. 5000),
+ you will need to specify it like this: "172.17.0.4:5000"
+
+### Check info about registered REPLICA instances
+
+Check REPLICA instances by running the following query from the MAIN
+instance:
+
+```
+SHOW REPLICAS;
+```
+
+```nocopy
++--------+--------------------+-----------+-------------+-------------------------------------------------+
+| name | socket_address | sync_mode | system_info | data_info |
++--------+--------------------+-----------+-------------+-------------------------------------------------+
+| "REP1" | "172.17.0.3:10000" | "sync" | Null | {memgraph: {behind: 0, status: "ready", ts: 0}} |
+| "REP2" | "172.17.0.4:10000" | "async" | Null | {memgraph: {behind: 0, status: "ready", ts: 0}} |
++--------+--------------------+-----------+-------------+-------------------------------------------------+
+```
+
+The result has information regarding each individual replica:
+1. replica's name
+2. IP address where the REPLICA is reachable and the port where its replication server is open
+3. replication mode (sync/async/strict_sync)
+4. system information
+5. tenant information (for each database, we provide the current timestamp, how many ticks the replica's version is behind, and the current status)
+
+### Create a node on MAIN
+
+On the MAIN instance, execute a write query:
+
+```cypher
+CREATE (:Node);
+```
+
+### Observe the replicated data
+
+By showing the replicas again, we can see that the timestamp in the data info for the database `memgraph` has changed.
+Replicas are not behind (`behind` is 0), and they're in a `ready` state. This means the data has been
+successfully replicated.
+
+```
+SHOW REPLICAS;
+```
+
+```nocopy
++--------+--------------------+-----------+-------------+-------------------------------------------------+
+| name | socket_address | sync_mode | system_info | data_info |
++--------+--------------------+-----------+-------------+-------------------------------------------------+
+| "REP1" | "172.17.0.3:10000" | "sync" | Null | {memgraph: {behind: 0, status: "ready", ts: 2}} |
+| "REP2" | "172.17.0.4:10000" | "async" | Null | {memgraph: {behind: 0, status: "ready", ts: 2}} |
++--------+--------------------+-----------+-------------+-------------------------------------------------+
+```
+
+If we now log into the REPLICA and execute:
+
+```cypher
+MATCH (n) RETURN n;
+```
+
+We get the replicated data:
+
+
+```nocopy
++---------+
+| n |
++---------+
+| (:Node) |
++---------+
+```
+
+And that's it! You have successfully executed your first query on a replicated Memgraph server!
+
+
+### Next steps
+
+We suggest you check the [supported replication queries in
+Memgraph Community](/clustering/replication/replication-commands-reference),
+so that you can become an expert in managing a replicated Memgraph cluster.
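+
+As a quick preview of what that reference covers, here are a few management queries you
+will encounter there (shown only as an illustration; consult the reference for exact
+semantics and options):
+
+```
+SHOW REPLICATION ROLE;
+SHOW REPLICAS;
+DROP REPLICA REP2;
+```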
\ No newline at end of file
diff --git a/pages/clustering/replication/setup-replication-cluster-k8s.mdx b/pages/clustering/replication/setup-replication-cluster-k8s.mdx
new file mode 100644
index 000000000..c71e1b136
--- /dev/null
+++ b/pages/clustering/replication/setup-replication-cluster-k8s.mdx
@@ -0,0 +1,28 @@
+---
+title: Set up a replication cluster with K8s
+description: See how to set up a replication cluster with Kubernetes using Memgraph Community Edition.
+---
+
+import { Callout } from 'nextra/components'
+
+# Set up a replication cluster with Kubernetes (Community)
+
+<Callout type="info">
+
+This guide is for **Memgraph Community** users who want to set up data replication across multiple instances.
+If you have a **Memgraph Enterprise** license, we recommend using the [high availability features](/clustering/high-availability) instead, which provide automatic failover,
+load balancing, and comprehensive cluster management capabilities.
+
+</Callout>
+
+<Callout type="info">
+
+Memgraph does not offer Helm charts for replication in the Community Edition.
+Helm charts are currently available only for the [high availability
+Enterprise edition](https://github.com/memgraph/helm-charts/tree/main/charts/memgraph-high-availability).
+If you want to set up replication with Memgraph Community using Helm charts, you are encouraged to create
+the setup yourself by combining the [Docker replication guide](/clustering/replication/setup-replication-cluster-docker)
+with the [Memgraph standalone Helm chart](https://github.com/memgraph/helm-charts/tree/main/charts/memgraph).
+
+</Callout>
diff --git a/pages/clustering/replication/system-replication.mdx b/pages/clustering/replication/system-replication.mdx
index d96fd47e8..862d2140b 100644
--- a/pages/clustering/replication/system-replication.mdx
+++ b/pages/clustering/replication/system-replication.mdx
@@ -16,24 +16,33 @@ When using system replication features, the user can now have a more
complete duplicate of MAIN. MAIN will automatically replicate any database
creation/drop and any changes to the auth setup.
-## Replication implementation basics
+## Audit logs
-Please refer to [Replication](/clustering/replication) for the basics of replication.
+Audit logs are currently not replicated inside Memgraph.
-## Auth data replication
+## Auth data replication (Enterprise)
-All authentication/authorization data are now replicated. This includes users,
-roles, and all of the permissions associated with them.
+If you are using a Memgraph Enterprise license, all authentication/authorization
+data, including users, roles, and associated permissions, will be replicated.
-Auth modules are not replicated at this time and must be set up by the admin.
+**Memgraph Community does not replicate users and roles.** It only ensures data replication
+(nodes, relationships, indices, constraints, and other graph constructs).
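+
+As an illustration, with an Enterprise license the following auth queries executed on
+MAIN would be propagated to all REPLICA instances (the user and role names are
+placeholders):
+
+```
+CREATE USER alice IDENTIFIED BY 'strong_password';
+CREATE ROLE analyst;
+GRANT MATCH TO analyst;
+SET ROLE FOR alice TO analyst;
+```
+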
-## Multi-tenant data replication
+## Auth modules replication
-Multi-tenant commands are now replicated as any other data command.
-Database manipulation is allowed only on MAIN. However, REPLICAs have the
-ability to use databases and read data contained in them.
+Authentication modules are not replicated and must be configured manually by the
+administrator.
+
+## Multi-tenant data replication (Enterprise)
+
+When you are using a Memgraph Enterprise license, multi-tenant commands are
+replicated like any other data command.
+
+Database manipulation is allowed only on MAIN.
+REPLICA instances can use databases and read the data
+contained in them.
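+
+For example, under an Enterprise license the multi-tenant flow could look like this
+(the database name is illustrative):
+
+```
+// On MAIN: the database creation and its data are replicated
+CREATE DATABASE tenant1;
+USE DATABASE tenant1;
+CREATE (:Node);
+
+// On a REPLICA: the database can be used for reads
+USE DATABASE tenant1;
+MATCH (n) RETURN n;
+```
+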
When dropping a database used on a REPLICA, the REPLICA will receive the command
-and will partially drop the database. It will hide it and not allow any new
-usage of the database but will wait for every client to release the database
-before deleting it entirely.
+and will partially drop the database. It will hide the database and prevent any
+new usage. Once all clients have released the database, it will be deleted
+entirely.
\ No newline at end of file
diff --git a/pages/deployment/environments/aws.mdx b/pages/deployment/environments/aws.mdx
index e52ed9e29..d4d931491 100644
--- a/pages/deployment/environments/aws.mdx
+++ b/pages/deployment/environments/aws.mdx
@@ -192,6 +192,21 @@ Depending on what you are using, you can follow the [Linux](./linux.mdx) or
[Docker](./docker.mdx) deployment guide for more information on how to manage
the Memgraph deployment.
+## Deploying Memgraph with High Availability
+
+For production environments requiring high availability, we recommend running Memgraph on Kubernetes, as that is currently the most widely used method of HA deployment.
+Alternative deployment methods include running your HA cluster with Memgraph built from source or with plain Docker images.
+
+### High Availability on AWS EKS
+
+The recommended approach for deploying Memgraph with high availability on AWS is to use Amazon Elastic Kubernetes Service (EKS) with our Helm charts.
+
+For detailed instructions on deploying Memgraph high availability on AWS EKS, follow our
+[Helm charts guide for AWS](https://github.com/memgraph/helm-charts/tree/main/charts/memgraph-high-availability/aws).
+
+To understand how Memgraph high availability works, including cluster setup, replication modes, and failover procedures, refer to our comprehensive
+[high availability documentation](/clustering/high-availability).
+
## Where to next?
diff --git a/public/pages/clustering/replication/memgraph-replication-async-sync.png b/public/pages/clustering/replication/memgraph-replication-async-sync.png
deleted file mode 100644
index cc524e7ce..000000000
Binary files a/public/pages/clustering/replication/memgraph-replication-async-sync.png and /dev/null differ
diff --git a/public/pages/clustering/replication/multi-tenant-replication.png b/public/pages/clustering/replication/multi-tenant-replication.png
new file mode 100644
index 000000000..f043c4c81
Binary files /dev/null and b/public/pages/clustering/replication/multi-tenant-replication.png differ
diff --git a/public/pages/clustering/replication/replication-modes.png b/public/pages/clustering/replication/replication-modes.png
new file mode 100644
index 000000000..b7c5d2dcf
Binary files /dev/null and b/public/pages/clustering/replication/replication-modes.png differ
diff --git a/public/pages/clustering/replication/replication-state-diagram.png b/public/pages/clustering/replication/replication-state-diagram.png
new file mode 100644
index 000000000..d645dbeee
Binary files /dev/null and b/public/pages/clustering/replication/replication-state-diagram.png differ
diff --git a/public/pages/clustering/replication/workflow_diagram_data_definition_creation.drawio.png b/public/pages/clustering/replication/workflow_diagram_data_definition_creation.drawio.png
deleted file mode 100644
index 13a40da2e..000000000
Binary files a/public/pages/clustering/replication/workflow_diagram_data_definition_creation.drawio.png and /dev/null differ
diff --git a/public/pages/clustering/replication/workflow_diagram_data_definition_dropping.drawio.png b/public/pages/clustering/replication/workflow_diagram_data_definition_dropping.drawio.png
deleted file mode 100644
index 4b3200099..000000000
Binary files a/public/pages/clustering/replication/workflow_diagram_data_definition_dropping.drawio.png and /dev/null differ
diff --git a/public/pages/clustering/replication/workflow_diagram_data_manipulation.drawio.png b/public/pages/clustering/replication/workflow_diagram_data_manipulation.drawio.png
deleted file mode 100644
index a7c1de551..000000000
Binary files a/public/pages/clustering/replication/workflow_diagram_data_manipulation.drawio.png and /dev/null differ