7 changes: 7 additions & 0 deletions docs/assets/javascripts/extra.js
@@ -155,4 +155,11 @@ window.addEventListener("DOMContentLoaded", function() {
}
});
})

document.querySelectorAll('a[href^="http"]').forEach(link => {
if (!link.href.includes(location.hostname)) {
link.setAttribute('target', '_blank');
link.setAttribute('rel', 'noopener noreferrer');
}
});
})()
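
For context, the check added above is a plain substring match against `location.hostname`. Below is an illustrative sketch (not part of the change) of how it classifies links, assuming the docs are served from dstack.ai:

```javascript
// Illustrative sketch of the hostname check used in the snippet above
// (assumption: the page is served from https://dstack.ai, so
// location.hostname is "dstack.ai"). Only URLs that do not contain the
// hostname are treated as external.
const isExternal = (href, hostname = "dstack.ai") => !href.includes(hostname);

console.log(isExternal("https://dstack.ai/docs/concepts/fleets/")); // false: left untouched
console.log(isExternal("https://discord.gg/u8SmfwPpMd"));           // true: gets target="_blank" and rel="noopener noreferrer"
```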
40 changes: 37 additions & 3 deletions docs/assets/stylesheets/extra.css
@@ -1350,7 +1350,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) {
visibility: visible;
}*/

.twemoji.external {
/* .twemoji.external {
position: relative;
top: 2.5px;
height: 18.5px;
@@ -1364,7 +1364,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) {
position: relative;
top: 1.5px;
margin-right: -7px;
}
} */

/*.md-tabs__item:nth-child(6) .md-tabs__link:before {
position: relative;
@@ -1585,7 +1585,7 @@ html .md-footer-meta.md-typeset a:is(:focus,:hover) {

.md-typeset.md-banner__inner a {
color: var(--md-default-bg-color);
border-bottom: 1.5px dotted;
/* border-bottom: 1.5px dotted; */
font-weight: 600;
}

@@ -1801,3 +1801,37 @@ img.border {
font-size: 12px !important;;
padding: 30px !important;
}

/* External link indicator */
a[href^="http"]:not(:where(
/* skip if marked with external-skip */
.external-skip,
/* exclude http:// dstack links */
[href^="http://dstack.ai"],
/* exclude https://dstack.ai links */
[href^="https://dstack.ai"],
/* exclude md-content__button links */
.md-content__button,
)):after {
content: '';
display: inline-block;
width: 18.5px;
height: 18.5px;
margin-left: 0.15em;
vertical-align: -0.2em;
background-color: currentColor;
mask-image: url('data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="currentColor"><path d="m11.93 5 2.83 2.83L5 17.59 6.42 19l9.76-9.75L19 12.07V5z"></path></svg>');
mask-size: 100%;
mask-repeat: no-repeat;
mask-position: center;
-webkit-mask-image: url('data:image/svg+xml,<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="currentColor"><path d="m11.93 5 2.83 2.83L5 17.59 6.42 19l9.76-9.75L19 12.07V5z"></path></svg>');
-webkit-mask-size: 100%;
-webkit-mask-repeat: no-repeat;
-webkit-mask-position: center;
text-decoration: none;
}

/* Exclude links inside .md-social */
.md-social a[href^="http"]:after {
display: none;
}
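
To see which links the new rule will decorate on a rendered docs page, the same selector can be reused from the browser console. A rough sketch, assuming a Material for MkDocs page where `.md-social` icons should stay untouched:

```javascript
// Rough console sketch (assumption: run on a rendered docs page).
// Reuses the exclusion list from the CSS rule above and additionally
// filters out links inside .md-social, mirroring the second rule.
const selector =
  'a[href^="http"]:not(:where(.external-skip, [href^="http://dstack.ai"], [href^="https://dstack.ai"], .md-content__button))';

const decorated = [...document.querySelectorAll(selector)]
  .filter(link => !link.closest(".md-social"));

console.log(decorated.map(link => link.href));
```

Links that should keep the plain style can be given the `external-skip` class referenced in the rule's comments.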
8 changes: 4 additions & 4 deletions docs/assets/stylesheets/landing.css
@@ -327,7 +327,7 @@
margin-right: -7px;
}

.md-button-secondary.external:after {
/* .md-button-secondary.external:after {
content: url('data:image/svg+xml,<svg fill="rgba(0, 0, 0, 0.87)" xmlns="http://www.w3.org/2000/svg" width="20px" height="20px" viewBox="0 0 16 16"><polygon points="5 4.31 5 5.69 9.33 5.69 2.51 12.51 3.49 13.49 10.31 6.67 10.31 11 11.69 11 11.69 4.31 5 4.31" data-v-e1bdab2c=""></polygon></svg>');
line-height: 14px;
margin-left: 5px;
@@ -343,7 +343,7 @@
position: relative;
top: 2.5px;
margin-right: -7px;
}
} */

.md-header__buttons .md-button-secondary,
.md-typeset .md-button-secondary,
@@ -702,13 +702,13 @@
line-height: 32px;
}

.tx-landing__highlights_grid h3.external:after {
/* .tx-landing__highlights_grid h3.external:after {
content: url('data:image/svg+xml,<svg fill="black" xmlns="http://www.w3.org/2000/svg" width="22px" height="22px" viewBox="0 0 16 16"><polygon points="5 4.31 5 5.69 9.33 5.69 2.51 12.51 3.49 13.49 10.31 6.67 10.31 11 11.69 11 11.69 4.31 5 4.31" data-v-e1bdab2c=""></polygon></svg>');
margin-left: 2px;
position: relative;
top: 3px;
margin-right: -7px;
}
} */

.tx-landing__highlights_grid p {
font-size: 16px;
4 changes: 2 additions & 2 deletions docs/blog/archive/ambassador-program.md
@@ -58,8 +58,8 @@ yourself and your experience. We’ll reach out with a starter kit and next step
Get involved
</a>

Have questions? Reach out via [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"}!
Have questions? Reach out via [Discord](https://discord.gg/u8SmfwPpMd)!

> 💜 In the meantime, we’re thrilled to
> welcome [Park Chansung :material-arrow-top-right-thin:{ .external }](https://x.com/algo_diver){:target="_blank"}, the
> welcome [Park Chansung](https://x.com/algo_diver), the
> first `dstack` ambassador.
10 changes: 5 additions & 5 deletions docs/blog/archive/efa.md
@@ -10,7 +10,7 @@ categories:

# Efficient distributed training with AWS EFA

[Amazon Elastic Fabric Adapter (EFA) :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/hpc/efa/){:target="_blank"} is a high-performance network interface designed for AWS EC2 instances, enabling
[Amazon Elastic Fabric Adapter (EFA)](https://aws.amazon.com/hpc/efa/) is a high-performance network interface designed for AWS EC2 instances, enabling
ultra-low latency and high-throughput communication between nodes. This makes it an ideal solution for scaling
distributed training workloads across multiple GPUs and instances.

@@ -39,7 +39,7 @@ network interfaces, you’ll need to disable public IPs. Note, the `dstack`
server in this case should have access to the private subnet of the VPC.

You’ll also need to specify an AMI that includes the GDRCopy drivers. For example, you can use the
[AWS Deep Learning Base GPU AMI :material-arrow-top-right-thin:{ .external }](https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-22-04/){:target="_blank"}.
[AWS Deep Learning Base GPU AMI](https://aws.amazon.com/releasenotes/aws-deep-learning-base-gpu-ami-ubuntu-22-04/).

Here’s an example backend configuration:

@@ -164,10 +164,10 @@ $ dstack apply -f examples/misc/efa/task.dstack.yml -R
EFA.

> Have questions? You're welcome to join
> our [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"} or talk
> directly to [our team :material-arrow-top-right-thin:{ .external }](https://calendly.com/dstackai/discovery-call){:target="_blank"}.
> our [Discord](https://discord.gg/u8SmfwPpMd) or talk
> directly to [our team](https://calendly.com/dstackai/discovery-call).

!!! info "What's next?"
1. Check [fleets](../../docs/concepts/fleets.md), [tasks](../../docs/concepts/tasks.md), and [volumes](../../docs/concepts/volumes.md)
2. Also see [dev environments](../../docs/concepts/dev-environments.md) and [services](../../docs/concepts/services.md)
3. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"}
3. Join [Discord](https://discord.gg/u8SmfwPpMd)
12 changes: 6 additions & 6 deletions docs/blog/posts/amd-mi300x-inference-benchmark.md
@@ -12,7 +12,7 @@ categories:

At `dstack`, we've been adding support for AMD GPUs with [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets),
so we saw this as a great chance to test our integration by benchmarking AMD GPUs. Our friends at
[Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"}, who build top-tier
[Hot Aisle](https://hotaisle.xyz/), who build top-tier
bare metal compute for AMD GPUs, kindly provided the hardware for the benchmark.

<img src="https://dstack.ai/static-assets/static-assets/images/dstack-hotaisle-amd-mi300x-prompt-v5.png" width="750" />
@@ -106,7 +106,7 @@ Here is the spec of the bare metal machine we got:
??? info "TGI"
The `ghcr.io/huggingface/text-generation-inference:sha-11d7af7-rocm` Docker image was used.

For conducting the tests, we've been using the [`benchmark_serving` :material-arrow-top-right-thin:{ .external }](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py){:target="_blank"} provided by vLLM.
For conducting the tests, we've been using the [`benchmark_serving`](https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py) provided by vLLM.

## Observations

@@ -175,7 +175,7 @@ to vLLM.

<img src="https://raw.githubusercontent.com/dstackai/benchmarks/refs/heads/main/amd/inference/gpu_vram_tgi_vllm.png" width="750" />

This difference may be related to how vLLM [pre-allocates GPU cache :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/latest/models/performance.html){:target="_blank"}.
This difference may be related to how vLLM [pre-allocates GPU cache](https://docs.vllm.ai/en/latest/models/performance.html).

## Conclusion

@@ -203,22 +203,22 @@ like the H100 and H200, as well as possibly Google TPU.
### Source code

The source code used for this benchmark can be found in our
[GitHub repo :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/tree/main/amd/inference){:target="_blank"}.
[GitHub repo](https://github.com/dstackai/benchmarks/tree/main/amd/inference).

If you have questions, feedback, or want to help improve the benchmark, please reach out to our team.

## Thanks to our friends

### Hot Aisle

[Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"}
[Hot Aisle](https://hotaisle.xyz/)
is the primary sponsor of this benchmark, and we are sincerely grateful for their hardware and support.

If you'd like to use top-tier bare metal compute with AMD GPUs, we recommend going
with Hot Aisle. Once you gain access to a cluster, it can be easily accessed via `dstack`'s [SSH fleet](../../docs/concepts/fleets.md#ssh-fleets).

### RunPod
If you’d like to use on-demand compute with AMD GPUs at affordable prices, you can configure `dstack` to
use [RunPod :material-arrow-top-right-thin:{ .external }](https://runpod.io/){:target="_blank"}. In
use [RunPod](https://runpod.io/). In
this case, `dstack` will be able to provision fleets automatically when you run dev environments, tasks, and
services.
12 changes: 6 additions & 6 deletions docs/blog/posts/amd-on-runpod.md
@@ -33,14 +33,14 @@ One of the main advantages of the `MI300X` is its VRAM. For example, with the `H
version of Llama 3.1 405B into a single node with 8 GPUs—you'd have to use FP8 instead. However, with the `MI300X`, you
can fit FP16 into a single node with 8 GPUs, and for FP8, you'd only need 4 GPUs.

With the [latest update :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/releases/0.18.11rc1){:target="_blank"},
With the [latest update](https://github.com/dstackai/dstack/releases/0.18.11rc1),
you can now specify an AMD GPU under `resources`. Below are a few examples.

## Configuration

=== "Service"
Here's an example of a [service](../../docs/concepts/services.md) that deploys
Llama 3.1 70B in FP16 using [TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"}.
Llama 3.1 70B in FP16 using [TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd).

<div editor-title="examples/inference/tgi/amd/service.dstack.yml">

@@ -72,7 +72,7 @@ you can now specify an AMD GPU under `resources`. Below are a few examples.

=== "Dev environment"
Here's an example of a [dev environment](../../docs/concepts/dev-environments.md) using
[TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"}'s
[TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd)'s
Docker image:

```yaml
@@ -111,11 +111,11 @@ cloud resources and run the configuration.
## What's next?

1. The examples above demonstrate the use of
[TGI :material-arrow-top-right-thin:{ .external }](https://huggingface.co/docs/text-generation-inference/en/installation_amd){:target="_blank"}.
[TGI](https://huggingface.co/docs/text-generation-inference/en/installation_amd).
AMD accelerators can also be used with other frameworks like vLLM, Ollama, etc., and we'll be adding more examples soon.
2. RunPod is the first cloud provider where dstack supports AMD. More cloud providers will be supported soon as well.
3. Want to give RunPod and `dstack` a try? Make sure you've signed up for [RunPod :material-arrow-top-right-thin:{ .external }](https://www.runpod.io/){:target="_blank"},
3. Want to give RunPod and `dstack` a try? Make sure you've signed up for [RunPod](https://www.runpod.io/),
then [set up](../../docs/reference/server/config.yml.md#runpod) the `dstack server`.

> Have questions or feedback? Join our [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd){:target="_blank"}
> Have questions or feedback? Join our [Discord](https://discord.gg/u8SmfwPpMd)
server.
4 changes: 2 additions & 2 deletions docs/blog/posts/amd-on-tensorwave.md
@@ -14,7 +14,7 @@ Since last month, when we introduced support for private clouds and data centers
to orchestrate AI containers with any AI cloud vendor, whether they provide on-demand compute or reserved clusters.

In this tutorial, we’ll walk you through how `dstack` can be used with
[TensorWave :material-arrow-top-right-thin:{ .external }](https://tensorwave.com/){:target="_blank"} using
[TensorWave](https://tensorwave.com/) using
[SSH fleets](../../docs/concepts/fleets.md#ssh-fleets).

<img src="https://dstack.ai/static-assets/static-assets/images/dstack-tensorwave-v2.png" width="630"/>
@@ -237,4 +237,4 @@ Want to see how it works? Check out the video below:
!!! info "What's next?"
1. See [SSH fleets](../../docs/concepts/fleets.md#ssh-fleets)
2. Read about [dev environments](../../docs/concepts/dev-environments.md), [tasks](../../docs/concepts/tasks.md), and [services](../../docs/concepts/services.md)
3. Join [Discord :material-arrow-top-right-thin:{ .external }](https://discord.gg/u8SmfwPpMd)
3. Join [Discord](https://discord.gg/u8SmfwPpMd)
20 changes: 10 additions & 10 deletions docs/blog/posts/benchmark-amd-containers-and-partitions.md
@@ -16,7 +16,7 @@ Our new benchmark explores two important areas for optimizing AI workloads on AM

<!-- more -->

This benchmark was supported by [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"},
This benchmark was supported by [Hot Aisle](https://hotaisle.xyz/),
a provider of AMD GPU bare-metal and VM infrastructure.

## Benchmark 1: Bare-metal vs containers
@@ -56,11 +56,11 @@ Our experiments consistently demonstrate that running multi-node AI workloads in

## Benchmark 2: Partition performance isolated vs mesh

The AMD GPU can be [partitioned :material-arrow-top-right-thin:{ .external }](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/gpu-partitioning/mi300x/overview.html){:target="_blank"} into smaller, independent units (e.g., NPS4 mode splits one GPU into four partitions). This promises better memory bandwidth utilization. Does this theoretical gain translate to better performance in practice?
The AMD GPU can be [partitioned](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/gpu-partitioning/mi300x/overview.html) into smaller, independent units (e.g., NPS4 mode splits one GPU into four partitions). This promises better memory bandwidth utilization. Does this theoretical gain translate to better performance in practice?

### Finding 1: Higher performance for isolated partitions

First, we sought to reproduce and extend findings from the [official ROCm blog :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html){:target="_blank"}. We benchmarked the memory bandwidth of a single partition (in CPX/NPS4 mode) against a full, unpartitioned GPU (in SPX/NPS1 mode).
First, we sought to reproduce and extend findings from the [official ROCm blog](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html). We benchmarked the memory bandwidth of a single partition (in CPX/NPS4 mode) against a full, unpartitioned GPU (in SPX/NPS1 mode).

<img src="https://dstack.ai/static-assets/static-assets/images/benchmark-amd-containers-and-partitions-chart4a.png" width="750"/>

@@ -100,7 +100,7 @@ GPU partitioning is only practical if used dynamically—for instance, to run mu
#### Limitations

1. **Reproducibility**: AMD’s original blog post on partitioning lacked detailed setup information, so we had to reconstruct the benchmarks independently.
2. **Network tuning**: These benchmarks were run on a default, out-of-the-box network configuration. Our results for RCCL (~339 GB/s) and RDMA (~726 Gbps) are slightly below the peak figures [reported by Dell :material-arrow-top-right-thin:{ .external }](https://infohub.delltechnologies.com/en-us/l/generative-ai-in-the-enterprise-with-amd-accelerators/rccl-and-perftest-for-cluster-validation-1/4/){:target="_blank"}. This suggests that further performance could be unlocked with expert tuning of network topology, MTU size, and NCCL environment variables.
2. **Network tuning**: These benchmarks were run on a default, out-of-the-box network configuration. Our results for RCCL (~339 GB/s) and RDMA (~726 Gbps) are slightly below the peak figures [reported by Dell](https://infohub.delltechnologies.com/en-us/l/generative-ai-in-the-enterprise-with-amd-accelerators/rccl-and-perftest-for-cluster-validation-1/4/). This suggests that further performance could be unlocked with expert tuning of network topology, MTU size, and NCCL environment variables.

## Benchmark setup

@@ -352,7 +352,7 @@ The `SIZE` value is `1M`, `2M`, .., `8G`.

**vLLM data parallel**

1. Build nginx container (see [vLLM-nginx :material-arrow-top-right-thin:{ .external }](https://docs.vllm.ai/en/stable/deployment/nginx.html#build-nginx-container){:target="_blank"}).
1. Build nginx container (see [vLLM-nginx](https://docs.vllm.ai/en/stable/deployment/nginx.html#build-nginx-container)).

2. Create `nginx.conf`

@@ -471,13 +471,13 @@ HIP_VISIBLE_DEVICES=0 python3 toy_inference_benchmark.py \

## Source code

All source code and findings are available in [our GitHub repo :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/benchmarks/tree/main/amd/baremetal_container_partition){:target="_blank"}.
All source code and findings are available in [our GitHub repo](https://github.com/dstackai/benchmarks/tree/main/amd/baremetal_container_partition).

## References

* [AMD Instinct MI300X GPU partitioning overview :material-arrow-top-right-thin:{ .external }](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/gpu-partitioning/mi300x/overview.html){:target="_blank"}
* [Deep dive into partition modes by AMD :material-arrow-top-right-thin:{ .external }](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html){:target="_blank"}.
* [RCCL and PerfTest for cluster validation by Dell :material-arrow-top-right-thin:{ .external }](https://infohub.delltechnologies.com/en-us/l/generative-ai-in-the-enterprise-with-amd-accelerators/rccl-and-perftest-for-cluster-validation-1/4/){:target="_blank"}.
* [AMD Instinct MI300X GPU partitioning overview](https://instinct.docs.amd.com/projects/amdgpu-docs/en/latest/gpu-partitioning/mi300x/overview.html)
* [Deep dive into partition modes by AMD](https://rocm.blogs.amd.com/software-tools-optimization/compute-memory-modes/README.html).
* [RCCL and PerfTest for cluster validation by Dell](https://infohub.delltechnologies.com/en-us/l/generative-ai-in-the-enterprise-with-amd-accelerators/rccl-and-perftest-for-cluster-validation-1/4/).

## What's next?

@@ -487,5 +487,5 @@

#### Hot Aisle

Big thanks to [Hot Aisle :material-arrow-top-right-thin:{ .external }](https://hotaisle.xyz/){:target="_blank"} for providing the compute power behind these benchmarks.
Big thanks to [Hot Aisle](https://hotaisle.xyz/) for providing the compute power behind these benchmarks.
If you’re looking for fast AMD GPU bare-metal or VM instances, they’re definitely worth checking out.