From 8f8f64a5a77f9fb84784250357450a4180e30fa9 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Sat, 6 Aug 2022 13:22:51 +0100
Subject: [PATCH 01/51] add minimal pair_mace (only the required pure virtual
 functions).

---
 src/ML-MACE/pair_mace.cpp | 51 +++++++++++++++++++++++++++++++++++++++
 src/ML-MACE/pair_mace.h   | 48 ++++++++++++++++++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 src/ML-MACE/pair_mace.cpp
 create mode 100644 src/ML-MACE/pair_mace.h

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
new file mode 100644
index 00000000000..a494d6dff28
--- /dev/null
+++ b/src/ML-MACE/pair_mace.cpp
@@ -0,0 +1,51 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributors
+      William C Witt (University of Cambridge)
+------------------------------------------------------------------------- */
+
+#include "pair_mace.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairMACE::PairMACE(LAMMPS *lmp) : Pair(lmp)
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairMACE::~PairMACE()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMACE::compute(int eflag, int vflag)
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMACE::settings(int narg, char **arg)
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMACE::coeff(int narg, char **arg)
+{
+}
diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
new file mode 100644
index 00000000000..a0759ca7621
--- /dev/null
+++ b/src/ML-MACE/pair_mace.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributors
+      William C Witt (University of Cambridge)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(mace,PairMACE);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_MACE_H
+#define LMP_PAIR_MACE_H
+
+#include "pair.h"
+
+namespace LAMMPS_NS {
+
+class PairMACE : public Pair {
+
+ public:
+
+  PairMACE(class LAMMPS *);
+  ~PairMACE() override;
+  void compute(int, int) override;
+  void settings(int, char **) override;
+  void coeff(int, char **) override;
+
+ protected:
+
+};
+}    // namespace LAMMPS_NS
+
+#endif
+#endif

From 5875569738767268eebed13194c80bb35e0a0ec2 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Sat, 6 Aug 2022 13:25:33 +0100
Subject: [PATCH 02/51] enable cmake build with -DPKG_ML-MACE=ON.

---
 cmake/CMakeLists.txt                 | 3 ++-
 cmake/Modules/Packages/ML-MACE.cmake | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)
 create mode 100644 cmake/Modules/Packages/ML-MACE.cmake

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index d4dcdcff88a..37b156b4403 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -255,6 +255,7 @@ set(STANDARD_PACKAGES
   MISC
   ML-HDNNP
   ML-IAP
+  ML-MACE
   ML-PACE
   ML-QUIP
   ML-RANN
@@ -499,7 +500,7 @@ else()
 endif()
 
 foreach(PKG_WITH_INCL KSPACE PYTHON ML-IAP VORONOI COLVARS ML-HDNNP MDI MOLFILE NETCDF
-        PLUMED QMMM ML-QUIP SCAFACOS MACHDYN VTK KIM LATTE MSCG COMPRESS ML-PACE)
+        PLUMED QMMM ML-QUIP SCAFACOS MACHDYN VTK KIM LATTE MSCG COMPRESS ML-PACE ML-MACE)
   if(PKG_${PKG_WITH_INCL})
     include(Packages/${PKG_WITH_INCL})
   endif()
diff --git a/cmake/Modules/Packages/ML-MACE.cmake b/cmake/Modules/Packages/ML-MACE.cmake
new file mode 100644
index 00000000000..c2d5253ac5b
--- /dev/null
+++ b/cmake/Modules/Packages/ML-MACE.cmake
@@ -0,0 +1 @@
+message("Hello from ML-MACE.cmake.")

From 4607e225df5456fc2686407c8dd5140816ac4c2f Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Sat, 20 Aug 2022 17:57:01 +0100
Subject: [PATCH 03/51] add basic compute functionality to pair_mace.

---
 cmake/Modules/Packages/ML-MACE.cmake |   9 ++
 src/ML-MACE/pair_mace.cpp            | 157 +++++++++++++++++++++++++++
 src/ML-MACE/pair_mace.h              |   7 ++
 3 files changed, 173 insertions(+)

diff --git a/cmake/Modules/Packages/ML-MACE.cmake b/cmake/Modules/Packages/ML-MACE.cmake
index c2d5253ac5b..984298c75fd 100644
--- a/cmake/Modules/Packages/ML-MACE.cmake
+++ b/cmake/Modules/Packages/ML-MACE.cmake
@@ -1 +1,10 @@
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
+
 message("Hello from ML-MACE.cmake.")
+
+find_package(Torch REQUIRED)
+
+#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
+#target_include_directories(lammps PRIVATE "${TORCH_INCLUDE_DIRS}")
+target_link_libraries(lammps PRIVATE "${TORCH_LIBRARIES}")
+set_property(TARGET lammps PROPERTY CXX_STANDARD 14)
diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index a494d6dff28..33b9d8232a3 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -18,34 +18,191 @@
 
 #include "pair_mace.h"
 
+#include "atom.h"
+#include "memory.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+
+#include <iostream>
+
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
 PairMACE::PairMACE(LAMMPS *lmp) : Pair(lmp)
 {
+  std::cout << "Hello from MACE constructor." << std::endl;
+  std::cout << "Goodbye from MACE constructor." << std::endl;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairMACE::~PairMACE()
 {
+  std::cout << "Hello from MACE destructor." << std::endl;
+  std::cout << "Goodbye from MACE destructor." << std::endl;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMACE::compute(int eflag, int vflag)
 {
+  std::cout << "Hello from MACE compute." << std::endl;
+
+  ev_init(eflag, vflag);
+
+  // ----- positions -----
+  int n_nodes = atom->nlocal + atom->nghost;
+  auto positions = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
+  for (int ii = 0; ii < n_nodes; ii++)
+    for (int jj = 0; jj < 3; jj++)
+      positions[ii][jj] = atom->x[ii][jj];
+
+  // ----- cell -----
+  auto cell = torch::zeros({3,3}, torch::dtype(torch::kFloat64));
+  for (int ii = 0; ii < 3; ii++) {
+    cell[ii][ii] = 50.0;
+  }
+
+  // ----- edge_index -----
+  int n_edges = 0;
+  for (int ii = 0; ii < list->inum; ii++)
+    n_edges += list->numneigh[list->ilist[ii]];
+  auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
+  int k = 0;
+  for (int ii = 0; ii < list->inum; ii++) {
+    int i = list->ilist[ii];
+    int *jlist = list->firstneigh[i];
+    int jnum = list->numneigh[i];
+    for (int jj = 0; jj < jnum; jj++) {
+      edge_index[0][k] = i;
+      edge_index[1][k] = jlist[jj];
+      //edge_index[1,k] = (jlist[jj] & NEIGHMASK) + 1;
+      k++;
+    }
+  }
+
+  // node_attrs involves atomic numbers
+  int n_node_feats = atom->ntypes;
+  auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch::dtype(torch::kFloat64));
+  // TODO: generalize this
+  for (int ii = 0; ii < list->inum; ii++) {
+    if (atom->type[ii] == 1) {
+      node_attrs[ii][1] = 1.0;
+    } else if (atom->type[ii]==2) {
+      node_attrs[ii][0] = 1.0;
+    }
+  }
+
+  // TODO: consider from_blob to avoid copy
+  auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
+  auto energy = torch::empty({1}, torch::dtype(torch::kFloat64));
+  auto forces = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
+  auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));  //? size
+  auto shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64)); //zeros instead of empty
+  auto weight = torch::empty({1}, torch::dtype(torch::kFloat64));
+  ptr[0] = 0;
+  ptr[1] = 3;
+  weight[0] = 1.0;
+
+  c10::Dict<std::string, torch::Tensor> input;
+  input.insert("batch", batch);
+  //std::cout << "batch" << std::endl;
+  //std::cout << batch << std::endl;
+  //input.insert("cell", cell);
+  //std::cout << "cell" << std::endl;
+  //std::cout << cell << std::endl;
+  input.insert("edge_index", edge_index);
+  //std::cout << "edge_index" << std::endl;
+  //std::cout << edge_index << std::endl;
+  input.insert("energy", energy);
+  //std::cout << "energy" << std::endl;
+  //std::cout << energy << std::endl;
+  input.insert("forces", forces);
+  //std::cout << "forces" << std::endl;
+  //std::cout << forces << std::endl;
+  input.insert("node_attrs", node_attrs);
+  //std::cout << "node_attrs" << std::endl;
+  //std::cout << node_attrs << std::endl;
+  input.insert("positions", positions);
+  //std::cout << "positions" << std::endl;
+  //std::cout << positions << std::endl;
+  input.insert("ptr", ptr);
+  //std::cout << "ptr" << std::endl;
+  //std::cout << ptr << std::endl;
+  input.insert("shifts", shifts);
+  //std::cout << "shifts" << std::endl;
+  //std::cout << shifts << std::endl;
+  input.insert("weight", weight);
+  //std::cout << "weight" << std::endl;
+  //std::cout << weight << std::endl;
+
+  std::cout << "evaluating model" << std::endl;
+  auto output = model.forward({input, true}).toGenericDict();
+  energy = output.at("energy").toTensor();
+  auto contributions = output.at("contributions").toTensor();
+  forces = output.at("forces").toTensor();
+  std::cout << energy << std::endl;
+  std::cout << contributions << std::endl;
+  std::cout << forces << std::endl;
+
+  std::cout << "Goodbye from MACE compute." << std::endl;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMACE::settings(int narg, char **arg)
 {
+  std::cout << "Hello from MACE settings." << std::endl;
+  std::cout << "Goodbye from MACE settings." << std::endl;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMACE::coeff(int narg, char **arg)
 {
+  std::cout << "Hello from MACE coeff." << std::endl;
+
+  if (!allocated) allocate();
+
+  std::cout << "Loading MACE model from \"" << arg[2] << "\" ...";
+  model = torch::jit::load(arg[2]);
+  std::cout << " finished." << std::endl;
+
+  for (int i=1; i<atom->ntypes+1; i++)
+    for (int j=i; j<atom->ntypes+1; j++)
+      setflag[i][j] = 1;
+
+  std::cout << "Goodbye from MACE coeff." << std::endl;
+}
+
+void PairMACE::init_style()
+{
+  std::cout << "Hello from MACE init_coef." << std::endl;
+  // require full neighbor list
+  neighbor->add_request(this, NeighConst::REQ_FULL);
+  std::cout << "Goodbye from MACE init_coef." << std::endl;
+}
+
+double PairMACE::init_one(int i, int j)
+{
+  // TODO: address neighbor list skin distance (2A) differently
+  return model.attr("r_max").toDouble() - 2.0;
+}
+
+void PairMACE::allocate()
+{
+  std::cout << "Hello from MACE allocate." << std::endl;
+
+  allocated = 1;
+
+  memory->create(setflag, atom->ntypes+1, atom->ntypes+1, "pair:setflag");
+  for (int i=1; i<atom->ntypes+1; i++)
+    for (int j=i; j<atom->ntypes+1; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cutsq, atom->ntypes+1, atom->ntypes+1, "pair:cutsq");
+  std::cout << "WARNING: may need to overload init_one, which sets cutsq." << std::endl;
+
+  std::cout << "Goodbye from MACE allocate." << std::endl;
 }
diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
index a0759ca7621..c57f76a5dd0 100644
--- a/src/ML-MACE/pair_mace.h
+++ b/src/ML-MACE/pair_mace.h
@@ -27,6 +27,8 @@ PairStyle(mace,PairMACE);
 
 #include "pair.h"
 
+#include <torch/script.h>
+
 namespace LAMMPS_NS {
 
 class PairMACE : public Pair {
@@ -38,9 +40,14 @@ class PairMACE : public Pair {
   void compute(int, int) override;
   void settings(int, char **) override;
   void coeff(int, char **) override;
+  void init_style() override;
+  double init_one(int, int) override;
+  void allocate();
 
  protected:
 
+  torch::jit::script::Module model;
+
 };
 }    // namespace LAMMPS_NS
 

From 9d8991e1958214973923ce21c349ded8f5155906 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 4 Oct 2022 12:04:48 +0100
Subject: [PATCH 04/51] various things.

---
 src/ML-MACE/pair_mace.cpp | 58 ++++++++++++++++++++++++++++++++-------
 src/ML-MACE/pair_mace.h   |  6 ++++
 2 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 33b9d8232a3..46b9231d53c 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -23,6 +23,7 @@
 #include "neigh_list.h"
 #include "neighbor.h"
 
+#include <algorithm>
 #include <iostream>
 
 using namespace LAMMPS_NS;
@@ -82,34 +83,48 @@ void PairMACE::compute(int eflag, int vflag)
     }
   }
 
+  std::cout << "mace atomic numbers size" << mace_atomic_numbers.size() << std::endl;
+  std::cout << "mace atomic numbers  " << mace_atomic_numbers << std::endl;
+  auto get_mace_type = [this](int lammps_type) {
+    for (int i=0; i<mace_atomic_numbers.size(); ++i) {
+      std::cout << "compare " << i << ":  " << mace_atomic_numbers[i] << "  " << lammps_atomic_numbers[lammps_type-1] << std::endl;
+      if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
+        return i+1;
+      }
+    }
+    // TODO: should throw error
+    return -1000;
+  };
+
   // node_attrs involves atomic numbers
-  int n_node_feats = atom->ntypes;
+  int n_node_feats = mace_atomic_numbers.size();
   auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch::dtype(torch::kFloat64));
   // TODO: generalize this
   for (int ii = 0; ii < list->inum; ii++) {
-    if (atom->type[ii] == 1) {
-      node_attrs[ii][1] = 1.0;
-    } else if (atom->type[ii]==2) {
-      node_attrs[ii][0] = 1.0;
-    }
+
+    // map lammps type to mace type
+    int mace_type = get_mace_type(atom->type[ii]);
+    std::cout << "mace_type " << mace_type << std::endl;
+    node_attrs[ii][mace_type-1] = 1.0;
   }
 
   // TODO: consider from_blob to avoid copy
   auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
   auto energy = torch::empty({1}, torch::dtype(torch::kFloat64));
   auto forces = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
-  auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));  //? size
+  auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));
   auto shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64)); //zeros instead of empty
+  auto unit_shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64)); //zeros instead of empty
   auto weight = torch::empty({1}, torch::dtype(torch::kFloat64));
-  ptr[0] = 0;
-  ptr[1] = 3;
+  ptr[0] = 0;  // always zero
+  ptr[1] = 3;  // always n_atoms
   weight[0] = 1.0;
 
   c10::Dict<std::string, torch::Tensor> input;
   input.insert("batch", batch);
   //std::cout << "batch" << std::endl;
   //std::cout << batch << std::endl;
-  //input.insert("cell", cell);
+  input.insert("cell", cell);
   //std::cout << "cell" << std::endl;
   //std::cout << cell << std::endl;
   input.insert("edge_index", edge_index);
@@ -133,6 +148,9 @@ void PairMACE::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   //std::cout << "shifts" << std::endl;
   //std::cout << shifts << std::endl;
+  input.insert("unit_shifts", unit_shifts);
+  //std::cout << "unit_shifts" << std::endl;
+  //std::cout << unit_shifts << std::endl;
   input.insert("weight", weight);
   //std::cout << "weight" << std::endl;
   //std::cout << weight << std::endl;
@@ -169,6 +187,26 @@ void PairMACE::coeff(int narg, char **arg)
   model = torch::jit::load(arg[2]);
   std::cout << " finished." << std::endl;
 
+  std::cout << "attributes" << std::endl;
+  for (const auto& pair : model.named_attributes()) {
+    //std::cout << pair.name << ": " << pair.value << std::endl;
+    std::cout << pair.name << std::endl;
+  }
+
+  r_max = model.attr("r_max").toDouble();
+  std::cout << "  - The r_max is: " << r_max << "." << std::endl;
+  std::cout << "  xxxx atomic numbers " << model.attr("atomic_numbers") << std::endl;
+  mace_atomic_numbers = model.attr("atomic_numbers").toIntVector();
+  std::cout << "  - The model atomic numbers are: " << mace_atomic_numbers << "." << std::endl;
+  //std::cout << "  - The atomic numbers are: " << model.attr("atomic_numbers") << "." << std::endl;
+
+  for (int i=3; i<narg; ++i) {
+    auto iter = std::find(periodic_table.begin(), periodic_table.end(), arg[i]);
+    int index = std::distance(periodic_table.begin(), iter) + 1;
+    lammps_atomic_numbers.push_back(index);
+  }
+  std::cout << "  - The pair_coeff atomic numbers are: " << lammps_atomic_numbers << "." << std::endl;
+
   for (int i=1; i<atom->ntypes+1; i++)
     for (int j=i; j<atom->ntypes+1; j++)
       setflag[i][j] = 1;
diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
index c57f76a5dd0..94de254c57e 100644
--- a/src/ML-MACE/pair_mace.h
+++ b/src/ML-MACE/pair_mace.h
@@ -47,6 +47,12 @@ class PairMACE : public Pair {
  protected:
 
   torch::jit::script::Module model;
+  double r_max;
+  std::vector<int64_t> mace_atomic_numbers;
+  std::vector<int64_t> lammps_atomic_numbers;
+  const std::array<std::string,10> periodic_table =
+    {"H", "He",
+     "Li", "Be", "B", "C", "N", "O", "F", "Ne"};
 
 };
 }    // namespace LAMMPS_NS

From be70511b445ac5372fd1fce9c43e5fc7e501f41b Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 4 Oct 2022 13:14:13 +0100
Subject: [PATCH 05/51] save global energy.

---
 src/ML-MACE/pair_mace.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 46b9231d53c..dfc6284e225 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -164,6 +164,7 @@ void PairMACE::compute(int eflag, int vflag)
   std::cout << contributions << std::endl;
   std::cout << forces << std::endl;
 
+  eng_vdwl = output.at("energy").toTensor()[0].item<double>();
   std::cout << "Goodbye from MACE compute." << std::endl;
 }
 

From b9207fa240de55ba14aa2acd671526e61f7c1eb1 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 11 Oct 2022 17:42:27 +0100
Subject: [PATCH 06/51] more progress, including reading site energies.

---
 src/ML-MACE/pair_mace.cpp | 52 ++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index dfc6284e225..9770a7177a6 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -52,13 +52,17 @@ void PairMACE::compute(int eflag, int vflag)
 
   ev_init(eflag, vflag);
 
+  std::cout << "nlocal:  " << atom->nlocal << std::endl;
+  std::cout << "nghost:  " << atom->nghost << std::endl;
+
   // ----- positions -----
   int n_nodes = atom->nlocal + atom->nghost;
   auto positions = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
-  for (int ii = 0; ii < n_nodes; ii++)
-    for (int jj = 0; jj < 3; jj++)
+  for (int ii = 0; ii < n_nodes; ii++) {
+    for (int jj = 0; jj < 3; jj++) {
       positions[ii][jj] = atom->x[ii][jj];
-
+    }
+  }
   // ----- cell -----
   auto cell = torch::zeros({3,3}, torch::dtype(torch::kFloat64));
   for (int ii = 0; ii < 3; ii++) {
@@ -83,11 +87,8 @@ void PairMACE::compute(int eflag, int vflag)
     }
   }
 
-  std::cout << "mace atomic numbers size" << mace_atomic_numbers.size() << std::endl;
-  std::cout << "mace atomic numbers  " << mace_atomic_numbers << std::endl;
   auto get_mace_type = [this](int lammps_type) {
     for (int i=0; i<mace_atomic_numbers.size(); ++i) {
-      std::cout << "compare " << i << ":  " << mace_atomic_numbers[i] << "  " << lammps_atomic_numbers[lammps_type-1] << std::endl;
       if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
         return i+1;
       }
@@ -117,7 +118,7 @@ void PairMACE::compute(int eflag, int vflag)
   auto unit_shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64)); //zeros instead of empty
   auto weight = torch::empty({1}, torch::dtype(torch::kFloat64));
   ptr[0] = 0;  // always zero
-  ptr[1] = 3;  // always n_atoms
+  ptr[1] = n_nodes;  // always n_atoms
   weight[0] = 1.0;
 
   c10::Dict<std::string, torch::Tensor> input;
@@ -156,13 +157,17 @@ void PairMACE::compute(int eflag, int vflag)
   //std::cout << weight << std::endl;
 
   std::cout << "evaluating model" << std::endl;
-  auto output = model.forward({input, true}).toGenericDict();
+  // when should stress be printed?
+  auto output = model.forward({input, false, true, false, false}).toGenericDict();
+  std::cout << "energy" << std::endl;
   energy = output.at("energy").toTensor();
-  auto contributions = output.at("contributions").toTensor();
-  forces = output.at("forces").toTensor();
   std::cout << energy << std::endl;
-  std::cout << contributions << std::endl;
+  std::cout << "forces" << std::endl;
+  forces = output.at("forces").toTensor();
   std::cout << forces << std::endl;
+  std::cout << "site energies" << std::endl;
+  auto site_energies = output.at("node_energy").toTensor();
+  std::cout << site_energies << std::endl;
 
   eng_vdwl = output.at("energy").toTensor()[0].item<double>();
   std::cout << "Goodbye from MACE compute." << std::endl;
@@ -188,19 +193,22 @@ void PairMACE::coeff(int narg, char **arg)
   model = torch::jit::load(arg[2]);
   std::cout << " finished." << std::endl;
 
-  std::cout << "attributes" << std::endl;
-  for (const auto& pair : model.named_attributes()) {
-    //std::cout << pair.name << ": " << pair.value << std::endl;
-    std::cout << pair.name << std::endl;
-  }
+//  std::cout << "attributes" << std::endl;
+//  for (const auto& pair : model.named_attributes()) {
+//    //std::cout << pair.name << ": " << pair.value << std::endl;
+//    std::cout << pair.name << std::endl;
+//  }
 
-  r_max = model.attr("r_max").toDouble();
+  r_max = model.attr("r_max").toTensor().item<double>();
   std::cout << "  - The r_max is: " << r_max << "." << std::endl;
-  std::cout << "  xxxx atomic numbers " << model.attr("atomic_numbers") << std::endl;
-  mace_atomic_numbers = model.attr("atomic_numbers").toIntVector();
-  std::cout << "  - The model atomic numbers are: " << mace_atomic_numbers << "." << std::endl;
-  //std::cout << "  - The atomic numbers are: " << model.attr("atomic_numbers") << "." << std::endl;
 
+  // extract atomic numbers from mace model
+  auto a_n = model.attr("atomic_numbers").toTensor();
+  for (int i=0; i<a_n.size(0); ++i) {
+    mace_atomic_numbers.push_back(a_n[i].item<int64_t>());
+  }
+  std::cout << "  - The model atomic numbers are: " << mace_atomic_numbers << "." << std::endl;
+  // extract atomic numbers from pair_coeff
   for (int i=3; i<narg; ++i) {
     auto iter = std::find(periodic_table.begin(), periodic_table.end(), arg[i]);
     int index = std::distance(periodic_table.begin(), iter) + 1;
@@ -226,7 +234,7 @@ void PairMACE::init_style()
 double PairMACE::init_one(int i, int j)
 {
   // TODO: address neighbor list skin distance (2A) differently
-  return model.attr("r_max").toDouble() - 2.0;
+  return model.attr("r_max").toTensor().item<double>() - 2.0;
 }
 
 void PairMACE::allocate()

From 637f1769a4d6c54c2e4a5f4241e403e603572c5b Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 11 Oct 2022 18:37:49 +0100
Subject: [PATCH 07/51] tidy.

---
 src/ML-MACE/pair_mace.cpp | 83 ++++++---------------------------------
 1 file changed, 13 insertions(+), 70 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 9770a7177a6..07c29ab700f 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -32,29 +32,20 @@ using namespace LAMMPS_NS;
 
 PairMACE::PairMACE(LAMMPS *lmp) : Pair(lmp)
 {
-  std::cout << "Hello from MACE constructor." << std::endl;
-  std::cout << "Goodbye from MACE constructor." << std::endl;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairMACE::~PairMACE()
 {
-  std::cout << "Hello from MACE destructor." << std::endl;
-  std::cout << "Goodbye from MACE destructor." << std::endl;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMACE::compute(int eflag, int vflag)
 {
-  std::cout << "Hello from MACE compute." << std::endl;
-
   ev_init(eflag, vflag);
 
-  std::cout << "nlocal:  " << atom->nlocal << std::endl;
-  std::cout << "nghost:  " << atom->nghost << std::endl;
-
   // ----- positions -----
   int n_nodes = atom->nlocal + atom->nghost;
   auto positions = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
@@ -87,7 +78,7 @@ void PairMACE::compute(int eflag, int vflag)
     }
   }
 
-  auto get_mace_type = [this](int lammps_type) {
+  auto mace_type = [this](int lammps_type) {
     for (int i=0; i<mace_atomic_numbers.size(); ++i) {
       if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
         return i+1;
@@ -102,11 +93,8 @@ void PairMACE::compute(int eflag, int vflag)
   auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch::dtype(torch::kFloat64));
   // TODO: generalize this
   for (int ii = 0; ii < list->inum; ii++) {
-
     // map lammps type to mace type
-    int mace_type = get_mace_type(atom->type[ii]);
-    std::cout << "mace_type " << mace_type << std::endl;
-    node_attrs[ii][mace_type-1] = 1.0;
+    node_attrs[ii][mace_type(atom->type[ii])-1] = 1.0;
   }
 
   // TODO: consider from_blob to avoid copy
@@ -114,91 +102,53 @@ void PairMACE::compute(int eflag, int vflag)
   auto energy = torch::empty({1}, torch::dtype(torch::kFloat64));
   auto forces = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
   auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));
-  auto shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64)); //zeros instead of empty
-  auto unit_shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64)); //zeros instead of empty
+  auto shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64));
+  auto unit_shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64));
   auto weight = torch::empty({1}, torch::dtype(torch::kFloat64));
-  ptr[0] = 0;  // always zero
-  ptr[1] = n_nodes;  // always n_atoms
+  ptr[0] = 0;
+  ptr[1] = n_nodes;
   weight[0] = 1.0;
 
+  // pack the input, call the model, extract the output
   c10::Dict<std::string, torch::Tensor> input;
   input.insert("batch", batch);
-  //std::cout << "batch" << std::endl;
-  //std::cout << batch << std::endl;
   input.insert("cell", cell);
-  //std::cout << "cell" << std::endl;
-  //std::cout << cell << std::endl;
   input.insert("edge_index", edge_index);
-  //std::cout << "edge_index" << std::endl;
-  //std::cout << edge_index << std::endl;
   input.insert("energy", energy);
-  //std::cout << "energy" << std::endl;
-  //std::cout << energy << std::endl;
   input.insert("forces", forces);
-  //std::cout << "forces" << std::endl;
-  //std::cout << forces << std::endl;
   input.insert("node_attrs", node_attrs);
-  //std::cout << "node_attrs" << std::endl;
-  //std::cout << node_attrs << std::endl;
   input.insert("positions", positions);
-  //std::cout << "positions" << std::endl;
-  //std::cout << positions << std::endl;
   input.insert("ptr", ptr);
-  //std::cout << "ptr" << std::endl;
-  //std::cout << ptr << std::endl;
   input.insert("shifts", shifts);
-  //std::cout << "shifts" << std::endl;
-  //std::cout << shifts << std::endl;
   input.insert("unit_shifts", unit_shifts);
-  //std::cout << "unit_shifts" << std::endl;
-  //std::cout << unit_shifts << std::endl;
   input.insert("weight", weight);
-  //std::cout << "weight" << std::endl;
-  //std::cout << weight << std::endl;
-
-  std::cout << "evaluating model" << std::endl;
-  // when should stress be printed?
+  // TODO: when should stress be printed?
   auto output = model.forward({input, false, true, false, false}).toGenericDict();
-  std::cout << "energy" << std::endl;
-  energy = output.at("energy").toTensor();
-  std::cout << energy << std::endl;
-  std::cout << "forces" << std::endl;
-  forces = output.at("forces").toTensor();
-  std::cout << forces << std::endl;
-  std::cout << "site energies" << std::endl;
+  // TODO: remove this unused energy call
+  //energy = output.at("energy").toTensor();
   auto site_energies = output.at("node_energy").toTensor();
-  std::cout << site_energies << std::endl;
+  forces = output.at("forces").toTensor();
 
+  // post-process
   eng_vdwl = output.at("energy").toTensor()[0].item<double>();
-  std::cout << "Goodbye from MACE compute." << std::endl;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMACE::settings(int narg, char **arg)
 {
-  std::cout << "Hello from MACE settings." << std::endl;
-  std::cout << "Goodbye from MACE settings." << std::endl;
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMACE::coeff(int narg, char **arg)
 {
-  std::cout << "Hello from MACE coeff." << std::endl;
-
   if (!allocated) allocate();
 
   std::cout << "Loading MACE model from \"" << arg[2] << "\" ...";
   model = torch::jit::load(arg[2]);
   std::cout << " finished." << std::endl;
 
-//  std::cout << "attributes" << std::endl;
-//  for (const auto& pair : model.named_attributes()) {
-//    //std::cout << pair.name << ": " << pair.value << std::endl;
-//    std::cout << pair.name << std::endl;
-//  }
-
   r_max = model.attr("r_max").toTensor().item<double>();
   std::cout << "  - The r_max is: " << r_max << "." << std::endl;
 
@@ -208,6 +158,7 @@ void PairMACE::coeff(int narg, char **arg)
     mace_atomic_numbers.push_back(a_n[i].item<int64_t>());
   }
   std::cout << "  - The model atomic numbers are: " << mace_atomic_numbers << "." << std::endl;
+
   // extract atomic numbers from pair_coeff
   for (int i=3; i<narg; ++i) {
     auto iter = std::find(periodic_table.begin(), periodic_table.end(), arg[i]);
@@ -219,16 +170,12 @@ void PairMACE::coeff(int narg, char **arg)
   for (int i=1; i<atom->ntypes+1; i++)
     for (int j=i; j<atom->ntypes+1; j++)
       setflag[i][j] = 1;
-
-  std::cout << "Goodbye from MACE coeff." << std::endl;
 }
 
 void PairMACE::init_style()
 {
-  std::cout << "Hello from MACE init_coef." << std::endl;
   // require full neighbor list
   neighbor->add_request(this, NeighConst::REQ_FULL);
-  std::cout << "Goodbye from MACE init_coef." << std::endl;
 }
 
 double PairMACE::init_one(int i, int j)
@@ -239,8 +186,6 @@ double PairMACE::init_one(int i, int j)
 
 void PairMACE::allocate()
 {
-  std::cout << "Hello from MACE allocate." << std::endl;
-
   allocated = 1;
 
   memory->create(setflag, atom->ntypes+1, atom->ntypes+1, "pair:setflag");
@@ -250,6 +195,4 @@ void PairMACE::allocate()
 
   memory->create(cutsq, atom->ntypes+1, atom->ntypes+1, "pair:cutsq");
   std::cout << "WARNING: may need to overload init_one, which sets cutsq." << std::endl;
-
-  std::cout << "Goodbye from MACE allocate." << std::endl;
 }

From 51329f4304f76d1bac9c347dbad6ce687a21c442 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 11 Oct 2022 18:48:34 +0100
Subject: [PATCH 08/51] get cell info from lammps.

---
 src/ML-MACE/pair_mace.cpp | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 07c29ab700f..dede53fc85d 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -19,6 +19,7 @@
 #include "pair_mace.h"
 
 #include "atom.h"
+#include "domain.h"
 #include "memory.h"
 #include "neigh_list.h"
 #include "neighbor.h"
@@ -56,9 +57,15 @@ void PairMACE::compute(int eflag, int vflag)
   }
   // ----- cell -----
   auto cell = torch::zeros({3,3}, torch::dtype(torch::kFloat64));
-  for (int ii = 0; ii < 3; ii++) {
-    cell[ii][ii] = 50.0;
-  }
+  cell[0][0] = domain->xprd;
+  cell[0][1] = 0.0;
+  cell[0][2] = 0.0;
+  cell[1][0] = domain->xy;
+  cell[1][1] = domain->yprd;
+  cell[1][2] = 0.0;
+  cell[2][0] = domain->xz;
+  cell[2][1] = domain->yz;
+  cell[2][2] = domain->zprd;
 
   // ----- edge_index -----
   int n_edges = 0;

From 19625a371d810b5d58d0fba11f405c429b6fa3a9 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Wed, 2 Nov 2022 14:33:13 +0000
Subject: [PATCH 09/51] add num_interactions and more.

---
 src/ML-MACE/pair_mace.cpp | 89 ++++++++++++++++++++++++++++++---------
 src/ML-MACE/pair_mace.h   |  1 +
 2 files changed, 69 insertions(+), 21 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index dede53fc85d..e3d735d36cb 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -48,13 +48,17 @@ void PairMACE::compute(int eflag, int vflag)
   ev_init(eflag, vflag);
 
   // ----- positions -----
+  // WARNING: list->gnum not the same as atom->nghost
   int n_nodes = atom->nlocal + atom->nghost;
+  //int n_nodes = list->inum + list->gnum;
   auto positions = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
   for (int ii = 0; ii < n_nodes; ii++) {
-    for (int jj = 0; jj < 3; jj++) {
-      positions[ii][jj] = atom->x[ii][jj];
-    }
+    int i = list->ilist[ii];
+    positions[i][0] = atom->x[i][0];
+    positions[i][1] = atom->x[i][1];
+    positions[i][2] = atom->x[i][2];
   }
+
   // ----- cell -----
   auto cell = torch::zeros({3,3}, torch::dtype(torch::kFloat64));
   cell[0][0] = domain->xprd;
@@ -68,20 +72,53 @@ void PairMACE::compute(int eflag, int vflag)
   cell[2][2] = domain->zprd;
 
   // ----- edge_index -----
+  // count total number of edges
   int n_edges = 0;
-  for (int ii = 0; ii < list->inum; ii++)
-    n_edges += list->numneigh[list->ilist[ii]];
+  for (int ii = 0; ii < n_nodes; ii++) {
+  //for (int ii = 0; ii < list->inum; ii++) {
+    int i = list->ilist[ii];
+    double xtmp = atom->x[i][0];
+    double ytmp = atom->x[i][1];
+    double ztmp = atom->x[i][2];
+    int *jlist = list->firstneigh[i];
+    int jnum = list->numneigh[i];
+    for (int jj = 0; jj < jnum; jj++) {
+      int j = jlist[jj];
+      j &= NEIGHMASK; // TODO: what is this?
+      double delx = xtmp - atom->x[j][0];
+      double dely = ytmp - atom->x[j][1];
+      double delz = ztmp - atom->x[j][2];
+      double rsq = delx * delx + dely * dely + delz * delz;
+      // TODO: which is better?
+      //if (rsq < cutsq[itype][jtype]) n_edges += 1;
+      if (rsq < r_max*r_max) n_edges += 1;
+    }
+  }
+
+
   auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
   int k = 0;
-  for (int ii = 0; ii < list->inum; ii++) {
+  for (int ii = 0; ii < n_nodes; ii++) {
+  //for (int ii = 0; ii < list->inum; ii++) {
     int i = list->ilist[ii];
+    double xtmp = atom->x[i][0];
+    double ytmp = atom->x[i][1];
+    double ztmp = atom->x[i][2];
     int *jlist = list->firstneigh[i];
     int jnum = list->numneigh[i];
     for (int jj = 0; jj < jnum; jj++) {
-      edge_index[0][k] = i;
-      edge_index[1][k] = jlist[jj];
-      //edge_index[1,k] = (jlist[jj] & NEIGHMASK) + 1;
-      k++;
+      int j = jlist[jj];
+      j &= NEIGHMASK; // TODO: what is this?
+      double delx = xtmp - atom->x[j][0];
+      double dely = ytmp - atom->x[j][1];
+      double delz = ztmp - atom->x[j][2];
+      double rsq = delx * delx + dely * dely + delz * delz;
+      if (rsq < r_max*r_max) {
+        edge_index[0][k] = i;
+        edge_index[1][k] = j;
+        //edge_index[1,k] = (jlist[jj] & NEIGHMASK) + 1;
+        k++;
+      }
     }
   }
 
@@ -98,10 +135,10 @@ void PairMACE::compute(int eflag, int vflag)
   // node_attrs involves atomic numbers
   int n_node_feats = mace_atomic_numbers.size();
   auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch::dtype(torch::kFloat64));
-  // TODO: generalize this
-  for (int ii = 0; ii < list->inum; ii++) {
+  for (int ii = 0; ii < n_nodes; ii++) {
+    int i = list->ilist[ii];
     // map lammps type to mace type
-    node_attrs[ii][mace_type(atom->type[ii])-1] = 1.0;
+    node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
   }
 
   // TODO: consider from_blob to avoid copy
@@ -131,13 +168,21 @@ void PairMACE::compute(int eflag, int vflag)
   input.insert("weight", weight);
   // TODO: when should stress be printed?
   auto output = model.forward({input, false, true, false, false}).toGenericDict();
-  // TODO: remove this unused energy call
-  //energy = output.at("energy").toTensor();
-  auto site_energies = output.at("node_energy").toTensor();
+  auto node_energy = output.at("node_energy").toTensor();
   forces = output.at("forces").toTensor();
 
   // post-process
-  eng_vdwl = output.at("energy").toTensor()[0].item<double>();
+  eng_vdwl = 0.0;
+  for (int ii = 0; ii < n_nodes; ii++) {
+    int i = list->ilist[ii];
+    // TODO: update forces here
+    if (ii < list->inum) {
+      eng_vdwl += node_energy[i].item<double>();
+    if (eflag_atom)
+      eatom[i] = node_energy[i].item<double>();
+    }
+  }
+
 }
 
 /* ---------------------------------------------------------------------- */
@@ -150,6 +195,8 @@ void PairMACE::settings(int narg, char **arg)
 
 void PairMACE::coeff(int narg, char **arg)
 {
+  // TODO: remove print statements from this routine, or have a single proc print
+
   if (!allocated) allocate();
 
   std::cout << "Loading MACE model from \"" << arg[2] << "\" ...";
@@ -158,6 +205,8 @@ void PairMACE::coeff(int narg, char **arg)
 
   r_max = model.attr("r_max").toTensor().item<double>();
   std::cout << "  - The r_max is: " << r_max << "." << std::endl;
+  num_interactions = model.attr("num_interactions").toTensor().item<int64_t>();
+  std::cout << "  - The model has: " << num_interactions << " layers." << std::endl;
 
   // extract atomic numbers from mace model
   auto a_n = model.attr("atomic_numbers").toTensor();
@@ -181,14 +230,12 @@ void PairMACE::coeff(int narg, char **arg)
 
 void PairMACE::init_style()
 {
-  // require full neighbor list
-  neighbor->add_request(this, NeighConst::REQ_FULL);
+  neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
 }
 
 double PairMACE::init_one(int i, int j)
 {
-  // TODO: address neighbor list skin distance (2A) differently
-  return model.attr("r_max").toTensor().item<double>() - 2.0;
+  return num_interactions*model.attr("r_max").toTensor().item<double>();
 }
 
 void PairMACE::allocate()
diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
index 94de254c57e..5d0ed3da0f5 100644
--- a/src/ML-MACE/pair_mace.h
+++ b/src/ML-MACE/pair_mace.h
@@ -48,6 +48,7 @@ class PairMACE : public Pair {
 
   torch::jit::script::Module model;
   double r_max;
+  int64_t num_interactions;
   std::vector<int64_t> mace_atomic_numbers;
   std::vector<int64_t> lammps_atomic_numbers;
   const std::array<std::string,10> periodic_table =

From d97f7c98776bf6a4ef0a8afdf0216487f3318585 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Thu, 3 Nov 2022 02:41:09 +0000
Subject: [PATCH 10/51] several improvements.

---
 src/ML-MACE/pair_mace.cpp | 49 ++++++++++++++++++++++++---------------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index e3d735d36cb..7d098a78b90 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -20,12 +20,15 @@
 
 #include "atom.h"
 #include "domain.h"
+#include "error.h"
+#include "force.h"
 #include "memory.h"
 #include "neigh_list.h"
 #include "neighbor.h"
 
 #include <algorithm>
 #include <iostream>
+#include <stdexcept>
 
 using namespace LAMMPS_NS;
 
@@ -47,10 +50,11 @@ void PairMACE::compute(int eflag, int vflag)
 {
   ev_init(eflag, vflag);
 
+  if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
+  if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
+
   // ----- positions -----
-  // WARNING: list->gnum not the same as atom->nghost
-  int n_nodes = atom->nlocal + atom->nghost;
-  //int n_nodes = list->inum + list->gnum;
+  int n_nodes = list->inum + list->gnum;
   auto positions = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
   for (int ii = 0; ii < n_nodes; ii++) {
     int i = list->ilist[ii];
@@ -75,7 +79,6 @@ void PairMACE::compute(int eflag, int vflag)
   // count total number of edges
   int n_edges = 0;
   for (int ii = 0; ii < n_nodes; ii++) {
-  //for (int ii = 0; ii < list->inum; ii++) {
     int i = list->ilist[ii];
     double xtmp = atom->x[i][0];
     double ytmp = atom->x[i][1];
@@ -89,8 +92,6 @@ void PairMACE::compute(int eflag, int vflag)
       double dely = ytmp - atom->x[j][1];
       double delz = ztmp - atom->x[j][2];
       double rsq = delx * delx + dely * dely + delz * delz;
-      // TODO: which is better?
-      //if (rsq < cutsq[itype][jtype]) n_edges += 1;
       if (rsq < r_max*r_max) n_edges += 1;
     }
   }
@@ -99,7 +100,6 @@ void PairMACE::compute(int eflag, int vflag)
   auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
   int k = 0;
   for (int ii = 0; ii < n_nodes; ii++) {
-  //for (int ii = 0; ii < list->inum; ii++) {
     int i = list->ilist[ii];
     double xtmp = atom->x[i][0];
     double ytmp = atom->x[i][1];
@@ -128,20 +128,18 @@ void PairMACE::compute(int eflag, int vflag)
         return i+1;
       }
     }
-    // TODO: should throw error
-    return -1000;
+    error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
+    return -1;
   };
 
-  // node_attrs involves atomic numbers
+  // node_attrs is one-hot encoding for atomic numbers
   int n_node_feats = mace_atomic_numbers.size();
   auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch::dtype(torch::kFloat64));
   for (int ii = 0; ii < n_nodes; ii++) {
     int i = list->ilist[ii];
-    // map lammps type to mace type
     node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
   }
 
-  // TODO: consider from_blob to avoid copy
   auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
   auto energy = torch::empty({1}, torch::dtype(torch::kFloat64));
   auto forces = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
@@ -166,20 +164,21 @@ void PairMACE::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-  // TODO: when should stress be printed?
-  auto output = model.forward({input, false, true, false, false}).toGenericDict();
+  auto output = model.forward({input, false, true, vflag, false}).toGenericDict();
   auto node_energy = output.at("node_energy").toTensor();
   forces = output.at("forces").toTensor();
 
-  // post-process
+  // pass the output to LAMMPS variables
   eng_vdwl = 0.0;
   for (int ii = 0; ii < n_nodes; ii++) {
     int i = list->ilist[ii];
-    // TODO: update forces here
     if (ii < list->inum) {
       eng_vdwl += node_energy[i].item<double>();
-    if (eflag_atom)
-      eatom[i] = node_energy[i].item<double>();
+      atom->f[i][0] = forces[i][0].item<double>();
+      atom->f[i][1] = forces[i][1].item<double>();
+      atom->f[i][2] = forces[i][2].item<double>();
+      if (eflag_atom)
+        eatom[i] = node_energy[i].item<double>();
     }
   }
 
@@ -230,11 +229,24 @@ void PairMACE::coeff(int narg, char **arg)
 
 void PairMACE::init_style()
 {
+  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
+
+  /*
+    MACE requires the full neighbor list AND neighbors of ghost atoms
+    it appears that:
+      * without REQ_GHOST
+           list->gnum == 0
+           list->ilist does not include ghost atoms, but the jlists do
+      * with REQ_GHOST
+           list->gnum == atom->nghost
+           list->ilist includes ghost atoms
+  */
   neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
 }
 
 double PairMACE::init_one(int i, int j)
 {
+  // to account for message passing, require cutoff of n_layers * r_max
   return num_interactions*model.attr("r_max").toTensor().item<double>();
 }
 
@@ -248,5 +260,4 @@ void PairMACE::allocate()
       setflag[i][j] = 0;
 
   memory->create(cutsq, atom->ntypes+1, atom->ntypes+1, "pair:cutsq");
-  std::cout << "WARNING: may need to overload init_one, which sets cutsq." << std::endl;
 }

From f611691683a39d71e7dd885a65a0b7621c5dbe9a Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Fri, 18 Nov 2022 22:13:07 +0000
Subject: [PATCH 11/51] virials now working in serial.

---
 src/ML-MACE/pair_mace.cpp | 62 +++++++++++++++++++++++++++++++--------
 1 file changed, 49 insertions(+), 13 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 7d098a78b90..2b7acc9507d 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -36,6 +36,7 @@ using namespace LAMMPS_NS;
 
 PairMACE::PairMACE(LAMMPS *lmp) : Pair(lmp)
 {
+  no_virial_fdotr_compute = 1;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -164,24 +165,59 @@ void PairMACE::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-  auto output = model.forward({input, false, true, vflag, false}).toGenericDict();
-  auto node_energy = output.at("node_energy").toTensor();
-  forces = output.at("forces").toTensor();
+  auto output = model.forward({input, false, true, true, false}).toGenericDict();
 
-  // pass the output to LAMMPS variables
-  eng_vdwl = 0.0;
-  for (int ii = 0; ii < n_nodes; ii++) {
-    int i = list->ilist[ii];
-    if (ii < list->inum) {
+  auto stress = output.at("stress").toTensor();
+
+  // mace energy
+  //   -> sum of site energies of local atoms
+  if (eflag_global) {
+    auto node_energy = output.at("node_energy").toTensor();
+    eng_vdwl = 0.0;
+    for (int ii = 0; ii < list->inum; ii++) {
+      int i = list->ilist[ii];
       eng_vdwl += node_energy[i].item<double>();
-      atom->f[i][0] = forces[i][0].item<double>();
-      atom->f[i][1] = forces[i][1].item<double>();
-      atom->f[i][2] = forces[i][2].item<double>();
-      if (eflag_atom)
-        eatom[i] = node_energy[i].item<double>();
     }
   }
 
+  // mace forces
+  //   -> derivatives of total mace energy
+  forces = output.at("forces").toTensor();
+  for (int ii = 0; ii < list->inum; ii++) {
+    int i = list->ilist[ii];
+    atom->f[i][0] = forces[i][0].item<double>();
+    atom->f[i][1] = forces[i][1].item<double>();
+    atom->f[i][2] = forces[i][2].item<double>();
+  }
+
+  // mace site energies
+  //   -> local atoms only
+  if (eflag_atom) {
+    auto node_energy = output.at("node_energy").toTensor();
+    for (int ii = 0; ii < list->inum; ii++) {
+      int i = list->ilist[ii];
+      eatom[i] = node_energy[i].item<double>();
+    }
+  }
+
+  // mace virials (local atoms only)
+  //   -> derivatives of sum of site energies of local atoms
+  if (vflag_global) {
+    auto vir = output.at("virials").toTensor();
+    virial[0] = vir[0][0][0].item<double>();
+    virial[1] = vir[0][1][1].item<double>();
+    virial[2] = vir[0][2][2].item<double>();
+    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
+    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
+  }
+
+  // mace site virials
+  //   -> not available
+  if (vflag_atom) {
+    error->all(FLERR, "ERROR: pair_mace does not support vflag_atom.");
+  }
+
 }
 
 /* ---------------------------------------------------------------------- */

From 17148576c776d5c53e7cef19219e804878756f10 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Sun, 20 Nov 2022 23:46:10 +0000
Subject: [PATCH 12/51] update virials for MPI.

---
 src/ML-MACE/pair_mace.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 2b7acc9507d..9a85df4612f 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -123,6 +123,8 @@ void PairMACE::compute(int eflag, int vflag)
     }
   }
 
+  //
+
   auto mace_type = [this](int lammps_type) {
     for (int i=0; i<mace_atomic_numbers.size(); ++i) {
       if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
@@ -141,6 +143,13 @@ void PairMACE::compute(int eflag, int vflag)
     node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
   }
 
+  // ----- mask for ghost -----
+  auto mask = torch::zeros(n_nodes, torch::dtype(torch::kBool));
+  for (int ii = 0; ii < list->inum; ii++) {
+    int i = list->ilist[ii];
+    mask[i] = true;
+  }
+
   auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
   auto energy = torch::empty({1}, torch::dtype(torch::kFloat64));
   auto forces = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
@@ -165,7 +174,7 @@ void PairMACE::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-  auto output = model.forward({input, false, true, true, false}).toGenericDict();
+  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
 
   auto stress = output.at("stress").toTensor();
 

From f5d7251ce82129e2aca9ff6e9650ed64640e2265 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Sun, 20 Nov 2022 23:46:24 +0000
Subject: [PATCH 13/51] grow periodic table.

---
 src/ML-MACE/pair_mace.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
index 5d0ed3da0f5..c54f569fcc3 100644
--- a/src/ML-MACE/pair_mace.h
+++ b/src/ML-MACE/pair_mace.h
@@ -52,8 +52,15 @@ class PairMACE : public Pair {
   std::vector<int64_t> mace_atomic_numbers;
   std::vector<int64_t> lammps_atomic_numbers;
   const std::array<std::string,10> periodic_table =
-    {"H", "He",
-     "Li", "Be", "B", "C", "N", "O", "F", "Ne"};
+    { "H", "He",
+     "Li", "Be",                                                              "B",  "C",  "N",  "O",  "F", "Ne",
+     "Na", "Mg",                                                             "Al", "Si",  "P",  "S", "Cl", "Ar",
+     "K",  "Ca", "Sc", "Ti",  "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
+     "Rb", "Sr",  "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te",  "I", "Xe",
+     "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
+                       "Hf", "Ta",  "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn",
+     "Fr", "Ra", "Ac", "Th", "Pa",  "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
+                       "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"};
 
 };
 }    // namespace LAMMPS_NS

From 67e96baf011e4b82471218fe7d48d2ec9ac5f89e Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Fri, 25 Nov 2022 11:11:31 +0000
Subject: [PATCH 14/51] update periodic_table.

---
 src/ML-MACE/pair_mace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
index c54f569fcc3..1ce687cc274 100644
--- a/src/ML-MACE/pair_mace.h
+++ b/src/ML-MACE/pair_mace.h
@@ -51,7 +51,7 @@ class PairMACE : public Pair {
   int64_t num_interactions;
   std::vector<int64_t> mace_atomic_numbers;
   std::vector<int64_t> lammps_atomic_numbers;
-  const std::array<std::string,10> periodic_table =
+  const std::array<std::string,118> periodic_table =
     { "H", "He",
      "Li", "Be",                                                              "B",  "C",  "N",  "O",  "F", "Ne",
      "Na", "Mg",                                                             "Al", "Si",  "P",  "S", "Cl", "Ar",

From 3bd488c03a452aaf664ee43ca61565a313e2256f Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Fri, 9 Dec 2022 13:03:52 +0000
Subject: [PATCH 15/51] add ghostneigh=1 to pair_mace construction.

---
 src/ML-MACE/pair_mace.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 9a85df4612f..3dae6514ee7 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -37,6 +37,7 @@ using namespace LAMMPS_NS;
 PairMACE::PairMACE(LAMMPS *lmp) : Pair(lmp)
 {
   no_virial_fdotr_compute = 1;
+  ghostneigh = 1;
 }
 
 /* ---------------------------------------------------------------------- */

From d212e47364f04e29aa465096fe9ee368d9bd60ff Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Sun, 11 Dec 2022 21:51:58 +0000
Subject: [PATCH 16/51] remove todo comments.

---
 src/ML-MACE/pair_mace.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 3dae6514ee7..e4bd09f3624 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -89,7 +89,7 @@ void PairMACE::compute(int eflag, int vflag)
     int jnum = list->numneigh[i];
     for (int jj = 0; jj < jnum; jj++) {
       int j = jlist[jj];
-      j &= NEIGHMASK; // TODO: what is this?
+      j &= NEIGHMASK;
       double delx = xtmp - atom->x[j][0];
       double dely = ytmp - atom->x[j][1];
       double delz = ztmp - atom->x[j][2];
@@ -110,7 +110,7 @@ void PairMACE::compute(int eflag, int vflag)
     int jnum = list->numneigh[i];
     for (int jj = 0; jj < jnum; jj++) {
       int j = jlist[jj];
-      j &= NEIGHMASK; // TODO: what is this?
+      j &= NEIGHMASK;
       double delx = xtmp - atom->x[j][0];
       double dely = ytmp - atom->x[j][1];
       double delz = ztmp - atom->x[j][2];
@@ -118,7 +118,6 @@ void PairMACE::compute(int eflag, int vflag)
       if (rsq < r_max*r_max) {
         edge_index[0][k] = i;
         edge_index[1][k] = j;
-        //edge_index[1,k] = (jlist[jj] & NEIGHMASK) + 1;
         k++;
       }
     }

From 577b043a3104756636d39244e1885c349fc8e325 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Sun, 11 Dec 2022 23:48:09 +0000
Subject: [PATCH 17/51] Revert "add ghostneigh=1 to pair_mace construction." We
 don't set cutghost explicitly, so unnecessary.

This reverts commit 3bd488c03a452aaf664ee43ca61565a313e2256f.
---
 src/ML-MACE/pair_mace.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index e4bd09f3624..3d3d1cec661 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -37,7 +37,6 @@ using namespace LAMMPS_NS;
 PairMACE::PairMACE(LAMMPS *lmp) : Pair(lmp)
 {
   no_virial_fdotr_compute = 1;
-  ghostneigh = 1;
 }
 
 /* ---------------------------------------------------------------------- */

From 42c459ff4442d66b0b7b4d1faac3bbe93ab9cf5e Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 20 Dec 2022 22:25:13 +0000
Subject: [PATCH 18/51] add no_domain_decomposition option and other tweaks.

---
 src/ML-MACE/pair_mace.cpp | 152 ++++++++++++++++++++++++++++----------
 src/ML-MACE/pair_mace.h   |   3 +
 2 files changed, 114 insertions(+), 41 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 3d3d1cec661..3d813c09d65 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -52,12 +52,24 @@ void PairMACE::compute(int eflag, int vflag)
   ev_init(eflag, vflag);
 
   if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
-  if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
+  if (domain_decomposition) {
+    if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
+  }
 
   // ----- positions -----
-  int n_nodes = list->inum + list->gnum;
-  auto positions = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
-  for (int ii = 0; ii < n_nodes; ii++) {
+  int n_nodes;
+  if (domain_decomposition) {
+    n_nodes = atom->nlocal + atom->nghost;
+  } else {
+    // normally, ghost atoms are included in the graph as independent
+    // nodes, as required when the local domain does not have PBC.
+    // however, in no_domain_decomposition mode, ghost atoms are known to
+    // be shifted versions of local atoms.
+    n_nodes = atom->nlocal;
+  }
+  auto positions = torch::empty({n_nodes,3}, torch_float_dtype);
+  #pragma omp parallel for
+  for (int ii=0; ii<n_nodes; ++ii) {
     int i = list->ilist[ii];
     positions[i][0] = atom->x[i][0];
     positions[i][1] = atom->x[i][1];
@@ -65,65 +77,95 @@ void PairMACE::compute(int eflag, int vflag)
   }
 
   // ----- cell -----
-  auto cell = torch::zeros({3,3}, torch::dtype(torch::kFloat64));
-  cell[0][0] = domain->xprd;
+  auto cell = torch::zeros({3,3}, torch_float_dtype);
+  cell[0][0] = domain->h[0];
   cell[0][1] = 0.0;
   cell[0][2] = 0.0;
-  cell[1][0] = domain->xy;
-  cell[1][1] = domain->yprd;
+  cell[1][0] = domain->h[5];
+  cell[1][1] = domain->h[1];
   cell[1][2] = 0.0;
-  cell[2][0] = domain->xz;
-  cell[2][1] = domain->yz;
-  cell[2][2] = domain->zprd;
+  cell[2][0] = domain->h[4];
+  cell[2][1] = domain->h[3];
+  cell[2][2] = domain->h[2];
 
-  // ----- edge_index -----
+  // ----- edge_index and unit_shifts -----
   // count total number of edges
   int n_edges = 0;
-  for (int ii = 0; ii < n_nodes; ii++) {
+  std::vector<int> n_edges_vec(n_nodes, 0);
+  #pragma omp parallel for reduction(+:n_edges)
+  for (int ii=0; ii<n_nodes; ++ii) {
     int i = list->ilist[ii];
     double xtmp = atom->x[i][0];
     double ytmp = atom->x[i][1];
     double ztmp = atom->x[i][2];
     int *jlist = list->firstneigh[i];
     int jnum = list->numneigh[i];
-    for (int jj = 0; jj < jnum; jj++) {
+    for (int jj=0; jj<jnum; ++jj) {
       int j = jlist[jj];
       j &= NEIGHMASK;
       double delx = xtmp - atom->x[j][0];
       double dely = ytmp - atom->x[j][1];
       double delz = ztmp - atom->x[j][2];
       double rsq = delx * delx + dely * dely + delz * delz;
-      if (rsq < r_max*r_max) n_edges += 1;
+      if (rsq < r_max_squared) {
+        n_edges += 1;
+        n_edges_vec[ii] += 1;
+      }
     }
   }
-
-
+  // make first_edge vector to help with parallelizing following loop
+  std::vector<int> first_edge(n_nodes);
+  first_edge[0] = 0;
+  for (int ii=0; ii<n_nodes-1; ++ii) {
+    first_edge[ii+1] = first_edge[ii] + n_edges_vec[ii];
+  }
+  // fill edge_index and unit_shifts tensors
   auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
-  int k = 0;
-  for (int ii = 0; ii < n_nodes; ii++) {
+  auto unit_shifts = torch::zeros({n_edges,3}, torch_float_dtype);
+  auto shifts = torch::zeros({n_edges,3}, torch_float_dtype);
+  #pragma omp parallel for
+  for (int ii=0; ii<n_nodes; ++ii) {
     int i = list->ilist[ii];
     double xtmp = atom->x[i][0];
     double ytmp = atom->x[i][1];
     double ztmp = atom->x[i][2];
     int *jlist = list->firstneigh[i];
     int jnum = list->numneigh[i];
-    for (int jj = 0; jj < jnum; jj++) {
+    int k = first_edge[ii];
+    for (int jj=0; jj<jnum; ++jj) {
       int j = jlist[jj];
       j &= NEIGHMASK;
       double delx = xtmp - atom->x[j][0];
       double dely = ytmp - atom->x[j][1];
       double delz = ztmp - atom->x[j][2];
       double rsq = delx * delx + dely * dely + delz * delz;
-      if (rsq < r_max*r_max) {
+      if (rsq < r_max_squared) {
         edge_index[0][k] = i;
-        edge_index[1][k] = j;
+        if (domain_decomposition) {
+          edge_index[1][k] = j;
+        } else {
+          int j_local = atom->map(atom->tag[j]);
+          edge_index[1][k] = j_local;
+          double shiftx = atom->x[j][0] - atom->x[j_local][0];
+          double shifty = atom->x[j][1] - atom->x[j_local][1];
+          double shiftz = atom->x[j][2] - atom->x[j_local][2];
+          double shiftxs = std::round(domain->h_inv[0]*shiftx + domain->h_inv[5]*shifty + domain->h_inv[4]*shiftz);
+          double shiftys = std::round(domain->h_inv[1]*shifty + domain->h_inv[3]*shiftz);
+          double shiftzs = std::round(domain->h_inv[2]*shiftz);
+          unit_shifts[k][0] = shiftxs;
+          unit_shifts[k][1] = shiftys;
+          unit_shifts[k][2] = shiftzs;
+          shifts[k][0] = domain->h[0]*shiftxs + domain->h[5]*shiftys + domain->h[4]*shiftzs;
+          shifts[k][1] = domain->h[1]*shiftys + domain->h[3]*shiftzs;
+          shifts[k][2] = domain->h[2]*shiftzs;
+        }
         k++;
       }
     }
   }
 
-  //
-
+  // ----- node_attrs -----
+  // node_attrs is one-hot encoding for atomic numbers
   auto mace_type = [this](int lammps_type) {
     for (int i=0; i<mace_atomic_numbers.size(); ++i) {
       if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
@@ -133,29 +175,27 @@ void PairMACE::compute(int eflag, int vflag)
     error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
     return -1;
   };
-
-  // node_attrs is one-hot encoding for atomic numbers
   int n_node_feats = mace_atomic_numbers.size();
-  auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch::dtype(torch::kFloat64));
-  for (int ii = 0; ii < n_nodes; ii++) {
+  auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch_float_dtype);
+  #pragma omp parallel for
+  for (int ii=0; ii<n_nodes; ++ii) {
     int i = list->ilist[ii];
     node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
   }
 
   // ----- mask for ghost -----
   auto mask = torch::zeros(n_nodes, torch::dtype(torch::kBool));
-  for (int ii = 0; ii < list->inum; ii++) {
+  #pragma omp parallel for
+  for (int ii=0; ii<atom->nlocal; ++ii) {
     int i = list->ilist[ii];
     mask[i] = true;
   }
 
   auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
-  auto energy = torch::empty({1}, torch::dtype(torch::kFloat64));
-  auto forces = torch::empty({n_nodes,3}, torch::dtype(torch::kFloat64));
+  auto energy = torch::empty({1}, torch_float_dtype);
+  auto forces = torch::empty({n_nodes,3}, torch_float_dtype);
   auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));
-  auto shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64));
-  auto unit_shifts = torch::zeros({n_edges,3}, torch::dtype(torch::kFloat64));
-  auto weight = torch::empty({1}, torch::dtype(torch::kFloat64));
+  auto weight = torch::empty({1}, torch_float_dtype);
   ptr[0] = 0;
   ptr[1] = n_nodes;
   weight[0] = 1.0;
@@ -175,14 +215,13 @@ void PairMACE::compute(int eflag, int vflag)
   input.insert("weight", weight);
   auto output = model.forward({input, mask, true, true, false}).toGenericDict();
 
-  auto stress = output.at("stress").toTensor();
-
   // mace energy
   //   -> sum of site energies of local atoms
   if (eflag_global) {
     auto node_energy = output.at("node_energy").toTensor();
     eng_vdwl = 0.0;
-    for (int ii = 0; ii < list->inum; ii++) {
+    #pragma omp parallel for reduction(+:eng_vdwl)
+    for (int ii=0; ii<list->inum; ++ii) {
       int i = list->ilist[ii];
       eng_vdwl += node_energy[i].item<double>();
     }
@@ -191,7 +230,8 @@ void PairMACE::compute(int eflag, int vflag)
   // mace forces
   //   -> derivatives of total mace energy
   forces = output.at("forces").toTensor();
-  for (int ii = 0; ii < list->inum; ii++) {
+  #pragma omp parallel for
+  for (int ii=0; ii<list->inum; ++ii) {
     int i = list->ilist[ii];
     atom->f[i][0] = forces[i][0].item<double>();
     atom->f[i][1] = forces[i][1].item<double>();
@@ -202,7 +242,8 @@ void PairMACE::compute(int eflag, int vflag)
   //   -> local atoms only
   if (eflag_atom) {
     auto node_energy = output.at("node_energy").toTensor();
-    for (int ii = 0; ii < list->inum; ii++) {
+    #pragma omp parallel for
+    for (int ii=0; ii<list->inum; ++ii) {
       int i = list->ilist[ii];
       eatom[i] = node_energy[i].item<double>();
     }
@@ -232,6 +273,15 @@ void PairMACE::compute(int eflag, int vflag)
 
 void PairMACE::settings(int narg, char **arg)
 {
+  if (narg == 1) {
+    if (strcmp(arg[0], "no_domain_decomposition") == 0) {
+      domain_decomposition = false;
+    } else {
+      error->all(FLERR, "Invalid option for pair_style mace.");
+    }
+  } else if (narg > 1) {
+    error->all(FLERR, "Too many pair_style arguments for pair_style mace.");
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -246,7 +296,22 @@ void PairMACE::coeff(int narg, char **arg)
   model = torch::jit::load(arg[2]);
   std::cout << " finished." << std::endl;
 
+  // extract default dtype from mace model
+  for (auto p: model.named_attributes()) {
+      // this is a somewhat random choice of variable to check. could it be improved?
+      if (p.name == "model.node_embedding.linear.weight") {
+          if (p.value.toTensor().dtype() == caffe2::TypeMeta::Make<float>()) {
+            torch_float_dtype = torch::kFloat32;
+          } else if (p.value.toTensor().dtype() == caffe2::TypeMeta::Make<double>()) {
+            torch_float_dtype = torch::kFloat64;
+          }
+      }
+  }
+  std::cout << "  - The torch_float_dtype is: " << torch_float_dtype << std::endl;
+
+  // extract r_max from mace model
   r_max = model.attr("r_max").toTensor().item<double>();
+  r_max_squared = r_max*r_max;
   std::cout << "  - The r_max is: " << r_max << "." << std::endl;
   num_interactions = model.attr("num_interactions").toTensor().item<int64_t>();
   std::cout << "  - The model has: " << num_interactions << " layers." << std::endl;
@@ -285,13 +350,18 @@ void PairMACE::init_style()
            list->gnum == atom->nghost
            list->ilist includes ghost atoms
   */
-  neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
+  if (domain_decomposition) {
+    neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
+  } else {
+    neighbor->add_request(this, NeighConst::REQ_FULL);
+  }
 }
 
 double PairMACE::init_one(int i, int j)
 {
   // to account for message passing, require cutoff of n_layers * r_max
-  return num_interactions*model.attr("r_max").toTensor().item<double>();
+//  return num_interactions*model.attr("r_max").toTensor().item<double>();
+  return (1+num_interactions)*model.attr("r_max").toTensor().item<double>();
 }
 
 void PairMACE::allocate()
diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
index 1ce687cc274..91aafc663b5 100644
--- a/src/ML-MACE/pair_mace.h
+++ b/src/ML-MACE/pair_mace.h
@@ -46,8 +46,11 @@ class PairMACE : public Pair {
 
  protected:
 
+  bool domain_decomposition = true;
   torch::jit::script::Module model;
+  torch::ScalarType torch_float_dtype;
   double r_max;
+  double r_max_squared;
   int64_t num_interactions;
   std::vector<int64_t> mace_atomic_numbers;
   std::vector<int64_t> lammps_atomic_numbers;

From d60bd8ab259d1efa65b453455fd0d758a2677bc9 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Thu, 12 Jan 2023 09:05:55 +0000
Subject: [PATCH 19/51] restore approach for determining effective interaction.

---
 src/ML-MACE/pair_mace.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 3d813c09d65..4346a2336d7 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -360,8 +360,7 @@ void PairMACE::init_style()
 double PairMACE::init_one(int i, int j)
 {
   // to account for message passing, require cutoff of n_layers * r_max
-//  return num_interactions*model.attr("r_max").toTensor().item<double>();
-  return (1+num_interactions)*model.attr("r_max").toTensor().item<double>();
+  return num_interactions*model.attr("r_max").toTensor().item<double>();
 }
 
 void PairMACE::allocate()

From 6fe7bd8800a92ee0cc56d3dbb1dc43f90fb1134f Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Thu, 23 Feb 2023 21:15:33 +0000
Subject: [PATCH 20/51] gpu work.

---
 cmake/Modules/Packages/ML-MACE.cmake |   3 +-
 src/ML-MACE/pair_mace.cpp            | 143 ++++++++++++++++++++++++---
 src/ML-MACE/pair_mace.h              |   3 +
 3 files changed, 135 insertions(+), 14 deletions(-)

diff --git a/cmake/Modules/Packages/ML-MACE.cmake b/cmake/Modules/Packages/ML-MACE.cmake
index 984298c75fd..c22a971c225 100644
--- a/cmake/Modules/Packages/ML-MACE.cmake
+++ b/cmake/Modules/Packages/ML-MACE.cmake
@@ -1,10 +1,9 @@
 cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
 
-message("Hello from ML-MACE.cmake.")
-
 find_package(Torch REQUIRED)
 
 #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 #target_include_directories(lammps PRIVATE "${TORCH_INCLUDE_DIRS}")
 target_link_libraries(lammps PRIVATE "${TORCH_LIBRARIES}")
 set_property(TARGET lammps PROPERTY CXX_STANDARD 14)
+add_compile_definitions(MACE_DEBUG)
diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 4346a2336d7..d37abc5facd 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -30,6 +30,10 @@
 #include <iostream>
 #include <stdexcept>
 
+#ifdef MACE_DEBUG
+#include <chrono>
+#endif
+
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
@@ -51,6 +55,16 @@ void PairMACE::compute(int eflag, int vflag)
 {
   ev_init(eflag, vflag);
 
+std::cout << " " << std::endl;
+
+  #ifdef MACE_DEBUG
+  std::chrono::time_point<std::chrono::high_resolution_clock> t0, t1;
+  #endif
+
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
+
   if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
   if (domain_decomposition) {
     if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
@@ -88,6 +102,15 @@ void PairMACE::compute(int eflag, int vflag)
   cell[2][1] = domain->h[3];
   cell[2][2] = domain->h[2];
 
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "positions+cell: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
+
   // ----- edge_index and unit_shifts -----
   // count total number of edges
   int n_edges = 0;
@@ -119,6 +142,13 @@ void PairMACE::compute(int eflag, int vflag)
   for (int ii=0; ii<n_nodes-1; ++ii) {
     first_edge[ii+1] = first_edge[ii] + n_edges_vec[ii];
   }
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "edge count: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
   // fill edge_index and unit_shifts tensors
   auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
   auto unit_shifts = torch::zeros({n_edges,3}, torch_float_dtype);
@@ -163,6 +193,14 @@ void PairMACE::compute(int eflag, int vflag)
       }
     }
   }
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "edge fill: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
 
   // ----- node_attrs -----
   // node_attrs is one-hot encoding for atomic numbers
@@ -183,6 +221,15 @@ void PairMACE::compute(int eflag, int vflag)
     node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
   }
 
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "node attrs: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
+
   // ----- mask for ghost -----
   auto mask = torch::zeros(n_nodes, torch::dtype(torch::kBool));
   #pragma omp parallel for
@@ -200,7 +247,35 @@ void PairMACE::compute(int eflag, int vflag)
   ptr[1] = n_nodes;
   weight[0] = 1.0;
 
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "other setup: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+
+  // transfer data to device
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
+  batch = batch.to(device);
+  cell = cell.to(device);
+  edge_index = edge_index.to(device);
+  energy = energy.to(device);
+  forces = forces.to(device);
+  node_attrs = node_attrs.to(device);
+  positions = positions.to(device);
+  ptr = ptr.to(device);
+  shifts = shifts.to(device);
+  unit_shifts = unit_shifts.to(device);
+  weight = weight.to(device);
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "transfer: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+
   // pack the input, call the model, extract the output
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
   c10::Dict<std::string, torch::Tensor> input;
   input.insert("batch", batch);
   input.insert("cell", cell);
@@ -213,12 +288,38 @@ void PairMACE::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "pack: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
+  auto output = model.forward({input, mask.to(device), true, true, false}).toGenericDict();
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "model: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
+  auto node_energy = output.at("node_energy").toTensor().cpu();
+  forces = output.at("forces").toTensor().cpu();
+  auto vir = output.at("virials").toTensor().cpu();
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "transfer back: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+
+  #ifdef MACE_DEBUG
+  t0 = std::chrono::high_resolution_clock::now();
+  #endif
 
   // mace energy
   //   -> sum of site energies of local atoms
   if (eflag_global) {
-    auto node_energy = output.at("node_energy").toTensor();
     eng_vdwl = 0.0;
     #pragma omp parallel for reduction(+:eng_vdwl)
     for (int ii=0; ii<list->inum; ++ii) {
@@ -229,7 +330,6 @@ void PairMACE::compute(int eflag, int vflag)
 
   // mace forces
   //   -> derivatives of total mace energy
-  forces = output.at("forces").toTensor();
   #pragma omp parallel for
   for (int ii=0; ii<list->inum; ++ii) {
     int i = list->ilist[ii];
@@ -241,7 +341,6 @@ void PairMACE::compute(int eflag, int vflag)
   // mace site energies
   //   -> local atoms only
   if (eflag_atom) {
-    auto node_energy = output.at("node_energy").toTensor();
     #pragma omp parallel for
     for (int ii=0; ii<list->inum; ++ii) {
       int i = list->ilist[ii];
@@ -252,7 +351,6 @@ void PairMACE::compute(int eflag, int vflag)
   // mace virials (local atoms only)
   //   -> derivatives of sum of site energies of local atoms
   if (vflag_global) {
-    auto vir = output.at("virials").toTensor();
     virial[0] = vir[0][0][0].item<double>();
     virial[1] = vir[0][1][1].item<double>();
     virial[2] = vir[0][2][2].item<double>();
@@ -267,20 +365,32 @@ void PairMACE::compute(int eflag, int vflag)
     error->all(FLERR, "ERROR: pair_mace does not support vflag_atom.");
   }
 
+  #ifdef MACE_DEBUG
+  t1 = std::chrono::high_resolution_clock::now();
+  std::cout << "unpack: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
+  #endif
+
 }
 
 /* ---------------------------------------------------------------------- */
 
 void PairMACE::settings(int narg, char **arg)
 {
-  if (narg == 1) {
-    if (strcmp(arg[0], "no_domain_decomposition") == 0) {
+  if (narg > 2) {
+    error->all(FLERR, "Too many pair_style arguments for pair_style mace.");
+  }
+
+  if (narg >= 1) {
+    if (strcmp(arg[0], "gpu") == 0) {
+      device_type = "gpu";
+    }
+  }
+
+  if (narg >= 2) {
+    if (strcmp(arg[1], "no_domain_decomposition") == 0) {
       domain_decomposition = false;
-    } else {
-      error->all(FLERR, "Invalid option for pair_style mace.");
+      // TODO: add check against MPI rank
     }
-  } else if (narg > 1) {
-    error->all(FLERR, "Too many pair_style arguments for pair_style mace.");
   }
 }
 
@@ -292,8 +402,17 @@ void PairMACE::coeff(int narg, char **arg)
 
   if (!allocated) allocate();
 
+  if (device_type == "cpu") {
+    device = c10::Device(torch::kCPU);
+  } else {
+    int rank;
+    MPI_Comm_rank(world, &rank);
+    std::cout << "MPI rank: " << rank << std::endl;
+    device = c10::Device(torch::kCUDA,rank);
+  }
+
   std::cout << "Loading MACE model from \"" << arg[2] << "\" ...";
-  model = torch::jit::load(arg[2]);
+  model = torch::jit::load(arg[2], device);
   std::cout << " finished." << std::endl;
 
   // extract default dtype from mace model
diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
index 91aafc663b5..9616d18286e 100644
--- a/src/ML-MACE/pair_mace.h
+++ b/src/ML-MACE/pair_mace.h
@@ -27,6 +27,7 @@ PairStyle(mace,PairMACE);
 
 #include "pair.h"
 
+#include <torch/torch.h>
 #include <torch/script.h>
 
 namespace LAMMPS_NS {
@@ -46,7 +47,9 @@ class PairMACE : public Pair {
 
  protected:
 
+  std::string device_type = "cpu";
   bool domain_decomposition = true;
+  torch::Device device = torch::kCPU;
   torch::jit::script::Module model;
   torch::ScalarType torch_float_dtype;
   double r_max;

From 0246fad8327658b388069d7ab5af9d90fb418990 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Thu, 23 Feb 2023 22:58:10 +0000
Subject: [PATCH 21/51] first steps.

---
 cmake/Modules/Packages/ML-MACE.cmake |   3 +-
 src/ML-MACE/pair_mace_kokkos.cpp     | 384 +++++++++++++++++++++++++++
 src/ML-MACE/pair_mace_kokkos.h       |  78 ++++++
 3 files changed, 463 insertions(+), 2 deletions(-)
 create mode 100644 src/ML-MACE/pair_mace_kokkos.cpp
 create mode 100644 src/ML-MACE/pair_mace_kokkos.h

diff --git a/cmake/Modules/Packages/ML-MACE.cmake b/cmake/Modules/Packages/ML-MACE.cmake
index 984298c75fd..4646648e370 100644
--- a/cmake/Modules/Packages/ML-MACE.cmake
+++ b/cmake/Modules/Packages/ML-MACE.cmake
@@ -3,8 +3,7 @@ cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
 message("Hello from ML-MACE.cmake.")
 
 find_package(Torch REQUIRED)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 
-#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
-#target_include_directories(lammps PRIVATE "${TORCH_INCLUDE_DIRS}")
 target_link_libraries(lammps PRIVATE "${TORCH_LIBRARIES}")
 set_property(TARGET lammps PROPERTY CXX_STANDARD 14)
diff --git a/src/ML-MACE/pair_mace_kokkos.cpp b/src/ML-MACE/pair_mace_kokkos.cpp
new file mode 100644
index 00000000000..b52a48eed78
--- /dev/null
+++ b/src/ML-MACE/pair_mace_kokkos.cpp
@@ -0,0 +1,384 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributors
+      William C Witt (University of Cambridge)
+------------------------------------------------------------------------- */
+
+#include "pair_mace_kokkos.h"
+
+#include "atom.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "neigh_list.h"
+#include "neighbor.h"
+
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMACEKokkos<DeviceType>::PairMACEKokkos(LAMMPS *lmp) : PairMACE(lmp)
+{
+  no_virial_fdotr_compute = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMACEKokkos<DeviceType>::~PairMACEKokkos()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
+{
+  ev_init(eflag, vflag);
+
+  if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
+  if (domain_decomposition) {
+    if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
+  }
+
+  // ----- positions -----
+  int n_nodes;
+  if (domain_decomposition) {
+    n_nodes = atom->nlocal + atom->nghost;
+  } else {
+    // normally, ghost atoms are included in the graph as independent
+    // nodes, as required when the local domain does not have PBC.
+    // however, in no_domain_decomposition mode, ghost atoms are known to
+    // be shifted versions of local atoms.
+    n_nodes = atom->nlocal;
+  }
+  auto positions = torch::empty({n_nodes,3}, torch_float_dtype);
+  #pragma omp parallel for
+  for (int ii=0; ii<n_nodes; ++ii) {
+    int i = list->ilist[ii];
+    positions[i][0] = atom->x[i][0];
+    positions[i][1] = atom->x[i][1];
+    positions[i][2] = atom->x[i][2];
+  }
+
+  // ----- cell -----
+  auto cell = torch::zeros({3,3}, torch_float_dtype);
+  cell[0][0] = domain->h[0];
+  cell[0][1] = 0.0;
+  cell[0][2] = 0.0;
+  cell[1][0] = domain->h[5];
+  cell[1][1] = domain->h[1];
+  cell[1][2] = 0.0;
+  cell[2][0] = domain->h[4];
+  cell[2][1] = domain->h[3];
+  cell[2][2] = domain->h[2];
+
+  // ----- edge_index and unit_shifts -----
+  // count total number of edges
+  int n_edges = 0;
+  std::vector<int> n_edges_vec(n_nodes, 0);
+  #pragma omp parallel for reduction(+:n_edges)
+  for (int ii=0; ii<n_nodes; ++ii) {
+    int i = list->ilist[ii];
+    double xtmp = atom->x[i][0];
+    double ytmp = atom->x[i][1];
+    double ztmp = atom->x[i][2];
+    int *jlist = list->firstneigh[i];
+    int jnum = list->numneigh[i];
+    for (int jj=0; jj<jnum; ++jj) {
+      int j = jlist[jj];
+      j &= NEIGHMASK;
+      double delx = xtmp - atom->x[j][0];
+      double dely = ytmp - atom->x[j][1];
+      double delz = ztmp - atom->x[j][2];
+      double rsq = delx * delx + dely * dely + delz * delz;
+      if (rsq < r_max_squared) {
+        n_edges += 1;
+        n_edges_vec[ii] += 1;
+      }
+    }
+  }
+  // make first_edge vector to help with parallelizing following loop
+  std::vector<int> first_edge(n_nodes);
+  first_edge[0] = 0;
+  for (int ii=0; ii<n_nodes-1; ++ii) {
+    first_edge[ii+1] = first_edge[ii] + n_edges_vec[ii];
+  }
+  // fill edge_index and unit_shifts tensors
+  auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
+  auto unit_shifts = torch::zeros({n_edges,3}, torch_float_dtype);
+  auto shifts = torch::zeros({n_edges,3}, torch_float_dtype);
+  #pragma omp parallel for
+  for (int ii=0; ii<n_nodes; ++ii) {
+    int i = list->ilist[ii];
+    double xtmp = atom->x[i][0];
+    double ytmp = atom->x[i][1];
+    double ztmp = atom->x[i][2];
+    int *jlist = list->firstneigh[i];
+    int jnum = list->numneigh[i];
+    int k = first_edge[ii];
+    for (int jj=0; jj<jnum; ++jj) {
+      int j = jlist[jj];
+      j &= NEIGHMASK;
+      double delx = xtmp - atom->x[j][0];
+      double dely = ytmp - atom->x[j][1];
+      double delz = ztmp - atom->x[j][2];
+      double rsq = delx * delx + dely * dely + delz * delz;
+      if (rsq < r_max_squared) {
+        edge_index[0][k] = i;
+        if (domain_decomposition) {
+          edge_index[1][k] = j;
+        } else {
+          int j_local = atom->map(atom->tag[j]);
+          edge_index[1][k] = j_local;
+          double shiftx = atom->x[j][0] - atom->x[j_local][0];
+          double shifty = atom->x[j][1] - atom->x[j_local][1];
+          double shiftz = atom->x[j][2] - atom->x[j_local][2];
+          double shiftxs = std::round(domain->h_inv[0]*shiftx + domain->h_inv[5]*shifty + domain->h_inv[4]*shiftz);
+          double shiftys = std::round(domain->h_inv[1]*shifty + domain->h_inv[3]*shiftz);
+          double shiftzs = std::round(domain->h_inv[2]*shiftz);
+          unit_shifts[k][0] = shiftxs;
+          unit_shifts[k][1] = shiftys;
+          unit_shifts[k][2] = shiftzs;
+          shifts[k][0] = domain->h[0]*shiftxs + domain->h[5]*shiftys + domain->h[4]*shiftzs;
+          shifts[k][1] = domain->h[1]*shiftys + domain->h[3]*shiftzs;
+          shifts[k][2] = domain->h[2]*shiftzs;
+        }
+        k++;
+      }
+    }
+  }
+
+  // ----- node_attrs -----
+  // node_attrs is one-hot encoding for atomic numbers
+  auto mace_type = [this](int lammps_type) {
+    for (int i=0; i<mace_atomic_numbers.size(); ++i) {
+      if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
+        return i+1;
+      }
+    }
+    error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
+    return -1;
+  };
+  int n_node_feats = mace_atomic_numbers.size();
+  auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch_float_dtype);
+  #pragma omp parallel for
+  for (int ii=0; ii<n_nodes; ++ii) {
+    int i = list->ilist[ii];
+    node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
+  }
+
+  // ----- mask for ghost -----
+  auto mask = torch::zeros(n_nodes, torch::dtype(torch::kBool));
+  #pragma omp parallel for
+  for (int ii=0; ii<atom->nlocal; ++ii) {
+    int i = list->ilist[ii];
+    mask[i] = true;
+  }
+
+  auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
+  auto energy = torch::empty({1}, torch_float_dtype);
+  auto forces = torch::empty({n_nodes,3}, torch_float_dtype);
+  auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));
+  auto weight = torch::empty({1}, torch_float_dtype);
+  ptr[0] = 0;
+  ptr[1] = n_nodes;
+  weight[0] = 1.0;
+
+  // pack the input, call the model, extract the output
+  c10::Dict<std::string, torch::Tensor> input;
+  input.insert("batch", batch);
+  input.insert("cell", cell);
+  input.insert("edge_index", edge_index);
+  input.insert("energy", energy);
+  input.insert("forces", forces);
+  input.insert("node_attrs", node_attrs);
+  input.insert("positions", positions);
+  input.insert("ptr", ptr);
+  input.insert("shifts", shifts);
+  input.insert("unit_shifts", unit_shifts);
+  input.insert("weight", weight);
+  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
+
+  // mace energy
+  //   -> sum of site energies of local atoms
+  if (eflag_global) {
+    auto node_energy = output.at("node_energy").toTensor();
+    eng_vdwl = 0.0;
+    #pragma omp parallel for reduction(+:eng_vdwl)
+    for (int ii=0; ii<list->inum; ++ii) {
+      int i = list->ilist[ii];
+      eng_vdwl += node_energy[i].item<double>();
+    }
+  }
+
+  // mace forces
+  //   -> derivatives of total mace energy
+  forces = output.at("forces").toTensor();
+  #pragma omp parallel for
+  for (int ii=0; ii<list->inum; ++ii) {
+    int i = list->ilist[ii];
+    atom->f[i][0] = forces[i][0].item<double>();
+    atom->f[i][1] = forces[i][1].item<double>();
+    atom->f[i][2] = forces[i][2].item<double>();
+  }
+
+  // mace site energies
+  //   -> local atoms only
+  if (eflag_atom) {
+    auto node_energy = output.at("node_energy").toTensor();
+    #pragma omp parallel for
+    for (int ii=0; ii<list->inum; ++ii) {
+      int i = list->ilist[ii];
+      eatom[i] = node_energy[i].item<double>();
+    }
+  }
+
+  // mace virials (local atoms only)
+  //   -> derivatives of sum of site energies of local atoms
+  if (vflag_global) {
+    auto vir = output.at("virials").toTensor();
+    virial[0] = vir[0][0][0].item<double>();
+    virial[1] = vir[0][1][1].item<double>();
+    virial[2] = vir[0][2][2].item<double>();
+    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
+    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
+  }
+
+  // mace site virials
+  //   -> not available
+  if (vflag_atom) {
+    error->all(FLERR, "ERROR: pair_mace does not support vflag_atom.");
+  }
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::settings(int narg, char **arg)
+{
+  if (narg == 1) {
+    if (strcmp(arg[0], "no_domain_decomposition") == 0) {
+      domain_decomposition = false;
+    } else {
+      error->all(FLERR, "Invalid option for pair_style mace.");
+    }
+  } else if (narg > 1) {
+    error->all(FLERR, "Too many pair_style arguments for pair_style mace.");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  // TODO: remove print statements from this routine, or have a single proc print
+
+  if (!allocated) allocate();
+
+  std::cout << "Loading MACEKokkos model from \"" << arg[2] << "\" ...";
+  model = torch::jit::load(arg[2]);
+  std::cout << " finished." << std::endl;
+
+  // extract default dtype from mace model
+  for (auto p: model.named_attributes()) {
+      // this is a somewhat random choice of variable to check. could it be improved?
+      if (p.name == "model.node_embedding.linear.weight") {
+          if (p.value.toTensor().dtype() == caffe2::TypeMeta::Make<float>()) {
+            torch_float_dtype = torch::kFloat32;
+          } else if (p.value.toTensor().dtype() == caffe2::TypeMeta::Make<double>()) {
+            torch_float_dtype = torch::kFloat64;
+          }
+      }
+  }
+  std::cout << "  - The torch_float_dtype is: " << torch_float_dtype << std::endl;
+
+  // extract r_max from mace model
+  r_max = model.attr("r_max").toTensor().item<double>();
+  r_max_squared = r_max*r_max;
+  std::cout << "  - The r_max is: " << r_max << "." << std::endl;
+  num_interactions = model.attr("num_interactions").toTensor().item<int64_t>();
+  std::cout << "  - The model has: " << num_interactions << " layers." << std::endl;
+
+  // extract atomic numbers from mace model
+  auto a_n = model.attr("atomic_numbers").toTensor();
+  for (int i=0; i<a_n.size(0); ++i) {
+    mace_atomic_numbers.push_back(a_n[i].item<int64_t>());
+  }
+  std::cout << "  - The model atomic numbers are: " << mace_atomic_numbers << "." << std::endl;
+
+  // extract atomic numbers from pair_coeff
+  for (int i=3; i<narg; ++i) {
+    auto iter = std::find(periodic_table.begin(), periodic_table.end(), arg[i]);
+    int index = std::distance(periodic_table.begin(), iter) + 1;
+    lammps_atomic_numbers.push_back(index);
+  }
+  std::cout << "  - The pair_coeff atomic numbers are: " << lammps_atomic_numbers << "." << std::endl;
+
+  for (int i=1; i<atom->ntypes+1; i++)
+    for (int j=i; j<atom->ntypes+1; j++)
+      setflag[i][j] = 1;
+}
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::init_style()
+{
+  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
+
+  /*
+    MACE requires the full neighbor list AND neighbors of ghost atoms
+    it appears that:
+      * without REQ_GHOST
+           list->gnum == 0
+           list->ilist does not include ghost atoms, but the jlists do
+      * with REQ_GHOST
+           list->gnum == atom->nghost
+           list->ilist includes ghost atoms
+  */
+  if (domain_decomposition) {
+    neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
+  } else {
+    neighbor->add_request(this, NeighConst::REQ_FULL);
+  }
+}
+
+template<class DeviceType>
+double PairMACEKokkos<DeviceType>::init_one(int i, int j)
+{
+  // to account for message passing, require cutoff of n_layers * r_max
+  return num_interactions*model.attr("r_max").toTensor().item<double>();
+}
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::allocate()
+{
+  allocated = 1;
+
+  memory->create(setflag, atom->ntypes+1, atom->ntypes+1, "pair:setflag");
+  for (int i=1; i<atom->ntypes+1; i++)
+    for (int j=i; j<atom->ntypes+1; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cutsq, atom->ntypes+1, atom->ntypes+1, "pair:cutsq");
+}
diff --git a/src/ML-MACE/pair_mace_kokkos.h b/src/ML-MACE/pair_mace_kokkos.h
new file mode 100644
index 00000000000..1e269c2ce0f
--- /dev/null
+++ b/src/ML-MACE/pair_mace_kokkos.h
@@ -0,0 +1,78 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributors
+      William C Witt (University of Cambridge)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(mace/kk,PairMACEKokkos<LMPDeviceType>);
+PairStyle(mace/kk/device,PairLJCutKokkos<LMPDeviceType>);
+PairStyle(mace/kk/host,PairLJCutKokkos<LMPHostType>);
+// clang-format on
+#else
+
+#ifndef LMP_PAIR_MACE_KOKKOS_H
+#define LMP_PAIR_MACE_KOKKOS_H
+
+#include "pair_kokkos.h"
+#include "pair_mace.h"
+#include "neigh_list_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class PairMACEKokkos : public PairMACE {
+
+ public:
+
+  //enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF};
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  PairMACEKokkos(class LAMMPS *);
+  ~PairMACEKokkos() override;
+  void compute(int, int) override;
+  void settings(int, char **) override;
+  void coeff(int, char **) override;
+  void init_style() override;
+  double init_one(int, int) override;
+  void allocate();
+
+ protected:
+
+  bool domain_decomposition = true;
+  torch::jit::script::Module model;
+  torch::ScalarType torch_float_dtype;
+  double r_max;
+  double r_max_squared;
+  int64_t num_interactions;
+  std::vector<int64_t> mace_atomic_numbers;
+  std::vector<int64_t> lammps_atomic_numbers;
+  const std::array<std::string,118> periodic_table =
+    { "H", "He",
+     "Li", "Be",                                                              "B",  "C",  "N",  "O",  "F", "Ne",
+     "Na", "Mg",                                                             "Al", "Si",  "P",  "S", "Cl", "Ar",
+     "K",  "Ca", "Sc", "Ti",  "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
+     "Rb", "Sr",  "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te",  "I", "Xe",
+     "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
+                       "Hf", "Ta",  "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn",
+     "Fr", "Ra", "Ac", "Th", "Pa",  "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
+                       "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"};
+
+};
+}    // namespace LAMMPS_NS
+
+#endif
+#endif

From 4ee087496324a8ce320459dc0bdf58b8e5e74ab0 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 27 Feb 2023 20:51:24 +0000
Subject: [PATCH 22/51] fix virial ordering as diagnosed by @imagdau.

---
 src/ML-MACE/pair_mace.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 4346a2336d7..7fe0f1451b6 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -256,9 +256,9 @@ void PairMACE::compute(int eflag, int vflag)
     virial[0] = vir[0][0][0].item<double>();
     virial[1] = vir[0][1][1].item<double>();
     virial[2] = vir[0][2][2].item<double>();
-    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+    virial[3] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
     virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
-    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
+    virial[5] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
   }
 
   // mace site virials

From 5290e1126b3f98203dd2eafc21ac6654a60f16de Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 28 Feb 2023 18:42:10 +0000
Subject: [PATCH 23/51] wip.

---
 src/ML-MACE/pair_mace_kokkos.cpp | 611 +++++++++++++++++++------------
 src/ML-MACE/pair_mace_kokkos.h   |   4 +-
 2 files changed, 370 insertions(+), 245 deletions(-)

diff --git a/src/ML-MACE/pair_mace_kokkos.cpp b/src/ML-MACE/pair_mace_kokkos.cpp
index b52a48eed78..55ef528330b 100644
--- a/src/ML-MACE/pair_mace_kokkos.cpp
+++ b/src/ML-MACE/pair_mace_kokkos.cpp
@@ -18,13 +18,17 @@
 
 #include "pair_mace_kokkos.h"
 
-#include "atom.h"
-#include "domain.h"
+// TODO: do i need all these?
+#include "atom_kokkos.h"
+#include "atom_masks.h"
 #include "error.h"
 #include "force.h"
-#include "memory.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
 #include "neigh_list.h"
+#include "neigh_request.h"
 #include "neighbor.h"
+#include "update.h"
 
 #include <algorithm>
 #include <iostream>
@@ -38,6 +42,13 @@ template<class DeviceType>
 PairMACEKokkos<DeviceType>::PairMACEKokkos(LAMMPS *lmp) : PairMACE(lmp)
 {
   no_virial_fdotr_compute = 1;
+
+  // TODO: from lj_cut ... possibly delete
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
+  //datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -45,6 +56,15 @@ PairMACEKokkos<DeviceType>::PairMACEKokkos(LAMMPS *lmp) : PairMACE(lmp)
 template<class DeviceType>
 PairMACEKokkos<DeviceType>::~PairMACEKokkos()
 {
+  // TODO: from lj_cut, possibly delete
+  if (copymode) return;
+
+  // TODO: from lj_cut, possibly delete
+  //if (allocated) {
+  //  memoryKK->destroy_kokkos(k_eatom,eatom);
+  //  memoryKK->destroy_kokkos(k_vatom,vatom);
+  //  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  //}
 }
 
 /* ---------------------------------------------------------------------- */
@@ -52,223 +72,287 @@ PairMACEKokkos<DeviceType>::~PairMACEKokkos()
 template<class DeviceType>
 void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 {
-  ev_init(eflag, vflag);
-
-  if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
-  if (domain_decomposition) {
-    if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
-  }
-
-  // ----- positions -----
-  int n_nodes;
-  if (domain_decomposition) {
-    n_nodes = atom->nlocal + atom->nghost;
-  } else {
-    // normally, ghost atoms are included in the graph as independent
-    // nodes, as required when the local domain does not have PBC.
-    // however, in no_domain_decomposition mode, ghost atoms are known to
-    // be shifted versions of local atoms.
-    n_nodes = atom->nlocal;
-  }
-  auto positions = torch::empty({n_nodes,3}, torch_float_dtype);
-  #pragma omp parallel for
-  for (int ii=0; ii<n_nodes; ++ii) {
-    int i = list->ilist[ii];
-    positions[i][0] = atom->x[i][0];
-    positions[i][1] = atom->x[i][1];
-    positions[i][2] = atom->x[i][2];
-  }
-
-  // ----- cell -----
-  auto cell = torch::zeros({3,3}, torch_float_dtype);
-  cell[0][0] = domain->h[0];
-  cell[0][1] = 0.0;
-  cell[0][2] = 0.0;
-  cell[1][0] = domain->h[5];
-  cell[1][1] = domain->h[1];
-  cell[1][2] = 0.0;
-  cell[2][0] = domain->h[4];
-  cell[2][1] = domain->h[3];
-  cell[2][2] = domain->h[2];
-
-  // ----- edge_index and unit_shifts -----
-  // count total number of edges
-  int n_edges = 0;
-  std::vector<int> n_edges_vec(n_nodes, 0);
-  #pragma omp parallel for reduction(+:n_edges)
-  for (int ii=0; ii<n_nodes; ++ii) {
-    int i = list->ilist[ii];
-    double xtmp = atom->x[i][0];
-    double ytmp = atom->x[i][1];
-    double ztmp = atom->x[i][2];
-    int *jlist = list->firstneigh[i];
-    int jnum = list->numneigh[i];
-    for (int jj=0; jj<jnum; ++jj) {
-      int j = jlist[jj];
-      j &= NEIGHMASK;
-      double delx = xtmp - atom->x[j][0];
-      double dely = ytmp - atom->x[j][1];
-      double delz = ztmp - atom->x[j][2];
-      double rsq = delx * delx + dely * dely + delz * delz;
-      if (rsq < r_max_squared) {
-        n_edges += 1;
-        n_edges_vec[ii] += 1;
-      }
-    }
-  }
-  // make first_edge vector to help with parallelizing following loop
-  std::vector<int> first_edge(n_nodes);
-  first_edge[0] = 0;
-  for (int ii=0; ii<n_nodes-1; ++ii) {
-    first_edge[ii+1] = first_edge[ii] + n_edges_vec[ii];
-  }
-  // fill edge_index and unit_shifts tensors
-  auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
-  auto unit_shifts = torch::zeros({n_edges,3}, torch_float_dtype);
-  auto shifts = torch::zeros({n_edges,3}, torch_float_dtype);
-  #pragma omp parallel for
-  for (int ii=0; ii<n_nodes; ++ii) {
-    int i = list->ilist[ii];
-    double xtmp = atom->x[i][0];
-    double ytmp = atom->x[i][1];
-    double ztmp = atom->x[i][2];
-    int *jlist = list->firstneigh[i];
-    int jnum = list->numneigh[i];
-    int k = first_edge[ii];
-    for (int jj=0; jj<jnum; ++jj) {
-      int j = jlist[jj];
-      j &= NEIGHMASK;
-      double delx = xtmp - atom->x[j][0];
-      double dely = ytmp - atom->x[j][1];
-      double delz = ztmp - atom->x[j][2];
-      double rsq = delx * delx + dely * dely + delz * delz;
-      if (rsq < r_max_squared) {
-        edge_index[0][k] = i;
-        if (domain_decomposition) {
-          edge_index[1][k] = j;
-        } else {
-          int j_local = atom->map(atom->tag[j]);
-          edge_index[1][k] = j_local;
-          double shiftx = atom->x[j][0] - atom->x[j_local][0];
-          double shifty = atom->x[j][1] - atom->x[j_local][1];
-          double shiftz = atom->x[j][2] - atom->x[j_local][2];
-          double shiftxs = std::round(domain->h_inv[0]*shiftx + domain->h_inv[5]*shifty + domain->h_inv[4]*shiftz);
-          double shiftys = std::round(domain->h_inv[1]*shifty + domain->h_inv[3]*shiftz);
-          double shiftzs = std::round(domain->h_inv[2]*shiftz);
-          unit_shifts[k][0] = shiftxs;
-          unit_shifts[k][1] = shiftys;
-          unit_shifts[k][2] = shiftzs;
-          shifts[k][0] = domain->h[0]*shiftxs + domain->h[5]*shiftys + domain->h[4]*shiftzs;
-          shifts[k][1] = domain->h[1]*shiftys + domain->h[3]*shiftzs;
-          shifts[k][2] = domain->h[2]*shiftzs;
-        }
-        k++;
-      }
-    }
-  }
+  ev_init(eflag,vflag,0);
 
-  // ----- node_attrs -----
-  // node_attrs is one-hot encoding for atomic numbers
-  auto mace_type = [this](int lammps_type) {
-    for (int i=0; i<mace_atomic_numbers.size(); ++i) {
-      if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
-        return i+1;
-      }
-    }
-    error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
-    return -1;
-  };
-  int n_node_feats = mace_atomic_numbers.size();
-  auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch_float_dtype);
-  #pragma omp parallel for
-  for (int ii=0; ii<n_nodes; ++ii) {
-    int i = list->ilist[ii];
-    node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
+  if (eflag_atom || vflag_atom) {
+    error->all(FLERR, "ERROR: kokkos eflag_atom not implemented.");
   }
 
-  // ----- mask for ghost -----
-  auto mask = torch::zeros(n_nodes, torch::dtype(torch::kBool));
-  #pragma omp parallel for
-  for (int ii=0; ii<atom->nlocal; ++ii) {
-    int i = list->ilist[ii];
-    mask[i] = true;
-  }
-
-  auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
-  auto energy = torch::empty({1}, torch_float_dtype);
-  auto forces = torch::empty({n_nodes,3}, torch_float_dtype);
-  auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));
-  auto weight = torch::empty({1}, torch_float_dtype);
-  ptr[0] = 0;
-  ptr[1] = n_nodes;
-  weight[0] = 1.0;
-
-  // pack the input, call the model, extract the output
-  c10::Dict<std::string, torch::Tensor> input;
-  input.insert("batch", batch);
-  input.insert("cell", cell);
-  input.insert("edge_index", edge_index);
-  input.insert("energy", energy);
-  input.insert("forces", forces);
-  input.insert("node_attrs", node_attrs);
-  input.insert("positions", positions);
-  input.insert("ptr", ptr);
-  input.insert("shifts", shifts);
-  input.insert("unit_shifts", unit_shifts);
-  input.insert("weight", weight);
-  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
-
-  // mace energy
-  //   -> sum of site energies of local atoms
-  if (eflag_global) {
-    auto node_energy = output.at("node_energy").toTensor();
-    eng_vdwl = 0.0;
-    #pragma omp parallel for reduction(+:eng_vdwl)
-    for (int ii=0; ii<list->inum; ++ii) {
-      int i = list->ilist[ii];
-      eng_vdwl += node_energy[i].item<double>();
-    }
-  }
-
-  // mace forces
-  //   -> derivatives of total mace energy
-  forces = output.at("forces").toTensor();
-  #pragma omp parallel for
-  for (int ii=0; ii<list->inum; ++ii) {
-    int i = list->ilist[ii];
-    atom->f[i][0] = forces[i][0].item<double>();
-    atom->f[i][1] = forces[i][1].item<double>();
-    atom->f[i][2] = forces[i][2].item<double>();
-  }
-
-  // mace site energies
-  //   -> local atoms only
-  if (eflag_atom) {
-    auto node_energy = output.at("node_energy").toTensor();
-    #pragma omp parallel for
-    for (int ii=0; ii<list->inum; ++ii) {
-      int i = list->ilist[ii];
-      eatom[i] = node_energy[i].item<double>();
-    }
-  }
-
-  // mace virials (local atoms only)
-  //   -> derivatives of sum of site energies of local atoms
-  if (vflag_global) {
-    auto vir = output.at("virials").toTensor();
-    virial[0] = vir[0][0][0].item<double>();
-    virial[1] = vir[0][1][1].item<double>();
-    virial[2] = vir[0][2][2].item<double>();
-    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
-    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
-    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
-  }
-
-  // mace site virials
-  //   -> not available
-  if (vflag_atom) {
-    error->all(FLERR, "ERROR: pair_mace does not support vflag_atom.");
-  }
+//  if (eflag_atom) {
+//    memoryKK->destroy_kokkos(k_eatom,eatom);
+//    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+//    d_eatom = k_eatom.view<DeviceType>();
+//  }
+//  if (vflag_atom) {
+//    memoryKK->destroy_kokkos(k_vatom,vatom);
+//    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
+//    d_vatom = k_vatom.view<DeviceType>();
+//  }
+
+  atomKK->sync(execution_space,datamask_read);
+  //k_cutsq.template sync<DeviceType>();
+  //k_params.template sync<DeviceType>();
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK);
+
+  x = atomKK->k_x.view<DeviceType>();
+  c_x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  nlocal = atom->nlocal;
+  nall = atom->nlocal + atom->nghost;
+  //newton_pair = force->newton_pair;
+  //special_lj[0] = force->special_lj[0];
+  //special_lj[1] = force->special_lj[1];
+  //special_lj[2] = force->special_lj[2];
+  //special_lj[3] = force->special_lj[3];
+
+  // loop over neighbors of my atoms
+
+  //copymode = 1;
+
+  //EV_FLOAT ev = pair_compute<PairLJCutKokkos<DeviceType>,void >(this,(NeighListKokkos<DeviceType>*)list);
+
+  //if (eflag_global) eng_vdwl += ev.evdwl;
+  //if (vflag_global) {
+  //  virial[0] += ev.v[0];
+  //  virial[1] += ev.v[1];
+  //  virial[2] += ev.v[2];
+  //  virial[3] += ev.v[3];
+  //  virial[4] += ev.v[4];
+  //  virial[5] += ev.v[5];
+  //}
+
+  //if (eflag_atom) {
+  //  k_eatom.template modify<DeviceType>();
+  //  k_eatom.template sync<LMPHostType>();
+  //}
+
+  //if (vflag_atom) {
+  //  k_vatom.template modify<DeviceType>();
+  //  k_vatom.template sync<LMPHostType>();
+  //}
+
+  copymode = 0;
+  
+
+//  ev_init(eflag, vflag);
+//
+//  if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
+//  if (domain_decomposition) {
+//    if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
+//  }
+//
+//  // ----- positions -----
+//  int n_nodes;
+//  if (domain_decomposition) {
+//    n_nodes = atom->nlocal + atom->nghost;
+//  } else {
+//    // normally, ghost atoms are included in the graph as independent
+//    // nodes, as required when the local domain does not have PBC.
+//    // however, in no_domain_decomposition mode, ghost atoms are known to
+//    // be shifted versions of local atoms.
+//    n_nodes = atom->nlocal;
+//  }
+//  auto positions = torch::empty({n_nodes,3}, torch_float_dtype);
+//  #pragma omp parallel for
+//  for (int ii=0; ii<n_nodes; ++ii) {
+//    int i = list->ilist[ii];
+//    positions[i][0] = atom->x[i][0];
+//    positions[i][1] = atom->x[i][1];
+//    positions[i][2] = atom->x[i][2];
+//  }
+//
+//  // ----- cell -----
+//  auto cell = torch::zeros({3,3}, torch_float_dtype);
+//  cell[0][0] = domain->h[0];
+//  cell[0][1] = 0.0;
+//  cell[0][2] = 0.0;
+//  cell[1][0] = domain->h[5];
+//  cell[1][1] = domain->h[1];
+//  cell[1][2] = 0.0;
+//  cell[2][0] = domain->h[4];
+//  cell[2][1] = domain->h[3];
+//  cell[2][2] = domain->h[2];
+//
+//  // ----- edge_index and unit_shifts -----
+//  // count total number of edges
+//  int n_edges = 0;
+//  std::vector<int> n_edges_vec(n_nodes, 0);
+//  #pragma omp parallel for reduction(+:n_edges)
+//  for (int ii=0; ii<n_nodes; ++ii) {
+//    int i = list->ilist[ii];
+//    double xtmp = atom->x[i][0];
+//    double ytmp = atom->x[i][1];
+//    double ztmp = atom->x[i][2];
+//    int *jlist = list->firstneigh[i];
+//    int jnum = list->numneigh[i];
+//    for (int jj=0; jj<jnum; ++jj) {
+//      int j = jlist[jj];
+//      j &= NEIGHMASK;
+//      double delx = xtmp - atom->x[j][0];
+//      double dely = ytmp - atom->x[j][1];
+//      double delz = ztmp - atom->x[j][2];
+//      double rsq = delx * delx + dely * dely + delz * delz;
+//      if (rsq < r_max_squared) {
+//        n_edges += 1;
+//        n_edges_vec[ii] += 1;
+//      }
+//    }
+//  }
+//  // make first_edge vector to help with parallelizing following loop
+//  std::vector<int> first_edge(n_nodes);
+//  first_edge[0] = 0;
+//  for (int ii=0; ii<n_nodes-1; ++ii) {
+//    first_edge[ii+1] = first_edge[ii] + n_edges_vec[ii];
+//  }
+//  // fill edge_index and unit_shifts tensors
+//  auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
+//  auto unit_shifts = torch::zeros({n_edges,3}, torch_float_dtype);
+//  auto shifts = torch::zeros({n_edges,3}, torch_float_dtype);
+//  #pragma omp parallel for
+//  for (int ii=0; ii<n_nodes; ++ii) {
+//    int i = list->ilist[ii];
+//    double xtmp = atom->x[i][0];
+//    double ytmp = atom->x[i][1];
+//    double ztmp = atom->x[i][2];
+//    int *jlist = list->firstneigh[i];
+//    int jnum = list->numneigh[i];
+//    int k = first_edge[ii];
+//    for (int jj=0; jj<jnum; ++jj) {
+//      int j = jlist[jj];
+//      j &= NEIGHMASK;
+//      double delx = xtmp - atom->x[j][0];
+//      double dely = ytmp - atom->x[j][1];
+//      double delz = ztmp - atom->x[j][2];
+//      double rsq = delx * delx + dely * dely + delz * delz;
+//      if (rsq < r_max_squared) {
+//        edge_index[0][k] = i;
+//        if (domain_decomposition) {
+//          edge_index[1][k] = j;
+//        } else {
+//          int j_local = atom->map(atom->tag[j]);
+//          edge_index[1][k] = j_local;
+//          double shiftx = atom->x[j][0] - atom->x[j_local][0];
+//          double shifty = atom->x[j][1] - atom->x[j_local][1];
+//          double shiftz = atom->x[j][2] - atom->x[j_local][2];
+//          double shiftxs = std::round(domain->h_inv[0]*shiftx + domain->h_inv[5]*shifty + domain->h_inv[4]*shiftz);
+//          double shiftys = std::round(domain->h_inv[1]*shifty + domain->h_inv[3]*shiftz);
+//          double shiftzs = std::round(domain->h_inv[2]*shiftz);
+//          unit_shifts[k][0] = shiftxs;
+//          unit_shifts[k][1] = shiftys;
+//          unit_shifts[k][2] = shiftzs;
+//          shifts[k][0] = domain->h[0]*shiftxs + domain->h[5]*shiftys + domain->h[4]*shiftzs;
+//          shifts[k][1] = domain->h[1]*shiftys + domain->h[3]*shiftzs;
+//          shifts[k][2] = domain->h[2]*shiftzs;
+//        }
+//        k++;
+//      }
+//    }
+//  }
+//
+//  // ----- node_attrs -----
+//  // node_attrs is one-hot encoding for atomic numbers
+//  auto mace_type = [this](int lammps_type) {
+//    for (int i=0; i<mace_atomic_numbers.size(); ++i) {
+//      if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
+//        return i+1;
+//      }
+//    }
+//    error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
+//    return -1;
+//  };
+//  int n_node_feats = mace_atomic_numbers.size();
+//  auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch_float_dtype);
+//  #pragma omp parallel for
+//  for (int ii=0; ii<n_nodes; ++ii) {
+//    int i = list->ilist[ii];
+//    node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
+//  }
+//
+//  // ----- mask for ghost -----
+//  auto mask = torch::zeros(n_nodes, torch::dtype(torch::kBool));
+//  #pragma omp parallel for
+//  for (int ii=0; ii<atom->nlocal; ++ii) {
+//    int i = list->ilist[ii];
+//    mask[i] = true;
+//  }
+//
+//  auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
+//  auto energy = torch::empty({1}, torch_float_dtype);
+//  auto forces = torch::empty({n_nodes,3}, torch_float_dtype);
+//  auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));
+//  auto weight = torch::empty({1}, torch_float_dtype);
+//  ptr[0] = 0;
+//  ptr[1] = n_nodes;
+//  weight[0] = 1.0;
+//
+//  // pack the input, call the model, extract the output
+//  c10::Dict<std::string, torch::Tensor> input;
+//  input.insert("batch", batch);
+//  input.insert("cell", cell);
+//  input.insert("edge_index", edge_index);
+//  input.insert("energy", energy);
+//  input.insert("forces", forces);
+//  input.insert("node_attrs", node_attrs);
+//  input.insert("positions", positions);
+//  input.insert("ptr", ptr);
+//  input.insert("shifts", shifts);
+//  input.insert("unit_shifts", unit_shifts);
+//  input.insert("weight", weight);
+//  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
+//
+//  // mace energy
+//  //   -> sum of site energies of local atoms
+//  if (eflag_global) {
+//    auto node_energy = output.at("node_energy").toTensor();
+//    eng_vdwl = 0.0;
+//    #pragma omp parallel for reduction(+:eng_vdwl)
+//    for (int ii=0; ii<list->inum; ++ii) {
+//      int i = list->ilist[ii];
+//      eng_vdwl += node_energy[i].item<double>();
+//    }
+//  }
+//
+//  // mace forces
+//  //   -> derivatives of total mace energy
+//  forces = output.at("forces").toTensor();
+//  #pragma omp parallel for
+//  for (int ii=0; ii<list->inum; ++ii) {
+//    int i = list->ilist[ii];
+//    atom->f[i][0] = forces[i][0].item<double>();
+//    atom->f[i][1] = forces[i][1].item<double>();
+//    atom->f[i][2] = forces[i][2].item<double>();
+//  }
+//
+//  // mace site energies
+//  //   -> local atoms only
+//  if (eflag_atom) {
+//    auto node_energy = output.at("node_energy").toTensor();
+//    #pragma omp parallel for
+//    for (int ii=0; ii<list->inum; ++ii) {
+//      int i = list->ilist[ii];
+//      eatom[i] = node_energy[i].item<double>();
+//    }
+//  }
+//
+//  // mace virials (local atoms only)
+//  //   -> derivatives of sum of site energies of local atoms
+//  if (vflag_global) {
+//    auto vir = output.at("virials").toTensor();
+//    virial[0] = vir[0][0][0].item<double>();
+//    virial[1] = vir[0][1][1].item<double>();
+//    virial[2] = vir[0][2][2].item<double>();
+//    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+//    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
+//    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
+//  }
+//
+//  // mace site virials
+//  //   -> not available
+//  if (vflag_atom) {
+//    error->all(FLERR, "ERROR: pair_mace does not support vflag_atom.");
+//  }
 
 }
 
@@ -344,41 +428,82 @@ void PairMACEKokkos<DeviceType>::coeff(int narg, char **arg)
 template<class DeviceType>
 void PairMACEKokkos<DeviceType>::init_style()
 {
-  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
-
-  /*
-    MACE requires the full neighbor list AND neighbors of ghost atoms
-    it appears that:
-      * without REQ_GHOST
-           list->gnum == 0
-           list->ilist does not include ghost atoms, but the jlists do
-      * with REQ_GHOST
-           list->gnum == atom->nghost
-           list->ilist includes ghost atoms
-  */
-  if (domain_decomposition) {
-    neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
-  } else {
-    neighbor->add_request(this, NeighConst::REQ_FULL);
-  }
+//  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
+//
+//  /*
+//    MACE requires the full neighbor list AND neighbors of ghost atoms
+//    it appears that:
+//      * without REQ_GHOST
+//           list->gnum == 0
+//           list->ilist does not include ghost atoms, but the jlists do
+//      * with REQ_GHOST
+//           list->gnum == atom->nghost
+//           list->ilist includes ghost atoms
+//  */
+//  if (domain_decomposition) {
+//    neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
+//  } else {
+//    neighbor->add_request(this, NeighConst::REQ_FULL);
+//  }
+
+  PairMACE::init_style();
+
+  // TODO: from lj_cut, possibly delete
+  // adjust neighbor list request for KOKKOS
+
+  //neighflag = lmp->kokkos->neighflag;
+  //auto request = neighbor->find_request(this);
+  //request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+  //                         !std::is_same<DeviceType,LMPDeviceType>::value);
+  //request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+  //if (neighflag == FULL) request->enable_full();
 }
 
 template<class DeviceType>
 double PairMACEKokkos<DeviceType>::init_one(int i, int j)
 {
-  // to account for message passing, require cutoff of n_layers * r_max
-  return num_interactions*model.attr("r_max").toTensor().item<double>();
+//  // to account for message passing, require cutoff of n_layers * r_max
+//  return num_interactions*model.attr("r_max").toTensor().item<double>();
+
+  double cutone = PairMACE::init_one(i,j);
+
+//  k_params.h_view(i,j).lj1 = lj1[i][j];
+//  k_params.h_view(i,j).lj2 = lj2[i][j];
+//  k_params.h_view(i,j).lj3 = lj3[i][j];
+//  k_params.h_view(i,j).lj4 = lj4[i][j];
+//  k_params.h_view(i,j).offset = offset[i][j];
+//  k_params.h_view(i,j).cutsq = cutone*cutone;
+//  k_params.h_view(j,i) = k_params.h_view(i,j);
+//  if (i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {
+//    m_params[i][j] = m_params[j][i] = k_params.h_view(i,j);
+//    m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone;
+//  }
+//
+//  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
+//  k_cutsq.template modify<LMPHostType>();
+//  k_params.template modify<LMPHostType>();
+
+  return cutone;
 }
 
 template<class DeviceType>
 void PairMACEKokkos<DeviceType>::allocate()
 {
-  allocated = 1;
-
-  memory->create(setflag, atom->ntypes+1, atom->ntypes+1, "pair:setflag");
-  for (int i=1; i<atom->ntypes+1; i++)
-    for (int j=i; j<atom->ntypes+1; j++)
-      setflag[i][j] = 0;
+  PairMACE::allocate();
+
+  // TODO: from lj_cut, possibly delete
+  //int n = atom->ntypes;
+  //memory->destroy(cutsq);
+  //memoryKK->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");
+  //d_cutsq = k_cutsq.template view<DeviceType>();
+  ////k_params = Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType>("PairLJCut::params",n+1,n+1);
+  //params = k_params.template view<DeviceType>();
+}
 
-  memory->create(cutsq, atom->ntypes+1, atom->ntypes+1, "pair:cutsq");
+namespace LAMMPS_NS {
+template class PairMACEKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class PairMACEKokkos<LMPHostType>;
+#endif
 }
+
diff --git a/src/ML-MACE/pair_mace_kokkos.h b/src/ML-MACE/pair_mace_kokkos.h
index 1e269c2ce0f..0bf3b813941 100644
--- a/src/ML-MACE/pair_mace_kokkos.h
+++ b/src/ML-MACE/pair_mace_kokkos.h
@@ -19,8 +19,8 @@
 #ifdef PAIR_CLASS
 // clang-format off
 PairStyle(mace/kk,PairMACEKokkos<LMPDeviceType>);
-PairStyle(mace/kk/device,PairLJCutKokkos<LMPDeviceType>);
-PairStyle(mace/kk/host,PairLJCutKokkos<LMPHostType>);
+PairStyle(mace/kk/device,PairMACEKokkos<LMPDeviceType>);
+PairStyle(mace/kk/host,PairMACEKokkos<LMPHostType>);
 // clang-format on
 #else
 

From 4239331858f42b113a41413e6ce2aba7b62a4dd4 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Fri, 10 Mar 2023 15:52:14 +0000
Subject: [PATCH 24/51] wip.

---
 src/KOKKOS/pair_mace_kokkos.cpp            | 597 +++++++++++++++++++++
 src/{ML-MACE => KOKKOS}/pair_mace_kokkos.h |  22 +-
 src/ML-MACE/pair_mace_kokkos.cpp           | 509 ------------------
 3 files changed, 617 insertions(+), 511 deletions(-)
 create mode 100644 src/KOKKOS/pair_mace_kokkos.cpp
 rename src/{ML-MACE => KOKKOS}/pair_mace_kokkos.h (85%)
 delete mode 100644 src/ML-MACE/pair_mace_kokkos.cpp

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
new file mode 100644
index 00000000000..64a80ae9338
--- /dev/null
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -0,0 +1,597 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributors
+      William C Witt (University of Cambridge)
+------------------------------------------------------------------------- */
+
+#include "pair_mace_kokkos.h"
+
+// TODO: do i need all these?
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
+#include "neigh_list.h"
+#include "neighbor_kokkos.h"
+#include "neigh_request.h"
+#include "neighbor.h"
+#include "update.h"
+
+#include <algorithm>
+#include <iostream>
+#include <stdexcept>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMACEKokkos<DeviceType>::PairMACEKokkos(LAMMPS *lmp) : PairMACE(lmp)
+{
+std::cout << "hello from kokkos mace constructor" << std::endl;
+  no_virial_fdotr_compute = 1;
+
+  //kokkosable = 1;
+  //atomKK = (AtomKokkos *) atom;
+  //execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  //datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
+  //datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+  // pair_pace_kokkos has these instead
+  //datamask_read = EMPTY_MASK;
+  //datamask_modify = EMPTY_MASK;
+
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  host_flag = (execution_space == Host);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMACEKokkos<DeviceType>::~PairMACEKokkos()
+{
+  if (copymode) return;
+
+  // from lj_cut_kokkos
+  //if (allocated) {
+  //  memoryKK->destroy_kokkos(k_eatom,eatom);
+  //  memoryKK->destroy_kokkos(k_vatom,vatom);
+  //  memoryKK->destroy_kokkos(k_cutsq,cutsq);
+  //}
+
+  // from pair_pace_kokkos
+  //memoryKK->destroy_kokkos(k_eatom,eatom);
+  //memoryKK->destroy_kokkos(k_vatom,vatom);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
+{
+  ev_init(eflag,vflag,0);
+
+  if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
+  if (domain_decomposition) {
+    if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
+  }
+
+  if (eflag_atom || vflag_atom) {
+    error->all(FLERR, "ERROR: kokkos eflag_atom not implemented.");
+  }
+
+//  if (eflag_atom) {
+//    memoryKK->destroy_kokkos(k_eatom,eatom);
+//    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+//    d_eatom = k_eatom.view<DeviceType>();
+//  }
+//  if (vflag_atom) {
+//    memoryKK->destroy_kokkos(k_vatom,vatom);
+//    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
+//    d_vatom = k_vatom.view<DeviceType>();
+//  }
+
+  auto r_max_squared = this->r_max_squared;
+  auto h0 = domain->h[0];
+  auto h1 = domain->h[1];
+  auto h2 = domain->h[2];
+  auto h3 = domain->h[3];
+  auto h4 = domain->h[4];
+  auto h5 = domain->h[5];
+  auto hinv0 = domain->h_inv[0];
+  auto hinv1 = domain->h_inv[1];
+  auto hinv2 = domain->h_inv[2];
+  auto hinv3 = domain->h_inv[3];
+  auto hinv4 = domain->h_inv[4];
+  auto hinv5 = domain->h_inv[5];
+
+  auto k_lammps_atomic_numbers = Kokkos::View<long*>("k_lammps_atomic_numbers",lammps_atomic_numbers.size());
+  auto k_lammps_atomic_numbers_mirror = Kokkos::create_mirror_view(k_lammps_atomic_numbers);
+  for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
+    k_lammps_atomic_numbers(i) = lammps_atomic_numbers[i];
+  }
+  auto k_mace_atomic_numbers = Kokkos::View<long*>("k_mace_atomic_numbers",mace_atomic_numbers.size());
+  auto k_mace_atomic_numbers_mirror = Kokkos::create_mirror_view(k_mace_atomic_numbers);
+  for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
+    k_mace_atomic_numbers(i) = mace_atomic_numbers[i];
+  }
+  auto mace_atomic_numbers_size = mace_atomic_numbers.size();
+
+//  auto k_positions = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_positions", n_nodes);
+//  Kokkos::vector<int64_t> lammps_atomic_numbers = this->lammps_atomic_numbers;
+//  Kokkos::vector<int64_t> mace_atomic_numbers = this->mace_atomic_numbers;
+
+  // atom map
+  auto map_style = atom->map_style;
+  auto k_map_array = atomKK->k_map_array;
+  auto k_map_hash = atomKK->k_map_hash;
+  k_map_array.template sync<DeviceType>();
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK|TAG_MASK);
+
+  auto x = atomKK->k_x.view<DeviceType>();
+//  c_x = atomKK->k_x.view<DeviceType>();
+  auto f = atomKK->k_f.view<DeviceType>();
+  auto tag = atomKK->k_tag.view<DeviceType>();
+  auto type = atomKK->k_type.view<DeviceType>();
+//  int nlocal = atom->nlocal;
+//  int nall = atom->nlocal + atom->nghost;
+
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  auto d_numneigh = k_list->d_numneigh;
+  auto d_neighbors = k_list->d_neighbors;
+  auto d_ilist = k_list->d_ilist;
+  //inum = list->inum;
+
+  // ----- positions -----
+  int n_nodes;
+  if (domain_decomposition) {
+    n_nodes = atom->nlocal + atom->nghost;
+  } else {
+    // normally, ghost atoms are included in the graph as independent
+    // nodes, as required when the local domain does not have PBC.
+    // however, in no_domain_decomposition mode, ghost atoms are known to
+    // be shifted versions of local atoms.
+    n_nodes = atom->nlocal;
+  }
+  auto k_positions = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_positions", n_nodes);
+  Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA (const int i) {
+    k_positions(i,0) = x(i,0);
+    k_positions(i,1) = x(i,1);
+    k_positions(i,2) = x(i,2);
+  });
+  auto positions = torch::from_blob(
+    k_positions.data(), {n_nodes,3}, torch_float_dtype);
+
+  // ----- cell -----
+  // TODO: how to use kokkos here?
+  auto cell = torch::zeros({3,3}, torch_float_dtype);
+  cell[0][0] = h0;
+  cell[0][1] = 0.0;
+  cell[0][2] = 0.0;
+  cell[1][0] = h5;
+  cell[1][1] = h1;
+  cell[1][2] = 0.0;
+  cell[2][0] = h4;
+  cell[2][1] = h3;
+  cell[2][2] = h2;
+
+  // ----- edge_index and unit_shifts -----
+  // count total number of edges
+  auto k_n_edges_vec = Kokkos::View<long*,DeviceType>("k_n_edges_vec", n_nodes);
+  Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA (const int ii) {
+    const int i = d_ilist[ii];
+    const X_FLOAT xtmp = x(i,0);
+    const X_FLOAT ytmp = x(i,1);
+    const X_FLOAT ztmp = x(i,2);
+    const int jnum = d_numneigh[i];
+    for (int jj=0; jj<jnum; ++jj) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+      const X_FLOAT delx = xtmp - x(j,0);
+      const X_FLOAT dely = ytmp - x(j,1);
+      const X_FLOAT delz = ztmp - x(j,2);
+      const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+      if (rsq < r_max_squared) {
+        k_n_edges_vec(ii) += 1;
+      }
+    }
+  });
+  int n_edges = 0;
+  Kokkos::parallel_reduce(n_nodes, KOKKOS_LAMBDA(const int ii, int& n_edges) {
+    n_edges += k_n_edges_vec(ii);
+  }, n_edges);
+  // make first_edge vector to help with parallelizing following loop
+  auto k_first_edge = Kokkos::View<long*,DeviceType>("k_first_edge", n_nodes);  // initialized to zero
+  Kokkos::parallel_for(n_nodes-1, KOKKOS_LAMBDA(const int ii) {
+    k_first_edge[ii+1] = k_first_edge(ii) + k_n_edges_vec(ii);
+  });
+  // fill edge_index and unit_shifts tensors
+  Kokkos::View<long**> k_edge_index("k_edge_index", 2, n_edges);
+  Kokkos::View<double*[3]> k_unit_shifts("k_unit_shifts", n_edges);
+  Kokkos::View<double*[3]> k_shifts("k_shifts", n_edges);
+  if (domain_decomposition) {
+
+//          if (domain_decomposition) {
+//            edge_index[1][k] = j;
+
+  } else {
+
+    Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
+      const int i = d_ilist[ii];
+      const X_FLOAT xtmp = x(i,0);
+      const X_FLOAT ytmp = x(i,1);
+      const X_FLOAT ztmp = x(i,2);
+      const int jnum = d_numneigh[i];
+      int k = k_first_edge(ii);
+      for (int jj=0; jj<jnum; ++jj) {
+        int j = d_neighbors(i,jj);
+        j &= NEIGHMASK;
+        const X_FLOAT delx = xtmp - x(j,0);
+        const X_FLOAT dely = ytmp - x(j,1);
+        const X_FLOAT delz = ztmp - x(j,2);
+        const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq < r_max_squared) {
+          k_edge_index(0,k) = i;
+          //int j_local = atom->map(tag(j));
+          int j_local = AtomKokkos::map_kokkos<DeviceType>(tag(j),map_style,k_map_array,k_map_hash);
+          k_edge_index(1,k) = j_local;
+          double shiftx = x(j,0) - x(j_local,0);
+          double shifty = x(j,1) - x(j_local,1);
+          double shiftz = x(j,2) - x(j_local,2);
+          double shiftxs = std::round(hinv0*shiftx + hinv5*shifty + hinv4*shiftz);
+          double shiftys = std::round(hinv1*shifty + hinv3*shiftz);
+          double shiftzs = std::round(hinv2*shiftz);
+          k_unit_shifts(k,0) = shiftxs;
+          k_unit_shifts(k,1) = shiftys;
+          k_unit_shifts(k,2) = shiftzs;
+          k_shifts(k,0) = h0*shiftxs + h5*shiftys + h4*shiftzs;
+          k_shifts(k,1) = h1*shiftys + h3*shiftzs;
+          k_shifts(k,2) = h2*shiftzs;
+          k++;
+        }
+      }
+    });
+  }
+  auto edge_index = torch::from_blob(k_edge_index.data(), {2,n_edges}, torch::dtype(torch::kInt64));
+  auto unit_shifts = torch::from_blob(k_unit_shifts.data(), {n_edges,3}, torch_float_dtype);
+  auto shifts = torch::from_blob(k_shifts.data(), {n_edges,3}, torch_float_dtype);
+
+  // ----- node_attrs -----
+  // node_attrs is one-hot encoding for atomic numbers
+  int n_node_feats = mace_atomic_numbers.size();
+  Kokkos::View<double**> k_node_attrs("k_node_attrs", n_nodes, n_node_feats);
+  Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
+    const int i = d_ilist[ii];
+    const int lammps_type = type[i];
+    int t = -1;
+    for (int j=0; j<mace_atomic_numbers_size; ++j) {
+      if (k_mace_atomic_numbers(j)==k_lammps_atomic_numbers(lammps_type-1)) {
+        t = j+1;
+      }
+    }
+    //if (t==-1) error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
+    k_node_attrs(i,t-1) = 1.0;
+  });
+  auto node_attrs = torch::from_blob(k_node_attrs.data(), {n_nodes,n_node_feats}, torch_float_dtype);
+
+//  // ----- mask for ghost -----
+//  Kokkos::View<bool*> k_mask("k_mask", n_nodes);
+//  Kokkos::parallel_for(atom->nlocal, KOKKOS_LAMBDA(const int ii) {
+//    const int i = d_ilist[ii];
+//    k_mask(i) = true;
+//  });
+//  auto mask = torch::from_blob(k_mask.data(), n_nodes, torch::dtype(torch::kBool));
+//
+//  auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
+//  auto energy = torch::empty({1}, torch_float_dtype);
+//  auto forces = torch::empty({n_nodes,3}, torch_float_dtype);
+//  auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));
+//  auto weight = torch::empty({1}, torch_float_dtype);
+//  ptr[0] = 0;
+//  ptr[1] = n_nodes;
+//  weight[0] = 1.0;
+//
+//std::cout << "packing the input" << std::endl;
+//
+//  // pack the input, call the model, extract the output
+//  c10::Dict<std::string, torch::Tensor> input;
+//  input.insert("batch", batch);
+//  input.insert("cell", cell);
+//  input.insert("edge_index", edge_index);
+//  input.insert("energy", energy);
+//  input.insert("forces", forces);
+//  input.insert("node_attrs", node_attrs);
+//  input.insert("positions", positions);
+//  input.insert("ptr", ptr);
+//  input.insert("shifts", shifts);
+//  input.insert("unit_shifts", unit_shifts);
+//  input.insert("weight", weight);
+////std::cout << "batch" << batch << std::endl;
+////std::cout << "cell" << cell << std::endl;
+////std::cout << "edge_index" << edge_index << std::endl;
+////// energy
+////// forces
+////std::cout << "node_attrs" << node_attrs << std::endl;
+////std::cout << "positions" << positions << std::endl;
+////std::cout << "ptr" << ptr << std::endl;
+////std::cout << "shifts" << shifts << std::endl;
+////std::cout << "unit_shifts" << unit_shifts << std::endl;
+////std::cout << "weight" << weight << std::endl;
+////std::cout << "mask" << mask << std::endl;
+//  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
+//
+////std::cout << "energy: " << output.at("energy").toTensor() << std::endl;
+////std::cout << "node_energy: " << output.at("node_energy").toTensor() << std::endl;
+//
+//  // mace energy
+//  //   -> sum of site energies of local atoms
+//  if (eflag_global) {
+//    auto node_energy = output.at("node_energy").toTensor();
+//    eng_vdwl = 0.0;
+//    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int ii, double &eng_vdwl) {
+//      const int i = d_ilist[ii];
+//      eng_vdwl += node_energy[i].item<double>();
+//    }, eng_vdwl);
+//  }
+//
+//  // mace forces
+//  //   -> derivatives of total mace energy
+//  forces = output.at("forces").toTensor();
+//  Kokkos::View<double*[3], Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
+//    k_f(forces.data_ptr<double>(), n_nodes);
+//  Kokkos::parallel_for(atom->nlocal, KOKKOS_LAMBDA(const int ii) {
+//    const int i = d_ilist[ii];
+//    f(i,0) = k_f(i,0);
+//    f(i,1) = k_f(i,1);
+//    f(i,2) = k_f(i,2);
+//  });
+//
+////  // mace site energies
+////  //   -> local atoms only
+////  if (eflag_atom) {
+////    auto node_energy = output.at("node_energy").toTensor();
+////    #pragma omp parallel for
+////    for (int ii=0; ii<list->inum; ++ii) {
+////      int i = list->ilist[ii];
+////      eatom[i] = node_energy[i].item<double>();
+////    }
+////  }
+////
+//  // mace virials (local atoms only)
+//  //   -> derivatives of sum of site energies of local atoms
+//  if (vflag_global) {
+//    auto vir = output.at("virials").toTensor();
+//    virial[0] = vir[0][0][0].item<double>();
+//    virial[1] = vir[0][1][1].item<double>();
+//    virial[2] = vir[0][2][2].item<double>();
+//    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+//    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
+//    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
+//  }
+////
+////  // mace site virials
+////  //   -> not available
+////  if (vflag_atom) {
+////    error->all(FLERR, "ERROR: pair_mace does not support vflag_atom.");
+////  }
+//
+//
+//  //copymode = 1;
+//
+//  //EV_FLOAT ev = pair_compute<PairLJCutKokkos<DeviceType>,void >(this,(NeighListKokkos<DeviceType>*)list);
+//
+//  //if (eflag_global) eng_vdwl += ev.evdwl;
+//  //if (vflag_global) {
+//  //  virial[0] += ev.v[0];
+//  //  virial[1] += ev.v[1];
+//  //  virial[2] += ev.v[2];
+//  //  virial[3] += ev.v[3];
+//  //  virial[4] += ev.v[4];
+//  //  virial[5] += ev.v[5];
+//  //}
+//
+//  //if (eflag_atom) {
+//  //  k_eatom.template modify<DeviceType>();
+//  //  k_eatom.template sync<LMPHostType>();
+//  //}
+//
+//  //if (vflag_atom) {
+//  //  k_vatom.template modify<DeviceType>();
+//  //  k_vatom.template sync<LMPHostType>();
+//  //}
+//
+//  copymode = 0;
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::settings(int narg, char **arg)
+{
+  if (narg == 1) {
+    if (strcmp(arg[0], "no_domain_decomposition") == 0) {
+      domain_decomposition = false;
+    } else {
+      error->all(FLERR, "Invalid option for pair_style mace.");
+    }
+  } else if (narg > 1) {
+    error->all(FLERR, "Too many pair_style arguments for pair_style mace.");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+std::cout << "hello from kokkos coeff" << std::endl;
+  if (!allocated) allocate();
+std::cout << "allocated: " << allocated << std::endl;
+  PairMACE::coeff(narg,arg);
+std::cout << "allocated: " << allocated << std::endl;
+}
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::init_style()
+{
+std::cout << "hello from kokkos init_style" << std::endl;
+//  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
+//
+//  /*
+//    MACE requires the full neighbor list AND neighbors of ghost atoms
+//    it appears that:
+//      * without REQ_GHOST
+//           list->gnum == 0
+//           list->ilist does not include ghost atoms, but the jlists do
+//      * with REQ_GHOST
+//           list->gnum == atom->nghost
+//           list->ilist includes ghost atoms
+//  */
+//  if (domain_decomposition) {
+//    neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
+//  } else {
+//    neighbor->add_request(this, NeighConst::REQ_FULL);
+//  }
+
+  if (host_flag) {
+    PairMACE::init_style();
+    return;
+  }
+std::cout << "after mace init" << std::endl;
+
+
+  // neighbor list request for KOKKOS
+
+  neighflag = lmp->kokkos->neighflag;
+
+  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
+  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+                           !std::is_same<DeviceType,LMPDeviceType>::value);
+  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+//  if (neighflag == FULL)
+//    error->all(FLERR,"Must use half neighbor list style with pair pace/kk");
+
+
+
+
+
+
+
+
+
+
+
+//  // TODO: from lj_cut, possibly delete
+//  // adjust neighbor list request for KOKKOS
+//  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
+//  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+//                           !std::is_same<DeviceType,LMPDeviceType>::value);
+//  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+//
+//  neighflag = lmp->kokkos->neighflag;
+//  auto request = neighbor->find_request(this);
+//std::cout << "before request" << std::endl;
+//  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+//                           !std::is_same<DeviceType,LMPDeviceType>::value);
+//std::cout << "before request" << std::endl;
+//  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+//std::cout << "after request" << std::endl;
+//  if (neighflag == FULL)
+//    std::cout << "REQUESTING FULL LIST." << std::endl;
+//  if (neighflag == FULL) request->enable_full();
+
+  //auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
+  //request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+  //                         !std::is_same<DeviceType,LMPDeviceType>::value);
+  //request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+  //if (neighflag == FULL)
+  //  error->all(FLERR,"Must use half neighbor list style with pair pace/kk");
+
+std::cout << "goodbye from init_style" << std::endl;
+}
+
+template<class DeviceType>
+double PairMACEKokkos<DeviceType>::init_one(int i, int j)
+{
+std::cout << "hello from kokkos init_one" << std::endl;
+  double cutone = PairMACE::init_one(i,j);
+
+  //k_scale.h_view(i,j) = k_scale.h_view(j,i) = scale[i][j];
+  //k_scale.template modify<LMPHostType>();
+
+  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
+  k_cutsq.template modify<LMPHostType>();
+
+  return cutone;
+
+  //// to account for message passing, require cutoff of n_layers * r_max
+  //return num_interactions*model.attr("r_max").toTensor().item<double>();
+
+//  k_params.h_view(i,j).lj1 = lj1[i][j];
+//  k_params.h_view(i,j).lj2 = lj2[i][j];
+//  k_params.h_view(i,j).lj3 = lj3[i][j];
+//  k_params.h_view(i,j).lj4 = lj4[i][j];
+//  k_params.h_view(i,j).offset = offset[i][j];
+//  k_params.h_view(i,j).cutsq = cutone*cutone;
+//  k_params.h_view(j,i) = k_params.h_view(i,j);
+//  if (i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {
+//    m_params[i][j] = m_params[j][i] = k_params.h_view(i,j);
+//    m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone;
+//  }
+//
+//  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
+//  k_cutsq.template modify<LMPHostType>();
+//  k_params.template modify<LMPHostType>();
+
+//  return cutone;
+}
+
+template<class DeviceType>
+void PairMACEKokkos<DeviceType>::allocate()
+{
+std::cout << "hello from kokkos allocate" << std::endl;
+  PairMACE::allocate();
+
+  int n = atom->ntypes + 1;
+  //MemKK::realloc_kokkos(d_map, "pace:map", n);
+
+  MemKK::realloc_kokkos(k_cutsq, "mace:cutsq", n, n);
+  d_cutsq = k_cutsq.template view<DeviceType>();
+
+  // TODO: from lj_cut, possibly delete
+  //int n = atom->ntypes;
+  //memory->destroy(cutsq);
+  //memoryKK->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");
+  //d_cutsq = k_cutsq.template view<DeviceType>();
+  //k_params = Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType>("PairLJCut::params",n+1,n+1);
+  //params = k_params.template view<DeviceType>();
+}
+
+namespace LAMMPS_NS {
+template class PairMACEKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class PairMACEKokkos<LMPHostType>;
+#endif
+}
+
diff --git a/src/ML-MACE/pair_mace_kokkos.h b/src/KOKKOS/pair_mace_kokkos.h
similarity index 85%
rename from src/ML-MACE/pair_mace_kokkos.h
rename to src/KOKKOS/pair_mace_kokkos.h
index 0bf3b813941..cbb78cc6e6c 100644
--- a/src/ML-MACE/pair_mace_kokkos.h
+++ b/src/KOKKOS/pair_mace_kokkos.h
@@ -27,8 +27,9 @@ PairStyle(mace/kk/host,PairMACEKokkos<LMPHostType>);
 #ifndef LMP_PAIR_MACE_KOKKOS_H
 #define LMP_PAIR_MACE_KOKKOS_H
 
-#include "pair_kokkos.h"
 #include "pair_mace.h"
+#include "kokkos_type.h"
+#include "pair_kokkos.h"
 #include "neigh_list_kokkos.h"
 
 namespace LAMMPS_NS {
@@ -38,7 +39,7 @@ class PairMACEKokkos : public PairMACE {
 
  public:
 
-  //enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF};
+  enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF};
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
   PairMACEKokkos(class LAMMPS *);
@@ -71,6 +72,23 @@ class PairMACEKokkos : public PairMACE {
      "Fr", "Ra", "Ac", "Th", "Pa",  "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
                        "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"};
 
+
+  // kokkos stuff
+  int host_flag;
+  int neighflag;
+  //typename AT::t_x_array_randomread x;
+  typename AT::t_x_array c_x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d_randomread type;
+
+  
+  //Kokkos::View<double*[3], DeviceType> k_positions;
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq;
+  typedef Kokkos::View<F_FLOAT**, DeviceType> t_fparams;
+  t_fparams d_cutsq;
+
 };
 }    // namespace LAMMPS_NS
 
diff --git a/src/ML-MACE/pair_mace_kokkos.cpp b/src/ML-MACE/pair_mace_kokkos.cpp
deleted file mode 100644
index 55ef528330b..00000000000
--- a/src/ML-MACE/pair_mace_kokkos.cpp
+++ /dev/null
@@ -1,509 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributors
-      William C Witt (University of Cambridge)
-------------------------------------------------------------------------- */
-
-#include "pair_mace_kokkos.h"
-
-// TODO: do i need all these?
-#include "atom_kokkos.h"
-#include "atom_masks.h"
-#include "error.h"
-#include "force.h"
-#include "kokkos.h"
-#include "memory_kokkos.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "neighbor.h"
-#include "update.h"
-
-#include <algorithm>
-#include <iostream>
-#include <stdexcept>
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-PairMACEKokkos<DeviceType>::PairMACEKokkos(LAMMPS *lmp) : PairMACE(lmp)
-{
-  no_virial_fdotr_compute = 1;
-
-  // TODO: from lj_cut ... possibly delete
-  kokkosable = 1;
-  atomKK = (AtomKokkos *) atom;
-  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
-  //datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-PairMACEKokkos<DeviceType>::~PairMACEKokkos()
-{
-  // TODO: from lj_cut, possibly delete
-  if (copymode) return;
-
-  // TODO: from lj_cut, possibly delete
-  //if (allocated) {
-  //  memoryKK->destroy_kokkos(k_eatom,eatom);
-  //  memoryKK->destroy_kokkos(k_vatom,vatom);
-  //  memoryKK->destroy_kokkos(k_cutsq,cutsq);
-  //}
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
-{
-  ev_init(eflag,vflag,0);
-
-  if (eflag_atom || vflag_atom) {
-    error->all(FLERR, "ERROR: kokkos eflag_atom not implemented.");
-  }
-
-//  if (eflag_atom) {
-//    memoryKK->destroy_kokkos(k_eatom,eatom);
-//    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
-//    d_eatom = k_eatom.view<DeviceType>();
-//  }
-//  if (vflag_atom) {
-//    memoryKK->destroy_kokkos(k_vatom,vatom);
-//    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
-//    d_vatom = k_vatom.view<DeviceType>();
-//  }
-
-  atomKK->sync(execution_space,datamask_read);
-  //k_cutsq.template sync<DeviceType>();
-  //k_params.template sync<DeviceType>();
-  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
-  else atomKK->modified(execution_space,F_MASK);
-
-  x = atomKK->k_x.view<DeviceType>();
-  c_x = atomKK->k_x.view<DeviceType>();
-  f = atomKK->k_f.view<DeviceType>();
-  type = atomKK->k_type.view<DeviceType>();
-  nlocal = atom->nlocal;
-  nall = atom->nlocal + atom->nghost;
-  //newton_pair = force->newton_pair;
-  //special_lj[0] = force->special_lj[0];
-  //special_lj[1] = force->special_lj[1];
-  //special_lj[2] = force->special_lj[2];
-  //special_lj[3] = force->special_lj[3];
-
-  // loop over neighbors of my atoms
-
-  //copymode = 1;
-
-  //EV_FLOAT ev = pair_compute<PairLJCutKokkos<DeviceType>,void >(this,(NeighListKokkos<DeviceType>*)list);
-
-  //if (eflag_global) eng_vdwl += ev.evdwl;
-  //if (vflag_global) {
-  //  virial[0] += ev.v[0];
-  //  virial[1] += ev.v[1];
-  //  virial[2] += ev.v[2];
-  //  virial[3] += ev.v[3];
-  //  virial[4] += ev.v[4];
-  //  virial[5] += ev.v[5];
-  //}
-
-  //if (eflag_atom) {
-  //  k_eatom.template modify<DeviceType>();
-  //  k_eatom.template sync<LMPHostType>();
-  //}
-
-  //if (vflag_atom) {
-  //  k_vatom.template modify<DeviceType>();
-  //  k_vatom.template sync<LMPHostType>();
-  //}
-
-  copymode = 0;
-  
-
-//  ev_init(eflag, vflag);
-//
-//  if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
-//  if (domain_decomposition) {
-//    if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
-//  }
-//
-//  // ----- positions -----
-//  int n_nodes;
-//  if (domain_decomposition) {
-//    n_nodes = atom->nlocal + atom->nghost;
-//  } else {
-//    // normally, ghost atoms are included in the graph as independent
-//    // nodes, as required when the local domain does not have PBC.
-//    // however, in no_domain_decomposition mode, ghost atoms are known to
-//    // be shifted versions of local atoms.
-//    n_nodes = atom->nlocal;
-//  }
-//  auto positions = torch::empty({n_nodes,3}, torch_float_dtype);
-//  #pragma omp parallel for
-//  for (int ii=0; ii<n_nodes; ++ii) {
-//    int i = list->ilist[ii];
-//    positions[i][0] = atom->x[i][0];
-//    positions[i][1] = atom->x[i][1];
-//    positions[i][2] = atom->x[i][2];
-//  }
-//
-//  // ----- cell -----
-//  auto cell = torch::zeros({3,3}, torch_float_dtype);
-//  cell[0][0] = domain->h[0];
-//  cell[0][1] = 0.0;
-//  cell[0][2] = 0.0;
-//  cell[1][0] = domain->h[5];
-//  cell[1][1] = domain->h[1];
-//  cell[1][2] = 0.0;
-//  cell[2][0] = domain->h[4];
-//  cell[2][1] = domain->h[3];
-//  cell[2][2] = domain->h[2];
-//
-//  // ----- edge_index and unit_shifts -----
-//  // count total number of edges
-//  int n_edges = 0;
-//  std::vector<int> n_edges_vec(n_nodes, 0);
-//  #pragma omp parallel for reduction(+:n_edges)
-//  for (int ii=0; ii<n_nodes; ++ii) {
-//    int i = list->ilist[ii];
-//    double xtmp = atom->x[i][0];
-//    double ytmp = atom->x[i][1];
-//    double ztmp = atom->x[i][2];
-//    int *jlist = list->firstneigh[i];
-//    int jnum = list->numneigh[i];
-//    for (int jj=0; jj<jnum; ++jj) {
-//      int j = jlist[jj];
-//      j &= NEIGHMASK;
-//      double delx = xtmp - atom->x[j][0];
-//      double dely = ytmp - atom->x[j][1];
-//      double delz = ztmp - atom->x[j][2];
-//      double rsq = delx * delx + dely * dely + delz * delz;
-//      if (rsq < r_max_squared) {
-//        n_edges += 1;
-//        n_edges_vec[ii] += 1;
-//      }
-//    }
-//  }
-//  // make first_edge vector to help with parallelizing following loop
-//  std::vector<int> first_edge(n_nodes);
-//  first_edge[0] = 0;
-//  for (int ii=0; ii<n_nodes-1; ++ii) {
-//    first_edge[ii+1] = first_edge[ii] + n_edges_vec[ii];
-//  }
-//  // fill edge_index and unit_shifts tensors
-//  auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
-//  auto unit_shifts = torch::zeros({n_edges,3}, torch_float_dtype);
-//  auto shifts = torch::zeros({n_edges,3}, torch_float_dtype);
-//  #pragma omp parallel for
-//  for (int ii=0; ii<n_nodes; ++ii) {
-//    int i = list->ilist[ii];
-//    double xtmp = atom->x[i][0];
-//    double ytmp = atom->x[i][1];
-//    double ztmp = atom->x[i][2];
-//    int *jlist = list->firstneigh[i];
-//    int jnum = list->numneigh[i];
-//    int k = first_edge[ii];
-//    for (int jj=0; jj<jnum; ++jj) {
-//      int j = jlist[jj];
-//      j &= NEIGHMASK;
-//      double delx = xtmp - atom->x[j][0];
-//      double dely = ytmp - atom->x[j][1];
-//      double delz = ztmp - atom->x[j][2];
-//      double rsq = delx * delx + dely * dely + delz * delz;
-//      if (rsq < r_max_squared) {
-//        edge_index[0][k] = i;
-//        if (domain_decomposition) {
-//          edge_index[1][k] = j;
-//        } else {
-//          int j_local = atom->map(atom->tag[j]);
-//          edge_index[1][k] = j_local;
-//          double shiftx = atom->x[j][0] - atom->x[j_local][0];
-//          double shifty = atom->x[j][1] - atom->x[j_local][1];
-//          double shiftz = atom->x[j][2] - atom->x[j_local][2];
-//          double shiftxs = std::round(domain->h_inv[0]*shiftx + domain->h_inv[5]*shifty + domain->h_inv[4]*shiftz);
-//          double shiftys = std::round(domain->h_inv[1]*shifty + domain->h_inv[3]*shiftz);
-//          double shiftzs = std::round(domain->h_inv[2]*shiftz);
-//          unit_shifts[k][0] = shiftxs;
-//          unit_shifts[k][1] = shiftys;
-//          unit_shifts[k][2] = shiftzs;
-//          shifts[k][0] = domain->h[0]*shiftxs + domain->h[5]*shiftys + domain->h[4]*shiftzs;
-//          shifts[k][1] = domain->h[1]*shiftys + domain->h[3]*shiftzs;
-//          shifts[k][2] = domain->h[2]*shiftzs;
-//        }
-//        k++;
-//      }
-//    }
-//  }
-//
-//  // ----- node_attrs -----
-//  // node_attrs is one-hot encoding for atomic numbers
-//  auto mace_type = [this](int lammps_type) {
-//    for (int i=0; i<mace_atomic_numbers.size(); ++i) {
-//      if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
-//        return i+1;
-//      }
-//    }
-//    error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
-//    return -1;
-//  };
-//  int n_node_feats = mace_atomic_numbers.size();
-//  auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch_float_dtype);
-//  #pragma omp parallel for
-//  for (int ii=0; ii<n_nodes; ++ii) {
-//    int i = list->ilist[ii];
-//    node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
-//  }
-//
-//  // ----- mask for ghost -----
-//  auto mask = torch::zeros(n_nodes, torch::dtype(torch::kBool));
-//  #pragma omp parallel for
-//  for (int ii=0; ii<atom->nlocal; ++ii) {
-//    int i = list->ilist[ii];
-//    mask[i] = true;
-//  }
-//
-//  auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
-//  auto energy = torch::empty({1}, torch_float_dtype);
-//  auto forces = torch::empty({n_nodes,3}, torch_float_dtype);
-//  auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));
-//  auto weight = torch::empty({1}, torch_float_dtype);
-//  ptr[0] = 0;
-//  ptr[1] = n_nodes;
-//  weight[0] = 1.0;
-//
-//  // pack the input, call the model, extract the output
-//  c10::Dict<std::string, torch::Tensor> input;
-//  input.insert("batch", batch);
-//  input.insert("cell", cell);
-//  input.insert("edge_index", edge_index);
-//  input.insert("energy", energy);
-//  input.insert("forces", forces);
-//  input.insert("node_attrs", node_attrs);
-//  input.insert("positions", positions);
-//  input.insert("ptr", ptr);
-//  input.insert("shifts", shifts);
-//  input.insert("unit_shifts", unit_shifts);
-//  input.insert("weight", weight);
-//  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
-//
-//  // mace energy
-//  //   -> sum of site energies of local atoms
-//  if (eflag_global) {
-//    auto node_energy = output.at("node_energy").toTensor();
-//    eng_vdwl = 0.0;
-//    #pragma omp parallel for reduction(+:eng_vdwl)
-//    for (int ii=0; ii<list->inum; ++ii) {
-//      int i = list->ilist[ii];
-//      eng_vdwl += node_energy[i].item<double>();
-//    }
-//  }
-//
-//  // mace forces
-//  //   -> derivatives of total mace energy
-//  forces = output.at("forces").toTensor();
-//  #pragma omp parallel for
-//  for (int ii=0; ii<list->inum; ++ii) {
-//    int i = list->ilist[ii];
-//    atom->f[i][0] = forces[i][0].item<double>();
-//    atom->f[i][1] = forces[i][1].item<double>();
-//    atom->f[i][2] = forces[i][2].item<double>();
-//  }
-//
-//  // mace site energies
-//  //   -> local atoms only
-//  if (eflag_atom) {
-//    auto node_energy = output.at("node_energy").toTensor();
-//    #pragma omp parallel for
-//    for (int ii=0; ii<list->inum; ++ii) {
-//      int i = list->ilist[ii];
-//      eatom[i] = node_energy[i].item<double>();
-//    }
-//  }
-//
-//  // mace virials (local atoms only)
-//  //   -> derivatives of sum of site energies of local atoms
-//  if (vflag_global) {
-//    auto vir = output.at("virials").toTensor();
-//    virial[0] = vir[0][0][0].item<double>();
-//    virial[1] = vir[0][1][1].item<double>();
-//    virial[2] = vir[0][2][2].item<double>();
-//    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
-//    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
-//    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
-//  }
-//
-//  // mace site virials
-//  //   -> not available
-//  if (vflag_atom) {
-//    error->all(FLERR, "ERROR: pair_mace does not support vflag_atom.");
-//  }
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairMACEKokkos<DeviceType>::settings(int narg, char **arg)
-{
-  if (narg == 1) {
-    if (strcmp(arg[0], "no_domain_decomposition") == 0) {
-      domain_decomposition = false;
-    } else {
-      error->all(FLERR, "Invalid option for pair_style mace.");
-    }
-  } else if (narg > 1) {
-    error->all(FLERR, "Too many pair_style arguments for pair_style mace.");
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-void PairMACEKokkos<DeviceType>::coeff(int narg, char **arg)
-{
-  // TODO: remove print statements from this routine, or have a single proc print
-
-  if (!allocated) allocate();
-
-  std::cout << "Loading MACEKokkos model from \"" << arg[2] << "\" ...";
-  model = torch::jit::load(arg[2]);
-  std::cout << " finished." << std::endl;
-
-  // extract default dtype from mace model
-  for (auto p: model.named_attributes()) {
-      // this is a somewhat random choice of variable to check. could it be improved?
-      if (p.name == "model.node_embedding.linear.weight") {
-          if (p.value.toTensor().dtype() == caffe2::TypeMeta::Make<float>()) {
-            torch_float_dtype = torch::kFloat32;
-          } else if (p.value.toTensor().dtype() == caffe2::TypeMeta::Make<double>()) {
-            torch_float_dtype = torch::kFloat64;
-          }
-      }
-  }
-  std::cout << "  - The torch_float_dtype is: " << torch_float_dtype << std::endl;
-
-  // extract r_max from mace model
-  r_max = model.attr("r_max").toTensor().item<double>();
-  r_max_squared = r_max*r_max;
-  std::cout << "  - The r_max is: " << r_max << "." << std::endl;
-  num_interactions = model.attr("num_interactions").toTensor().item<int64_t>();
-  std::cout << "  - The model has: " << num_interactions << " layers." << std::endl;
-
-  // extract atomic numbers from mace model
-  auto a_n = model.attr("atomic_numbers").toTensor();
-  for (int i=0; i<a_n.size(0); ++i) {
-    mace_atomic_numbers.push_back(a_n[i].item<int64_t>());
-  }
-  std::cout << "  - The model atomic numbers are: " << mace_atomic_numbers << "." << std::endl;
-
-  // extract atomic numbers from pair_coeff
-  for (int i=3; i<narg; ++i) {
-    auto iter = std::find(periodic_table.begin(), periodic_table.end(), arg[i]);
-    int index = std::distance(periodic_table.begin(), iter) + 1;
-    lammps_atomic_numbers.push_back(index);
-  }
-  std::cout << "  - The pair_coeff atomic numbers are: " << lammps_atomic_numbers << "." << std::endl;
-
-  for (int i=1; i<atom->ntypes+1; i++)
-    for (int j=i; j<atom->ntypes+1; j++)
-      setflag[i][j] = 1;
-}
-
-template<class DeviceType>
-void PairMACEKokkos<DeviceType>::init_style()
-{
-//  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
-//
-//  /*
-//    MACE requires the full neighbor list AND neighbors of ghost atoms
-//    it appears that:
-//      * without REQ_GHOST
-//           list->gnum == 0
-//           list->ilist does not include ghost atoms, but the jlists do
-//      * with REQ_GHOST
-//           list->gnum == atom->nghost
-//           list->ilist includes ghost atoms
-//  */
-//  if (domain_decomposition) {
-//    neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
-//  } else {
-//    neighbor->add_request(this, NeighConst::REQ_FULL);
-//  }
-
-  PairMACE::init_style();
-
-  // TODO: from lj_cut, possibly delete
-  // adjust neighbor list request for KOKKOS
-
-  //neighflag = lmp->kokkos->neighflag;
-  //auto request = neighbor->find_request(this);
-  //request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-  //                         !std::is_same<DeviceType,LMPDeviceType>::value);
-  //request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-  //if (neighflag == FULL) request->enable_full();
-}
-
-template<class DeviceType>
-double PairMACEKokkos<DeviceType>::init_one(int i, int j)
-{
-//  // to account for message passing, require cutoff of n_layers * r_max
-//  return num_interactions*model.attr("r_max").toTensor().item<double>();
-
-  double cutone = PairMACE::init_one(i,j);
-
-//  k_params.h_view(i,j).lj1 = lj1[i][j];
-//  k_params.h_view(i,j).lj2 = lj2[i][j];
-//  k_params.h_view(i,j).lj3 = lj3[i][j];
-//  k_params.h_view(i,j).lj4 = lj4[i][j];
-//  k_params.h_view(i,j).offset = offset[i][j];
-//  k_params.h_view(i,j).cutsq = cutone*cutone;
-//  k_params.h_view(j,i) = k_params.h_view(i,j);
-//  if (i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {
-//    m_params[i][j] = m_params[j][i] = k_params.h_view(i,j);
-//    m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone;
-//  }
-//
-//  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
-//  k_cutsq.template modify<LMPHostType>();
-//  k_params.template modify<LMPHostType>();
-
-  return cutone;
-}
-
-template<class DeviceType>
-void PairMACEKokkos<DeviceType>::allocate()
-{
-  PairMACE::allocate();
-
-  // TODO: from lj_cut, possibly delete
-  //int n = atom->ntypes;
-  //memory->destroy(cutsq);
-  //memoryKK->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");
-  //d_cutsq = k_cutsq.template view<DeviceType>();
-  ////k_params = Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType>("PairLJCut::params",n+1,n+1);
-  //params = k_params.template view<DeviceType>();
-}
-
-namespace LAMMPS_NS {
-template class PairMACEKokkos<LMPDeviceType>;
-#ifdef LMP_KOKKOS_GPU
-template class PairMACEKokkos<LMPHostType>;
-#endif
-}
-

From 1fafa1ffc487e09862390fdba1dc20fd3611b19c Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Fri, 10 Mar 2023 22:47:35 +0000
Subject: [PATCH 25/51] wip.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 149 +++++++++++++++++---------------
 src/KOKKOS/pair_mace_kokkos.h   |  26 +-----
 2 files changed, 84 insertions(+), 91 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index 64a80ae9338..179ec3849f3 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -89,6 +89,13 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 {
   ev_init(eflag,vflag,0);
 
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK|TAG_MASK);
+
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  auto d_numneigh = k_list->d_numneigh;
+  auto d_neighbors = k_list->d_neighbors;
+  auto d_ilist = k_list->d_ilist;
+
   if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
   if (domain_decomposition) {
     if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
@@ -109,6 +116,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 //    d_vatom = k_vatom.view<DeviceType>();
 //  }
 
+  int nlocal = atom->nlocal;
   auto r_max_squared = this->r_max_squared;
   auto h0 = domain->h[0];
   auto h1 = domain->h[1];
@@ -123,29 +131,24 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   auto hinv4 = domain->h_inv[4];
   auto hinv5 = domain->h_inv[5];
 
-  auto k_lammps_atomic_numbers = Kokkos::View<long*>("k_lammps_atomic_numbers",lammps_atomic_numbers.size());
+  auto k_lammps_atomic_numbers = Kokkos::View<long*,DeviceType>("k_lammps_atomic_numbers",lammps_atomic_numbers.size());
   auto k_lammps_atomic_numbers_mirror = Kokkos::create_mirror_view(k_lammps_atomic_numbers);
   for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
-    k_lammps_atomic_numbers(i) = lammps_atomic_numbers[i];
+    k_lammps_atomic_numbers_mirror(i) = lammps_atomic_numbers[i];
   }
-  auto k_mace_atomic_numbers = Kokkos::View<long*>("k_mace_atomic_numbers",mace_atomic_numbers.size());
+  auto k_mace_atomic_numbers = Kokkos::View<long*,DeviceType>("k_mace_atomic_numbers",mace_atomic_numbers.size());
   auto k_mace_atomic_numbers_mirror = Kokkos::create_mirror_view(k_mace_atomic_numbers);
   for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
-    k_mace_atomic_numbers(i) = mace_atomic_numbers[i];
+    k_mace_atomic_numbers_mirror(i) = mace_atomic_numbers[i];
   }
   auto mace_atomic_numbers_size = mace_atomic_numbers.size();
 
-//  auto k_positions = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_positions", n_nodes);
-//  Kokkos::vector<int64_t> lammps_atomic_numbers = this->lammps_atomic_numbers;
-//  Kokkos::vector<int64_t> mace_atomic_numbers = this->mace_atomic_numbers;
-
   // atom map
   auto map_style = atom->map_style;
   auto k_map_array = atomKK->k_map_array;
   auto k_map_hash = atomKK->k_map_hash;
   k_map_array.template sync<DeviceType>();
 
-  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK|TAG_MASK);
 
   auto x = atomKK->k_x.view<DeviceType>();
 //  c_x = atomKK->k_x.view<DeviceType>();
@@ -155,11 +158,6 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 //  int nlocal = atom->nlocal;
 //  int nall = atom->nlocal + atom->nghost;
 
-  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
-  auto d_numneigh = k_list->d_numneigh;
-  auto d_neighbors = k_list->d_neighbors;
-  auto d_ilist = k_list->d_ilist;
-  //inum = list->inum;
 
   // ----- positions -----
   int n_nodes;
@@ -179,11 +177,13 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
     k_positions(i,2) = x(i,2);
   });
   auto positions = torch::from_blob(
-    k_positions.data(), {n_nodes,3}, torch_float_dtype);
+    k_positions.data(),
+    {n_nodes,3},
+    torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
 
   // ----- cell -----
   // TODO: how to use kokkos here?
-  auto cell = torch::zeros({3,3}, torch_float_dtype);
+  auto cell = torch::zeros({3,3}, torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
   cell[0][0] = h0;
   cell[0][1] = 0.0;
   cell[0][2] = 0.0;
@@ -198,11 +198,11 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   // count total number of edges
   auto k_n_edges_vec = Kokkos::View<long*,DeviceType>("k_n_edges_vec", n_nodes);
   Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA (const int ii) {
-    const int i = d_ilist[ii];
+    const int i = d_ilist(ii);
     const X_FLOAT xtmp = x(i,0);
     const X_FLOAT ytmp = x(i,1);
     const X_FLOAT ztmp = x(i,2);
-    const int jnum = d_numneigh[i];
+    const int jnum = d_numneigh(i);
     for (int jj=0; jj<jnum; ++jj) {
       int j = d_neighbors(i,jj);
       j &= NEIGHMASK;
@@ -222,12 +222,12 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   // make first_edge vector to help with parallelizing following loop
   auto k_first_edge = Kokkos::View<long*,DeviceType>("k_first_edge", n_nodes);  // initialized to zero
   Kokkos::parallel_for(n_nodes-1, KOKKOS_LAMBDA(const int ii) {
-    k_first_edge[ii+1] = k_first_edge(ii) + k_n_edges_vec(ii);
+    k_first_edge(ii+1) = k_first_edge(ii) + k_n_edges_vec(ii);
   });
   // fill edge_index and unit_shifts tensors
-  Kokkos::View<long**> k_edge_index("k_edge_index", 2, n_edges);
-  Kokkos::View<double*[3]> k_unit_shifts("k_unit_shifts", n_edges);
-  Kokkos::View<double*[3]> k_shifts("k_shifts", n_edges);
+  auto k_edge_index = Kokkos::View<long**,Kokkos::LayoutRight,DeviceType>("k_edge_index", 2, n_edges);
+  auto k_unit_shifts = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_unit_shifts", n_edges);
+  auto k_shifts = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_shifts", n_edges);
   if (domain_decomposition) {
 
 //          if (domain_decomposition) {
@@ -236,11 +236,11 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   } else {
 
     Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
-      const int i = d_ilist[ii];
+      const int i = d_ilist(ii);
       const X_FLOAT xtmp = x(i,0);
       const X_FLOAT ytmp = x(i,1);
       const X_FLOAT ztmp = x(i,2);
-      const int jnum = d_numneigh[i];
+      const int jnum = d_numneigh(i);
       int k = k_first_edge(ii);
       for (int jj=0; jj<jnum; ++jj) {
         int j = d_neighbors(i,jj);
@@ -249,7 +249,6 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
         const X_FLOAT dely = ytmp - x(j,1);
         const X_FLOAT delz = ztmp - x(j,2);
         const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
-
         if (rsq < r_max_squared) {
           k_edge_index(0,k) = i;
           //int j_local = atom->map(tag(j));
@@ -272,17 +271,26 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
       }
     });
   }
-  auto edge_index = torch::from_blob(k_edge_index.data(), {2,n_edges}, torch::dtype(torch::kInt64));
-  auto unit_shifts = torch::from_blob(k_unit_shifts.data(), {n_edges,3}, torch_float_dtype);
-  auto shifts = torch::from_blob(k_shifts.data(), {n_edges,3}, torch_float_dtype);
+  auto edge_index = torch::from_blob(
+    k_edge_index.data(),
+    {2,n_edges},
+    torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA));
+  auto unit_shifts = torch::from_blob(
+    k_unit_shifts.data(),
+    {n_edges,3},
+    torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+  auto shifts = torch::from_blob(
+    k_shifts.data(),
+    {n_edges,3},
+    torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
 
   // ----- node_attrs -----
   // node_attrs is one-hot encoding for atomic numbers
-  int n_node_feats = mace_atomic_numbers.size();
-  Kokkos::View<double**> k_node_attrs("k_node_attrs", n_nodes, n_node_feats);
+  int n_node_feats = mace_atomic_numbers_size;
+  auto k_node_attrs = Kokkos::View<double**,Kokkos::LayoutRight,DeviceType>("k_node_attrs", n_nodes, n_node_feats);
   Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
-    const int i = d_ilist[ii];
-    const int lammps_type = type[i];
+    const int i = d_ilist(ii);
+    const int lammps_type = type(i);
     int t = -1;
     for (int j=0; j<mace_atomic_numbers_size; ++j) {
       if (k_mace_atomic_numbers(j)==k_lammps_atomic_numbers(lammps_type-1)) {
@@ -292,40 +300,45 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
     //if (t==-1) error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
     k_node_attrs(i,t-1) = 1.0;
   });
-  auto node_attrs = torch::from_blob(k_node_attrs.data(), {n_nodes,n_node_feats}, torch_float_dtype);
-
-//  // ----- mask for ghost -----
-//  Kokkos::View<bool*> k_mask("k_mask", n_nodes);
-//  Kokkos::parallel_for(atom->nlocal, KOKKOS_LAMBDA(const int ii) {
-//    const int i = d_ilist[ii];
-//    k_mask(i) = true;
-//  });
-//  auto mask = torch::from_blob(k_mask.data(), n_nodes, torch::dtype(torch::kBool));
-//
-//  auto batch = torch::zeros({n_nodes}, torch::dtype(torch::kInt64));
-//  auto energy = torch::empty({1}, torch_float_dtype);
-//  auto forces = torch::empty({n_nodes,3}, torch_float_dtype);
-//  auto ptr = torch::empty({2}, torch::dtype(torch::kInt64));
-//  auto weight = torch::empty({1}, torch_float_dtype);
-//  ptr[0] = 0;
-//  ptr[1] = n_nodes;
-//  weight[0] = 1.0;
-//
-//std::cout << "packing the input" << std::endl;
-//
-//  // pack the input, call the model, extract the output
-//  c10::Dict<std::string, torch::Tensor> input;
-//  input.insert("batch", batch);
-//  input.insert("cell", cell);
-//  input.insert("edge_index", edge_index);
-//  input.insert("energy", energy);
-//  input.insert("forces", forces);
-//  input.insert("node_attrs", node_attrs);
-//  input.insert("positions", positions);
-//  input.insert("ptr", ptr);
-//  input.insert("shifts", shifts);
-//  input.insert("unit_shifts", unit_shifts);
-//  input.insert("weight", weight);
+  auto node_attrs = torch::from_blob(
+    k_node_attrs.data(),
+    {n_nodes, n_node_feats},
+    torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+
+  // ----- mask for ghost -----
+  Kokkos::View<bool*,DeviceType> k_mask("k_mask", n_nodes);
+  Kokkos::parallel_for(nlocal, KOKKOS_LAMBDA(const int ii) {
+    const int i = d_ilist(ii);
+    k_mask(i) = true;
+  });
+  auto mask = torch::from_blob(
+    k_mask.data(),
+    n_nodes,
+    torch::TensorOptions().dtype(torch::kBool).device(torch::kCUDA));
+
+  // TODO: add device?
+  auto batch = torch::zeros({n_nodes}, torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA));
+  auto energy = torch::empty({1}, torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+  auto forces = torch::empty({n_nodes,3}, torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+  auto ptr = torch::empty({2}, torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA));
+  auto weight = torch::empty({1}, torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+  ptr[0] = 0;
+  ptr[1] = n_nodes;
+  weight[0] = 1.0;
+
+  // pack the input, call the model, extract the output
+  c10::Dict<std::string, torch::Tensor> input;
+  input.insert("batch", batch);
+  input.insert("cell", cell);
+  input.insert("edge_index", edge_index);
+  input.insert("energy", energy);
+  input.insert("forces", forces);
+  input.insert("node_attrs", node_attrs);
+  input.insert("positions", positions);
+  input.insert("ptr", ptr);
+  input.insert("shifts", shifts);
+  input.insert("unit_shifts", unit_shifts);
+  input.insert("weight", weight);
 ////std::cout << "batch" << batch << std::endl;
 ////std::cout << "cell" << cell << std::endl;
 ////std::cout << "edge_index" << edge_index << std::endl;
@@ -338,8 +351,8 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 ////std::cout << "unit_shifts" << unit_shifts << std::endl;
 ////std::cout << "weight" << weight << std::endl;
 ////std::cout << "mask" << mask << std::endl;
-//  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
-//
+  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
+
 ////std::cout << "energy: " << output.at("energy").toTensor() << std::endl;
 ////std::cout << "node_energy: " << output.at("node_energy").toTensor() << std::endl;
 //
diff --git a/src/KOKKOS/pair_mace_kokkos.h b/src/KOKKOS/pair_mace_kokkos.h
index cbb78cc6e6c..ab658b70f49 100644
--- a/src/KOKKOS/pair_mace_kokkos.h
+++ b/src/KOKKOS/pair_mace_kokkos.h
@@ -53,33 +53,13 @@ class PairMACEKokkos : public PairMACE {
 
  protected:
 
-  bool domain_decomposition = true;
-  torch::jit::script::Module model;
-  torch::ScalarType torch_float_dtype;
-  double r_max;
-  double r_max_squared;
-  int64_t num_interactions;
-  std::vector<int64_t> mace_atomic_numbers;
-  std::vector<int64_t> lammps_atomic_numbers;
-  const std::array<std::string,118> periodic_table =
-    { "H", "He",
-     "Li", "Be",                                                              "B",  "C",  "N",  "O",  "F", "Ne",
-     "Na", "Mg",                                                             "Al", "Si",  "P",  "S", "Cl", "Ar",
-     "K",  "Ca", "Sc", "Ti",  "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
-     "Rb", "Sr",  "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te",  "I", "Xe",
-     "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
-                       "Hf", "Ta",  "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn",
-     "Fr", "Ra", "Ac", "Th", "Pa",  "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr",
-                       "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"};
-
-
   // kokkos stuff
   int host_flag;
   int neighflag;
   //typename AT::t_x_array_randomread x;
-  typename AT::t_x_array c_x;
-  typename AT::t_f_array f;
-  typename AT::t_int_1d_randomread type;
+  //typename AT::t_x_array c_x;
+  //typename AT::t_f_array f;
+  //typename AT::t_int_1d_randomread type;
 
   
   //Kokkos::View<double*[3], DeviceType> k_positions;

From d0f1c8aee27d1008e98359e574c894098c3d5a81 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 13 Mar 2023 16:53:09 +0000
Subject: [PATCH 26/51] wip.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 150 +++++++++++++++++++-------------
 1 file changed, 90 insertions(+), 60 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index 179ec3849f3..ed65fde8a93 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -96,6 +96,9 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   auto d_neighbors = k_list->d_neighbors;
   auto d_ilist = k_list->d_ilist;
 
+std::cout << "nlocal inum " << atom->nlocal << " " << list->inum << std::endl;
+std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
+
   if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
   if (domain_decomposition) {
     if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
@@ -155,8 +158,6 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   auto f = atomKK->k_f.view<DeviceType>();
   auto tag = atomKK->k_tag.view<DeviceType>();
   auto type = atomKK->k_type.view<DeviceType>();
-//  int nlocal = atom->nlocal;
-//  int nall = atom->nlocal + atom->nghost;
 
 
   // ----- positions -----
@@ -199,17 +200,17 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   auto k_n_edges_vec = Kokkos::View<long*,DeviceType>("k_n_edges_vec", n_nodes);
   Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA (const int ii) {
     const int i = d_ilist(ii);
-    const X_FLOAT xtmp = x(i,0);
-    const X_FLOAT ytmp = x(i,1);
-    const X_FLOAT ztmp = x(i,2);
+    const double xtmp = x(i,0);
+    const double ytmp = x(i,1);
+    const double ztmp = x(i,2);
     const int jnum = d_numneigh(i);
     for (int jj=0; jj<jnum; ++jj) {
       int j = d_neighbors(i,jj);
       j &= NEIGHMASK;
-      const X_FLOAT delx = xtmp - x(j,0);
-      const X_FLOAT dely = ytmp - x(j,1);
-      const X_FLOAT delz = ztmp - x(j,2);
-      const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+      const double delx = xtmp - x(j,0);
+      const double dely = ytmp - x(j,1);
+      const double delz = ztmp - x(j,2);
+      const double rsq = delx*delx + dely*dely + delz*delz;
       if (rsq < r_max_squared) {
         k_n_edges_vec(ii) += 1;
       }
@@ -228,27 +229,47 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   auto k_edge_index = Kokkos::View<long**,Kokkos::LayoutRight,DeviceType>("k_edge_index", 2, n_edges);
   auto k_unit_shifts = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_unit_shifts", n_edges);
   auto k_shifts = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_shifts", n_edges);
+
   if (domain_decomposition) {
 
-//          if (domain_decomposition) {
-//            edge_index[1][k] = j;
+    Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
+      const int i = d_ilist(ii);
+      const double xtmp = x(i,0);
+      const double ytmp = x(i,1);
+      const double ztmp = x(i,2);
+      const int jnum = d_numneigh(i);
+      int k = k_first_edge(ii);
+      for (int jj=0; jj<jnum; ++jj) {
+        int j = d_neighbors(i,jj);
+        j &= NEIGHMASK;
+        const double delx = xtmp - x(j,0);
+        const double dely = ytmp - x(j,1);
+        const double delz = ztmp - x(j,2);
+        const double rsq = delx*delx + dely*dely + delz*delz;
+        if (rsq < r_max_squared) {
+          k_edge_index(0,k) = i;
+          k_edge_index(1,k) = j;
+          k++;
+        }
+      }
+    });
 
   } else {
 
     Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
       const int i = d_ilist(ii);
-      const X_FLOAT xtmp = x(i,0);
-      const X_FLOAT ytmp = x(i,1);
-      const X_FLOAT ztmp = x(i,2);
+      const double xtmp = x(i,0);
+      const double ytmp = x(i,1);
+      const double ztmp = x(i,2);
       const int jnum = d_numneigh(i);
       int k = k_first_edge(ii);
       for (int jj=0; jj<jnum; ++jj) {
         int j = d_neighbors(i,jj);
         j &= NEIGHMASK;
-        const X_FLOAT delx = xtmp - x(j,0);
-        const X_FLOAT dely = ytmp - x(j,1);
-        const X_FLOAT delz = ztmp - x(j,2);
-        const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+        const double delx = xtmp - x(j,0);
+        const double dely = ytmp - x(j,1);
+        const double delz = ztmp - x(j,2);
+        const double rsq = delx*delx + dely*dely + delz*delz;
         if (rsq < r_max_squared) {
           k_edge_index(0,k) = i;
           //int j_local = atom->map(tag(j));
@@ -339,45 +360,43 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-////std::cout << "batch" << batch << std::endl;
-////std::cout << "cell" << cell << std::endl;
-////std::cout << "edge_index" << edge_index << std::endl;
-////// energy
-////// forces
-////std::cout << "node_attrs" << node_attrs << std::endl;
-////std::cout << "positions" << positions << std::endl;
-////std::cout << "ptr" << ptr << std::endl;
-////std::cout << "shifts" << shifts << std::endl;
-////std::cout << "unit_shifts" << unit_shifts << std::endl;
-////std::cout << "weight" << weight << std::endl;
-////std::cout << "mask" << mask << std::endl;
+std::cout << "batch" << batch.to("cpu") << std::endl;
+std::cout << "cell" << cell.to("cpu") << std::endl;
+std::cout << "edge_index" << edge_index.to("cpu") << std::endl;
+// energy
+// forces
+std::cout << "node_attrs" << node_attrs.to("cpu") << std::endl;
+std::cout << "positions" << positions.to("cpu") << std::endl;
+std::cout << "ptr" << ptr.to("cpu") << std::endl;
+std::cout << "shifts" << shifts.to("cpu") << std::endl;
+std::cout << "unit_shifts" << unit_shifts.to("cpu") << std::endl;
+std::cout << "weight" << weight.to("cpu") << std::endl;
+std::cout << "mask" << mask.to("cpu") << std::endl;
   auto output = model.forward({input, mask, true, true, false}).toGenericDict();
 
-////std::cout << "energy: " << output.at("energy").toTensor() << std::endl;
-////std::cout << "node_energy: " << output.at("node_energy").toTensor() << std::endl;
-//
-//  // mace energy
-//  //   -> sum of site energies of local atoms
-//  if (eflag_global) {
-//    auto node_energy = output.at("node_energy").toTensor();
-//    eng_vdwl = 0.0;
-//    Kokkos::parallel_reduce(atom->nlocal, KOKKOS_LAMBDA(const int ii, double &eng_vdwl) {
-//      const int i = d_ilist[ii];
-//      eng_vdwl += node_energy[i].item<double>();
-//    }, eng_vdwl);
-//  }
 //
-//  // mace forces
-//  //   -> derivatives of total mace energy
-//  forces = output.at("forces").toTensor();
-//  Kokkos::View<double*[3], Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-//    k_f(forces.data_ptr<double>(), n_nodes);
-//  Kokkos::parallel_for(atom->nlocal, KOKKOS_LAMBDA(const int ii) {
-//    const int i = d_ilist[ii];
-//    f(i,0) = k_f(i,0);
-//    f(i,1) = k_f(i,1);
-//    f(i,2) = k_f(i,2);
-//  });
+  // mace energy
+  //   -> sum of site energies of local atoms
+  if (eflag_global) {
+    auto node_energy = output.at("node_energy").toTensor();
+    auto k_node_energy = Kokkos::View<double*,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(node_energy.data_ptr<double>(),n_nodes);
+    eng_vdwl = 0.0;
+    Kokkos::parallel_reduce(nlocal, KOKKOS_LAMBDA(const int ii, double &eng_vdwl) {
+      const int i = d_ilist(ii);
+      eng_vdwl += k_node_energy(i);
+    }, eng_vdwl);
+  }
+
+  // mace forces
+  //   -> derivatives of total mace energy
+  forces = output.at("forces").toTensor();
+  auto k_forces = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(forces.data_ptr<double>(),n_nodes);
+  Kokkos::parallel_for(nlocal, KOKKOS_LAMBDA(const int ii) {
+    const int i = d_ilist(ii);
+    f(i,0) = k_forces(i,0);
+    f(i,1) = k_forces(i,1);
+    f(i,2) = k_forces(i,2);
+  });
 //
 ////  // mace site energies
 ////  //   -> local atoms only
@@ -435,6 +454,9 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 //
 //  copymode = 0;
 
+std::cout << "energy: " << output.at("energy").toTensor().to("cpu") << std::endl;
+std::cout << "node_energy: " << output.at("node_energy").toTensor().to("cpu") << std::endl;
+
 }
 
 /* ---------------------------------------------------------------------- */
@@ -469,8 +491,8 @@ template<class DeviceType>
 void PairMACEKokkos<DeviceType>::init_style()
 {
 std::cout << "hello from kokkos init_style" << std::endl;
-//  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
-//
+  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
+
 //  /*
 //    MACE requires the full neighbor list AND neighbors of ghost atoms
 //    it appears that:
@@ -493,15 +515,23 @@ std::cout << "hello from kokkos init_style" << std::endl;
   }
 std::cout << "after mace init" << std::endl;
 
-
   // neighbor list request for KOKKOS
 
   neighflag = lmp->kokkos->neighflag;
 
   auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
-  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-                           !std::is_same<DeviceType,LMPDeviceType>::value);
-  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+  //if (domain_decomposition) {
+    //auto request = neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
+    request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+                             !std::is_same<DeviceType,LMPDeviceType>::value);
+    request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+    if (neighflag == FULL) request->enable_full();
+  //} else {
+  //  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
+  //  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+  //                           !std::is_same<DeviceType,LMPDeviceType>::value);
+  //  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+  //}
 //  if (neighflag == FULL)
 //    error->all(FLERR,"Must use half neighbor list style with pair pace/kk");
 

From 1c59b0a6b7d655b550bdcb0c7432d8ce9c4aff59 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 13 Mar 2023 17:20:20 +0000
Subject: [PATCH 27/51] wip.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 44 ++++++++++++++-------------------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index ed65fde8a93..4c2af6f54c4 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -134,12 +134,12 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
   auto hinv4 = domain->h_inv[4];
   auto hinv5 = domain->h_inv[5];
 
-  auto k_lammps_atomic_numbers = Kokkos::View<long*,DeviceType>("k_lammps_atomic_numbers",lammps_atomic_numbers.size());
+  auto k_lammps_atomic_numbers = Kokkos::View<int64_t*,DeviceType>("k_lammps_atomic_numbers",lammps_atomic_numbers.size());
   auto k_lammps_atomic_numbers_mirror = Kokkos::create_mirror_view(k_lammps_atomic_numbers);
   for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
     k_lammps_atomic_numbers_mirror(i) = lammps_atomic_numbers[i];
   }
-  auto k_mace_atomic_numbers = Kokkos::View<long*,DeviceType>("k_mace_atomic_numbers",mace_atomic_numbers.size());
+  auto k_mace_atomic_numbers = Kokkos::View<int64_t*,DeviceType>("k_mace_atomic_numbers",mace_atomic_numbers.size());
   auto k_mace_atomic_numbers_mirror = Kokkos::create_mirror_view(k_mace_atomic_numbers);
   for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
     k_mace_atomic_numbers_mirror(i) = mace_atomic_numbers[i];
@@ -180,11 +180,11 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
   auto positions = torch::from_blob(
     k_positions.data(),
     {n_nodes,3},
-    torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+    torch::TensorOptions().dtype(torch_float_dtype).device(device));
 
   // ----- cell -----
   // TODO: how to use kokkos here?
-  auto cell = torch::zeros({3,3}, torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+  auto cell = torch::zeros({3,3}, torch::TensorOptions().dtype(torch_float_dtype).device(device));
   cell[0][0] = h0;
   cell[0][1] = 0.0;
   cell[0][2] = 0.0;
@@ -197,7 +197,7 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
 
   // ----- edge_index and unit_shifts -----
   // count total number of edges
-  auto k_n_edges_vec = Kokkos::View<long*,DeviceType>("k_n_edges_vec", n_nodes);
+  auto k_n_edges_vec = Kokkos::View<int64_t*,DeviceType>("k_n_edges_vec", n_nodes);
   Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA (const int ii) {
     const int i = d_ilist(ii);
     const double xtmp = x(i,0);
@@ -221,12 +221,12 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
     n_edges += k_n_edges_vec(ii);
   }, n_edges);
   // make first_edge vector to help with parallelizing following loop
-  auto k_first_edge = Kokkos::View<long*,DeviceType>("k_first_edge", n_nodes);  // initialized to zero
+  auto k_first_edge = Kokkos::View<int64_t*,DeviceType>("k_first_edge", n_nodes);  // initialized to zero
   Kokkos::parallel_for(n_nodes-1, KOKKOS_LAMBDA(const int ii) {
     k_first_edge(ii+1) = k_first_edge(ii) + k_n_edges_vec(ii);
   });
   // fill edge_index and unit_shifts tensors
-  auto k_edge_index = Kokkos::View<long**,Kokkos::LayoutRight,DeviceType>("k_edge_index", 2, n_edges);
+  auto k_edge_index = Kokkos::View<int64_t**,Kokkos::LayoutRight,DeviceType>("k_edge_index", 2, n_edges);
   auto k_unit_shifts = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_unit_shifts", n_edges);
   auto k_shifts = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_shifts", n_edges);
 
@@ -295,15 +295,15 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
   auto edge_index = torch::from_blob(
     k_edge_index.data(),
     {2,n_edges},
-    torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA));
+    torch::TensorOptions().dtype(torch::kInt64).device(device));
   auto unit_shifts = torch::from_blob(
     k_unit_shifts.data(),
     {n_edges,3},
-    torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+    torch::TensorOptions().dtype(torch_float_dtype).device(device));
   auto shifts = torch::from_blob(
     k_shifts.data(),
     {n_edges,3},
-    torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+    torch::TensorOptions().dtype(torch_float_dtype).device(device));
 
   // ----- node_attrs -----
   // node_attrs is one-hot encoding for atomic numbers
@@ -324,7 +324,7 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
   auto node_attrs = torch::from_blob(
     k_node_attrs.data(),
     {n_nodes, n_node_feats},
-    torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+    torch::TensorOptions().dtype(torch_float_dtype).device(device));
 
   // ----- mask for ghost -----
   Kokkos::View<bool*,DeviceType> k_mask("k_mask", n_nodes);
@@ -335,14 +335,14 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
   auto mask = torch::from_blob(
     k_mask.data(),
     n_nodes,
-    torch::TensorOptions().dtype(torch::kBool).device(torch::kCUDA));
+    torch::TensorOptions().dtype(torch::kBool).device(device));
 
   // TODO: add device?
-  auto batch = torch::zeros({n_nodes}, torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA));
-  auto energy = torch::empty({1}, torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
-  auto forces = torch::empty({n_nodes,3}, torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
-  auto ptr = torch::empty({2}, torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA));
-  auto weight = torch::empty({1}, torch::TensorOptions().dtype(torch_float_dtype).device(torch::kCUDA));
+  auto batch = torch::zeros({n_nodes}, torch::TensorOptions().dtype(torch::kInt64).device(device));
+  auto energy = torch::empty({1}, torch::TensorOptions().dtype(torch_float_dtype).device(device));
+  auto forces = torch::empty({n_nodes,3}, torch::TensorOptions().dtype(torch_float_dtype).device(device));
+  auto ptr = torch::empty({2}, torch::TensorOptions().dtype(torch::kInt64).device(device));
+  auto weight = torch::empty({1}, torch::TensorOptions().dtype(torch_float_dtype).device(device));
   ptr[0] = 0;
   ptr[1] = n_nodes;
   weight[0] = 1.0;
@@ -464,15 +464,7 @@ std::cout << "node_energy: " << output.at("node_energy").toTensor().to("cpu") <<
 template<class DeviceType>
 void PairMACEKokkos<DeviceType>::settings(int narg, char **arg)
 {
-  if (narg == 1) {
-    if (strcmp(arg[0], "no_domain_decomposition") == 0) {
-      domain_decomposition = false;
-    } else {
-      error->all(FLERR, "Invalid option for pair_style mace.");
-    }
-  } else if (narg > 1) {
-    error->all(FLERR, "Too many pair_style arguments for pair_style mace.");
-  }
+  PairMACE::settings(narg, arg);
 }
 
 /* ---------------------------------------------------------------------- */

From cbcc05cecf13068cdfbfdf21cec15d9bc0cece2b Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 14 Mar 2023 22:42:35 +0000
Subject: [PATCH 28/51] wip.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 85 +++++++++++++++++----------------
 src/KOKKOS/pair_mace_kokkos.h   |  3 +-
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index 4c2af6f54c4..521a48fc34c 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -96,9 +96,6 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   auto d_neighbors = k_list->d_neighbors;
   auto d_ilist = k_list->d_ilist;
 
-std::cout << "nlocal inum " << atom->nlocal << " " << list->inum << std::endl;
-std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
-
   if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
   if (domain_decomposition) {
     if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
@@ -203,8 +200,7 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
     const double xtmp = x(i,0);
     const double ytmp = x(i,1);
     const double ztmp = x(i,2);
-    const int jnum = d_numneigh(i);
-    for (int jj=0; jj<jnum; ++jj) {
+    for (int jj=0; jj<d_numneigh(i); ++jj) {
       int j = d_neighbors(i,jj);
       j &= NEIGHMASK;
       const double delx = xtmp - x(j,0);
@@ -216,8 +212,10 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
       }
     }
   });
-  int n_edges = 0;
-  Kokkos::parallel_reduce(n_nodes, KOKKOS_LAMBDA(const int ii, int& n_edges) {
+  // WARNING: if n_edges remains 0 (e.g., because atoms too far apart)
+  // strange things happen on gpu
+  int64_t n_edges = 0;
+  Kokkos::parallel_reduce(n_nodes, KOKKOS_LAMBDA(const int ii, int64_t& n_edges) {
     n_edges += k_n_edges_vec(ii);
   }, n_edges);
   // make first_edge vector to help with parallelizing following loop
@@ -237,9 +235,8 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
       const double xtmp = x(i,0);
       const double ytmp = x(i,1);
       const double ztmp = x(i,2);
-      const int jnum = d_numneigh(i);
       int k = k_first_edge(ii);
-      for (int jj=0; jj<jnum; ++jj) {
+      for (int jj=0; jj<d_numneigh(i); ++jj) {
         int j = d_neighbors(i,jj);
         j &= NEIGHMASK;
         const double delx = xtmp - x(j,0);
@@ -261,9 +258,8 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
       const double xtmp = x(i,0);
       const double ytmp = x(i,1);
       const double ztmp = x(i,2);
-      const int jnum = d_numneigh(i);
       int k = k_first_edge(ii);
-      for (int jj=0; jj<jnum; ++jj) {
+      for (int jj=0; jj<d_numneigh(i); ++jj) {
         int j = d_neighbors(i,jj);
         j &= NEIGHMASK;
         const double delx = xtmp - x(j,0);
@@ -337,6 +333,7 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
     n_nodes,
     torch::TensorOptions().dtype(torch::kBool).device(device));
 
+  // TODO: why is batch of size n_nodes?
   // TODO: add device?
   auto batch = torch::zeros({n_nodes}, torch::TensorOptions().dtype(torch::kInt64).device(device));
   auto energy = torch::empty({1}, torch::TensorOptions().dtype(torch_float_dtype).device(device));
@@ -362,10 +359,10 @@ std::cout << "ghost gnum " << atom->nghost << " " << list->gnum << std::endl;
   input.insert("weight", weight);
 std::cout << "batch" << batch.to("cpu") << std::endl;
 std::cout << "cell" << cell.to("cpu") << std::endl;
-std::cout << "edge_index" << edge_index.to("cpu") << std::endl;
+//std::cout << "edge_index" << edge_index.to("cpu") << std::endl;
 // energy
 // forces
-std::cout << "node_attrs" << node_attrs.to("cpu") << std::endl;
+//std::cout << "node_attrs" << node_attrs.to("cpu") << std::endl;
 std::cout << "positions" << positions.to("cpu") << std::endl;
 std::cout << "ptr" << ptr.to("cpu") << std::endl;
 std::cout << "shifts" << shifts.to("cpu") << std::endl;
@@ -374,12 +371,18 @@ std::cout << "weight" << weight.to("cpu") << std::endl;
 std::cout << "mask" << mask.to("cpu") << std::endl;
   auto output = model.forward({input, mask, true, true, false}).toGenericDict();
 
-//
+  //std::cout << "node energy: " << typeid(output.at("node_energy").toTensor()).name() << std::endl;
+  //std::cout << "node energy: " << typeid(output.at("node_energy").toTensor().data_ptr()).name() << std::endl;
+  //std::cout << "forces: " << typeid(output.at("forces").toTensor()).name() << std::endl;
+  //std::cout << "forces: " << typeid(output.at("forces").toTensor().data_ptr()).name() << std::endl;
+
+
   // mace energy
   //   -> sum of site energies of local atoms
   if (eflag_global) {
     auto node_energy = output.at("node_energy").toTensor();
-    auto k_node_energy = Kokkos::View<double*,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(node_energy.data_ptr<double>(),n_nodes);
+    auto node_energy_ptr = static_cast<double*>(node_energy.data_ptr());
+    auto k_node_energy = Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(node_energy_ptr,n_nodes);
     eng_vdwl = 0.0;
     Kokkos::parallel_reduce(nlocal, KOKKOS_LAMBDA(const int ii, double &eng_vdwl) {
       const int i = d_ilist(ii);
@@ -390,7 +393,8 @@ std::cout << "mask" << mask.to("cpu") << std::endl;
   // mace forces
   //   -> derivatives of total mace energy
   forces = output.at("forces").toTensor();
-  auto k_forces = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(forces.data_ptr<double>(),n_nodes);
+  auto forces_ptr = static_cast<double*>(forces.data_ptr());
+  auto k_forces = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(forces_ptr,n_nodes);
   Kokkos::parallel_for(nlocal, KOKKOS_LAMBDA(const int ii) {
     const int i = d_ilist(ii);
     f(i,0) = k_forces(i,0);
@@ -461,14 +465,6 @@ std::cout << "node_energy: " << output.at("node_energy").toTensor().to("cpu") <<
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-void PairMACEKokkos<DeviceType>::settings(int narg, char **arg)
-{
-  PairMACE::settings(narg, arg);
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType>
 void PairMACEKokkos<DeviceType>::coeff(int narg, char **arg)
 {
@@ -483,7 +479,7 @@ template<class DeviceType>
 void PairMACEKokkos<DeviceType>::init_style()
 {
 std::cout << "hello from kokkos init_style" << std::endl;
-  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
+//  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
 
 //  /*
 //    MACE requires the full neighbor list AND neighbors of ghost atoms
@@ -501,23 +497,23 @@ std::cout << "hello from kokkos init_style" << std::endl;
 //    neighbor->add_request(this, NeighConst::REQ_FULL);
 //  }
 
-  if (host_flag) {
-    PairMACE::init_style();
-    return;
-  }
-std::cout << "after mace init" << std::endl;
-
-  // neighbor list request for KOKKOS
-
-  neighflag = lmp->kokkos->neighflag;
-
-  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
-  //if (domain_decomposition) {
-    //auto request = neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
-    request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-                             !std::is_same<DeviceType,LMPDeviceType>::value);
-    request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-    if (neighflag == FULL) request->enable_full();
+//  if (host_flag) {
+//    PairMACE::init_style();
+//    return;
+//  }
+//std::cout << "after mace init" << std::endl;
+//
+//  // neighbor list request for KOKKOS
+//
+//  neighflag = lmp->kokkos->neighflag;
+//
+//  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
+//  //if (domain_decomposition) {
+//    //auto request = neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
+//    request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+//                             !std::is_same<DeviceType,LMPDeviceType>::value);
+//    request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+//    if (neighflag == FULL) request->enable_full();
   //} else {
   //  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
   //  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
@@ -531,6 +527,11 @@ std::cout << "after mace init" << std::endl;
 
 
 
+  PairMACE::init_style();
+  auto request = neighbor->find_request(this);
+  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+                           !std::is_same<DeviceType,LMPDeviceType>::value);
+  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
 
 
 
diff --git a/src/KOKKOS/pair_mace_kokkos.h b/src/KOKKOS/pair_mace_kokkos.h
index ab658b70f49..72b041f9e90 100644
--- a/src/KOKKOS/pair_mace_kokkos.h
+++ b/src/KOKKOS/pair_mace_kokkos.h
@@ -39,13 +39,12 @@ class PairMACEKokkos : public PairMACE {
 
  public:
 
-  enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF};
+  //enum {EnabledNeighFlags=FULL};
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
   PairMACEKokkos(class LAMMPS *);
   ~PairMACEKokkos() override;
   void compute(int, int) override;
-  void settings(int, char **) override;
   void coeff(int, char **) override;
   void init_style() override;
   double init_one(int, int) override;

From e7ef916606e9853c7ff13e1007f5935d229cead2 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Thu, 30 Mar 2023 11:09:23 +0100
Subject: [PATCH 29/51] kokkos.

---
 src/KOKKOS/npair_kokkos.cpp     |   2 +-
 src/KOKKOS/pair_mace_kokkos.cpp | 268 ++++----------------------------
 src/KOKKOS/pair_mace_kokkos.h   |  11 --
 3 files changed, 31 insertions(+), 250 deletions(-)

diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index ee8da12c528..5f82b4e0690 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -246,7 +246,7 @@ void NPairKokkos<DeviceType,HALF,NEWTON,GHOST,TRI,SIZE>::build(NeighList *list_)
       // assumes newton off
 
       NPairKokkosBuildFunctorGhost<DeviceType,HALF> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
-#ifdef LMP_KOKKOS_GPU
+#if 0
       if (ExecutionSpaceFromDevice<DeviceType>::space == Device) {
         int team_size = atoms_per_bin*factor;
         int team_size_max = Kokkos::TeamPolicy<DeviceType>(team_size,Kokkos::AUTO).team_size_max(f,Kokkos::ParallelForTag());
diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index 521a48fc34c..c79eed1b73a 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -18,7 +18,6 @@
 
 #include "pair_mace_kokkos.h"
 
-// TODO: do i need all these?
 #include "atom_kokkos.h"
 #include "atom_masks.h"
 #include "error.h"
@@ -42,18 +41,8 @@ using namespace LAMMPS_NS;
 template<class DeviceType>
 PairMACEKokkos<DeviceType>::PairMACEKokkos(LAMMPS *lmp) : PairMACE(lmp)
 {
-std::cout << "hello from kokkos mace constructor" << std::endl;
   no_virial_fdotr_compute = 1;
 
-  //kokkosable = 1;
-  //atomKK = (AtomKokkos *) atom;
-  //execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  //datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
-  //datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
-  // pair_pace_kokkos has these instead
-  //datamask_read = EMPTY_MASK;
-  //datamask_modify = EMPTY_MASK;
-
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
@@ -69,17 +58,6 @@ template<class DeviceType>
 PairMACEKokkos<DeviceType>::~PairMACEKokkos()
 {
   if (copymode) return;
-
-  // from lj_cut_kokkos
-  //if (allocated) {
-  //  memoryKK->destroy_kokkos(k_eatom,eatom);
-  //  memoryKK->destroy_kokkos(k_vatom,vatom);
-  //  memoryKK->destroy_kokkos(k_cutsq,cutsq);
-  //}
-
-  // from pair_pace_kokkos
-  //memoryKK->destroy_kokkos(k_eatom,eatom);
-  //memoryKK->destroy_kokkos(k_vatom,vatom);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -105,17 +83,6 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
     error->all(FLERR, "ERROR: kokkos eflag_atom not implemented.");
   }
 
-//  if (eflag_atom) {
-//    memoryKK->destroy_kokkos(k_eatom,eatom);
-//    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
-//    d_eatom = k_eatom.view<DeviceType>();
-//  }
-//  if (vflag_atom) {
-//    memoryKK->destroy_kokkos(k_vatom,vatom);
-//    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
-//    d_vatom = k_vatom.view<DeviceType>();
-//  }
-
   int nlocal = atom->nlocal;
   auto r_max_squared = this->r_max_squared;
   auto h0 = domain->h[0];
@@ -136,11 +103,13 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
     k_lammps_atomic_numbers_mirror(i) = lammps_atomic_numbers[i];
   }
+  Kokkos::deep_copy(k_lammps_atomic_numbers, k_lammps_atomic_numbers_mirror);
   auto k_mace_atomic_numbers = Kokkos::View<int64_t*,DeviceType>("k_mace_atomic_numbers",mace_atomic_numbers.size());
   auto k_mace_atomic_numbers_mirror = Kokkos::create_mirror_view(k_mace_atomic_numbers);
   for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
     k_mace_atomic_numbers_mirror(i) = mace_atomic_numbers[i];
   }
+  Kokkos::deep_copy(k_mace_atomic_numbers, k_mace_atomic_numbers_mirror);
   auto mace_atomic_numbers_size = mace_atomic_numbers.size();
 
   // atom map
@@ -149,7 +118,6 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   auto k_map_hash = atomKK->k_map_hash;
   k_map_array.template sync<DeviceType>();
 
-
   auto x = atomKK->k_x.view<DeviceType>();
 //  c_x = atomKK->k_x.view<DeviceType>();
   auto f = atomKK->k_f.view<DeviceType>();
@@ -220,10 +188,12 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   }, n_edges);
   // make first_edge vector to help with parallelizing following loop
   auto k_first_edge = Kokkos::View<int64_t*,DeviceType>("k_first_edge", n_nodes);  // initialized to zero
-  Kokkos::parallel_for(n_nodes-1, KOKKOS_LAMBDA(const int ii) {
-    k_first_edge(ii+1) = k_first_edge(ii) + k_n_edges_vec(ii);
+  // TODO: this is serial to avoid race ... is there something better?
+  Kokkos::parallel_for(1, KOKKOS_LAMBDA(const int i) {
+    for (int ii=0; ii<n_nodes-1; ++ii) {
+      k_first_edge(ii+1) = k_first_edge(ii) + k_n_edges_vec(ii);
+    }
   });
-  // fill edge_index and unit_shifts tensors
   auto k_edge_index = Kokkos::View<int64_t**,Kokkos::LayoutRight,DeviceType>("k_edge_index", 2, n_edges);
   auto k_unit_shifts = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_unit_shifts", n_edges);
   auto k_shifts = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_shifts", n_edges);
@@ -268,7 +238,6 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
         const double rsq = delx*delx + dely*dely + delz*delz;
         if (rsq < r_max_squared) {
           k_edge_index(0,k) = i;
-          //int j_local = atom->map(tag(j));
           int j_local = AtomKokkos::map_kokkos<DeviceType>(tag(j),map_style,k_map_array,k_map_hash);
           k_edge_index(1,k) = j_local;
           double shiftx = x(j,0) - x(j_local,0);
@@ -314,7 +283,6 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
         t = j+1;
       }
     }
-    //if (t==-1) error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
     k_node_attrs(i,t-1) = 1.0;
   });
   auto node_attrs = torch::from_blob(
@@ -334,7 +302,6 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
     torch::TensorOptions().dtype(torch::kBool).device(device));
 
   // TODO: why is batch of size n_nodes?
-  // TODO: add device?
   auto batch = torch::zeros({n_nodes}, torch::TensorOptions().dtype(torch::kInt64).device(device));
   auto energy = torch::empty({1}, torch::TensorOptions().dtype(torch_float_dtype).device(device));
   auto forces = torch::empty({n_nodes,3}, torch::TensorOptions().dtype(torch_float_dtype).device(device));
@@ -357,25 +324,22 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-std::cout << "batch" << batch.to("cpu") << std::endl;
-std::cout << "cell" << cell.to("cpu") << std::endl;
+
+//std::cout << "batch" << batch.to("cpu") << std::endl;
+//std::cout << "cell" << cell.to("cpu") << std::endl;
 //std::cout << "edge_index" << edge_index.to("cpu") << std::endl;
-// energy
-// forces
 //std::cout << "node_attrs" << node_attrs.to("cpu") << std::endl;
-std::cout << "positions" << positions.to("cpu") << std::endl;
-std::cout << "ptr" << ptr.to("cpu") << std::endl;
-std::cout << "shifts" << shifts.to("cpu") << std::endl;
-std::cout << "unit_shifts" << unit_shifts.to("cpu") << std::endl;
-std::cout << "weight" << weight.to("cpu") << std::endl;
-std::cout << "mask" << mask.to("cpu") << std::endl;
-  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
+//std::cout << "positions" << positions.to("cpu") << std::endl;
+//std::cout << "ptr" << ptr.to("cpu") << std::endl;
+//std::cout << "shifts" << shifts.to("cpu") << std::endl;
+//std::cout << "unit_shifts" << unit_shifts.to("cpu") << std::endl;
+//std::cout << "weight" << weight.to("cpu") << std::endl;
+//std::cout << "mask" << mask.to("cpu") << std::endl;
 
-  //std::cout << "node energy: " << typeid(output.at("node_energy").toTensor()).name() << std::endl;
-  //std::cout << "node energy: " << typeid(output.at("node_energy").toTensor().data_ptr()).name() << std::endl;
-  //std::cout << "forces: " << typeid(output.at("forces").toTensor()).name() << std::endl;
-  //std::cout << "forces: " << typeid(output.at("forces").toTensor().data_ptr()).name() << std::endl;
+  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
 
+//std::cout << "energy: " << output.at("energy").toTensor().to("cpu") << std::endl;
+//std::cout << "node_energy: " << output.at("node_energy").toTensor().to("cpu") << std::endl;
 
   // mace energy
   //   -> sum of site energies of local atoms
@@ -401,66 +365,18 @@ std::cout << "mask" << mask.to("cpu") << std::endl;
     f(i,1) = k_forces(i,1);
     f(i,2) = k_forces(i,2);
   });
-//
-////  // mace site energies
-////  //   -> local atoms only
-////  if (eflag_atom) {
-////    auto node_energy = output.at("node_energy").toTensor();
-////    #pragma omp parallel for
-////    for (int ii=0; ii<list->inum; ++ii) {
-////      int i = list->ilist[ii];
-////      eatom[i] = node_energy[i].item<double>();
-////    }
-////  }
-////
-//  // mace virials (local atoms only)
-//  //   -> derivatives of sum of site energies of local atoms
-//  if (vflag_global) {
-//    auto vir = output.at("virials").toTensor();
-//    virial[0] = vir[0][0][0].item<double>();
-//    virial[1] = vir[0][1][1].item<double>();
-//    virial[2] = vir[0][2][2].item<double>();
-//    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
-//    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
-//    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
-//  }
-////
-////  // mace site virials
-////  //   -> not available
-////  if (vflag_atom) {
-////    error->all(FLERR, "ERROR: pair_mace does not support vflag_atom.");
-////  }
-//
-//
-//  //copymode = 1;
-//
-//  //EV_FLOAT ev = pair_compute<PairLJCutKokkos<DeviceType>,void >(this,(NeighListKokkos<DeviceType>*)list);
-//
-//  //if (eflag_global) eng_vdwl += ev.evdwl;
-//  //if (vflag_global) {
-//  //  virial[0] += ev.v[0];
-//  //  virial[1] += ev.v[1];
-//  //  virial[2] += ev.v[2];
-//  //  virial[3] += ev.v[3];
-//  //  virial[4] += ev.v[4];
-//  //  virial[5] += ev.v[5];
-//  //}
-//
-//  //if (eflag_atom) {
-//  //  k_eatom.template modify<DeviceType>();
-//  //  k_eatom.template sync<LMPHostType>();
-//  //}
-//
-//  //if (vflag_atom) {
-//  //  k_vatom.template modify<DeviceType>();
-//  //  k_vatom.template sync<LMPHostType>();
-//  //}
-//
-//  copymode = 0;
-
-std::cout << "energy: " << output.at("energy").toTensor().to("cpu") << std::endl;
-std::cout << "node_energy: " << output.at("node_energy").toTensor().to("cpu") << std::endl;
 
+  // mace virials (local atoms only)
+  //   -> derivatives of sum of site energies of local atoms
+  if (vflag_global) {
+    auto vir = output.at("virials").toTensor().to("cpu");
+    virial[0] = vir[0][0][0].item<double>();
+    virial[1] = vir[0][1][1].item<double>();
+    virial[2] = vir[0][2][2].item<double>();
+    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
+    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -468,160 +384,36 @@ std::cout << "node_energy: " << output.at("node_energy").toTensor().to("cpu") <<
 template<class DeviceType>
 void PairMACEKokkos<DeviceType>::coeff(int narg, char **arg)
 {
-std::cout << "hello from kokkos coeff" << std::endl;
   if (!allocated) allocate();
-std::cout << "allocated: " << allocated << std::endl;
   PairMACE::coeff(narg,arg);
-std::cout << "allocated: " << allocated << std::endl;
 }
 
 template<class DeviceType>
 void PairMACEKokkos<DeviceType>::init_style()
 {
-std::cout << "hello from kokkos init_style" << std::endl;
-//  if (force->newton_pair == 0) error->all(FLERR, "ERROR: Pair style mace requires newton pair on.");
-
-//  /*
-//    MACE requires the full neighbor list AND neighbors of ghost atoms
-//    it appears that:
-//      * without REQ_GHOST
-//           list->gnum == 0
-//           list->ilist does not include ghost atoms, but the jlists do
-//      * with REQ_GHOST
-//           list->gnum == atom->nghost
-//           list->ilist includes ghost atoms
-//  */
-//  if (domain_decomposition) {
-//    neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
-//  } else {
-//    neighbor->add_request(this, NeighConst::REQ_FULL);
-//  }
-
-//  if (host_flag) {
-//    PairMACE::init_style();
-//    return;
-//  }
-//std::cout << "after mace init" << std::endl;
-//
-//  // neighbor list request for KOKKOS
-//
-//  neighflag = lmp->kokkos->neighflag;
-//
-//  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
-//  //if (domain_decomposition) {
-//    //auto request = neighbor->add_request(this, NeighConst::REQ_FULL | NeighConst::REQ_GHOST);
-//    request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-//                             !std::is_same<DeviceType,LMPDeviceType>::value);
-//    request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-//    if (neighflag == FULL) request->enable_full();
-  //} else {
-  //  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
-  //  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-  //                           !std::is_same<DeviceType,LMPDeviceType>::value);
-  //  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-  //}
-//  if (neighflag == FULL)
-//    error->all(FLERR,"Must use half neighbor list style with pair pace/kk");
-
-
-
-
-
   PairMACE::init_style();
   auto request = neighbor->find_request(this);
   request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
                            !std::is_same<DeviceType,LMPDeviceType>::value);
   request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-
-
-
-
-
-
-//  // TODO: from lj_cut, possibly delete
-//  // adjust neighbor list request for KOKKOS
-//  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
-//  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-//                           !std::is_same<DeviceType,LMPDeviceType>::value);
-//  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-//
-//  neighflag = lmp->kokkos->neighflag;
-//  auto request = neighbor->find_request(this);
-//std::cout << "before request" << std::endl;
-//  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-//                           !std::is_same<DeviceType,LMPDeviceType>::value);
-//std::cout << "before request" << std::endl;
-//  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-//std::cout << "after request" << std::endl;
-//  if (neighflag == FULL)
-//    std::cout << "REQUESTING FULL LIST." << std::endl;
-//  if (neighflag == FULL) request->enable_full();
-
-  //auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
-  //request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
-  //                         !std::is_same<DeviceType,LMPDeviceType>::value);
-  //request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-  //if (neighflag == FULL)
-  //  error->all(FLERR,"Must use half neighbor list style with pair pace/kk");
-
-std::cout << "goodbye from init_style" << std::endl;
 }
 
 template<class DeviceType>
 double PairMACEKokkos<DeviceType>::init_one(int i, int j)
 {
-std::cout << "hello from kokkos init_one" << std::endl;
   double cutone = PairMACE::init_one(i,j);
-
-  //k_scale.h_view(i,j) = k_scale.h_view(j,i) = scale[i][j];
-  //k_scale.template modify<LMPHostType>();
-
   k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
   k_cutsq.template modify<LMPHostType>();
-
   return cutone;
-
-  //// to account for message passing, require cutoff of n_layers * r_max
-  //return num_interactions*model.attr("r_max").toTensor().item<double>();
-
-//  k_params.h_view(i,j).lj1 = lj1[i][j];
-//  k_params.h_view(i,j).lj2 = lj2[i][j];
-//  k_params.h_view(i,j).lj3 = lj3[i][j];
-//  k_params.h_view(i,j).lj4 = lj4[i][j];
-//  k_params.h_view(i,j).offset = offset[i][j];
-//  k_params.h_view(i,j).cutsq = cutone*cutone;
-//  k_params.h_view(j,i) = k_params.h_view(i,j);
-//  if (i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {
-//    m_params[i][j] = m_params[j][i] = k_params.h_view(i,j);
-//    m_cutsq[j][i] = m_cutsq[i][j] = cutone*cutone;
-//  }
-//
-//  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
-//  k_cutsq.template modify<LMPHostType>();
-//  k_params.template modify<LMPHostType>();
-
-//  return cutone;
 }
 
 template<class DeviceType>
 void PairMACEKokkos<DeviceType>::allocate()
 {
-std::cout << "hello from kokkos allocate" << std::endl;
   PairMACE::allocate();
-
   int n = atom->ntypes + 1;
-  //MemKK::realloc_kokkos(d_map, "pace:map", n);
-
   MemKK::realloc_kokkos(k_cutsq, "mace:cutsq", n, n);
   d_cutsq = k_cutsq.template view<DeviceType>();
-
-  // TODO: from lj_cut, possibly delete
-  //int n = atom->ntypes;
-  //memory->destroy(cutsq);
-  //memoryKK->create_kokkos(k_cutsq,cutsq,n+1,n+1,"pair:cutsq");
-  //d_cutsq = k_cutsq.template view<DeviceType>();
-  //k_params = Kokkos::DualView<params_lj**,Kokkos::LayoutRight,DeviceType>("PairLJCut::params",n+1,n+1);
-  //params = k_params.template view<DeviceType>();
 }
 
 namespace LAMMPS_NS {
diff --git a/src/KOKKOS/pair_mace_kokkos.h b/src/KOKKOS/pair_mace_kokkos.h
index 72b041f9e90..e2a40fe1902 100644
--- a/src/KOKKOS/pair_mace_kokkos.h
+++ b/src/KOKKOS/pair_mace_kokkos.h
@@ -39,7 +39,6 @@ class PairMACEKokkos : public PairMACE {
 
  public:
 
-  //enum {EnabledNeighFlags=FULL};
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
   PairMACEKokkos(class LAMMPS *);
@@ -52,17 +51,7 @@ class PairMACEKokkos : public PairMACE {
 
  protected:
 
-  // kokkos stuff
   int host_flag;
-  int neighflag;
-  //typename AT::t_x_array_randomread x;
-  //typename AT::t_x_array c_x;
-  //typename AT::t_f_array f;
-  //typename AT::t_int_1d_randomread type;
-
-  
-  //Kokkos::View<double*[3], DeviceType> k_positions;
-
   typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
   tdual_fparams k_cutsq;
   typedef Kokkos::View<F_FLOAT**, DeviceType> t_fparams;

From ae8d33b1fdba40bc4559fb796db2c0c3dbbcf7af Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Fri, 21 Apr 2023 18:04:19 +0100
Subject: [PATCH 30/51] move some things out of compute.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 37 +++++++++++++++++++--------------
 src/KOKKOS/pair_mace_kokkos.h   |  5 +++++
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index c79eed1b73a..c8e33988f30 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -98,19 +98,9 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   auto hinv4 = domain->h_inv[4];
   auto hinv5 = domain->h_inv[5];
 
-  auto k_lammps_atomic_numbers = Kokkos::View<int64_t*,DeviceType>("k_lammps_atomic_numbers",lammps_atomic_numbers.size());
-  auto k_lammps_atomic_numbers_mirror = Kokkos::create_mirror_view(k_lammps_atomic_numbers);
-  for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
-    k_lammps_atomic_numbers_mirror(i) = lammps_atomic_numbers[i];
-  }
-  Kokkos::deep_copy(k_lammps_atomic_numbers, k_lammps_atomic_numbers_mirror);
-  auto k_mace_atomic_numbers = Kokkos::View<int64_t*,DeviceType>("k_mace_atomic_numbers",mace_atomic_numbers.size());
-  auto k_mace_atomic_numbers_mirror = Kokkos::create_mirror_view(k_mace_atomic_numbers);
-  for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
-    k_mace_atomic_numbers_mirror(i) = mace_atomic_numbers[i];
-  }
-  Kokkos::deep_copy(k_mace_atomic_numbers, k_mace_atomic_numbers_mirror);
-  auto mace_atomic_numbers_size = mace_atomic_numbers.size();
+  auto _k_lammps_atomic_numbers = k_lammps_atomic_numbers;
+  auto _k_mace_atomic_numbers = k_mace_atomic_numbers;
+  auto _mace_atomic_numbers_size = mace_atomic_numbers_size;
 
   // atom map
   auto map_style = atom->map_style;
@@ -272,14 +262,14 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 
   // ----- node_attrs -----
   // node_attrs is one-hot encoding for atomic numbers
-  int n_node_feats = mace_atomic_numbers_size;
+  int n_node_feats = _mace_atomic_numbers_size;
   auto k_node_attrs = Kokkos::View<double**,Kokkos::LayoutRight,DeviceType>("k_node_attrs", n_nodes, n_node_feats);
   Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
     const int i = d_ilist(ii);
     const int lammps_type = type(i);
     int t = -1;
-    for (int j=0; j<mace_atomic_numbers_size; ++j) {
-      if (k_mace_atomic_numbers(j)==k_lammps_atomic_numbers(lammps_type-1)) {
+    for (int j=0; j<_mace_atomic_numbers_size; ++j) {
+      if (_k_mace_atomic_numbers(j)==_k_lammps_atomic_numbers(lammps_type-1)) {
         t = j+1;
       }
     }
@@ -386,6 +376,21 @@ void PairMACEKokkos<DeviceType>::coeff(int narg, char **arg)
 {
   if (!allocated) allocate();
   PairMACE::coeff(narg,arg);
+
+  // new
+  k_lammps_atomic_numbers = Kokkos::View<int64_t*,DeviceType>("k_lammps_atomic_numbers",lammps_atomic_numbers.size());
+  auto k_lammps_atomic_numbers_mirror = Kokkos::create_mirror_view(k_lammps_atomic_numbers);
+  for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
+    k_lammps_atomic_numbers_mirror(i) = lammps_atomic_numbers[i];
+  }
+  Kokkos::deep_copy(k_lammps_atomic_numbers, k_lammps_atomic_numbers_mirror);
+  k_mace_atomic_numbers = Kokkos::View<int64_t*,DeviceType>("k_mace_atomic_numbers",mace_atomic_numbers.size());
+  auto k_mace_atomic_numbers_mirror = Kokkos::create_mirror_view(k_mace_atomic_numbers);
+  for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
+    k_mace_atomic_numbers_mirror(i) = mace_atomic_numbers[i];
+  }
+  Kokkos::deep_copy(k_mace_atomic_numbers, k_mace_atomic_numbers_mirror);
+  mace_atomic_numbers_size = mace_atomic_numbers.size();
 }
 
 template<class DeviceType>
diff --git a/src/KOKKOS/pair_mace_kokkos.h b/src/KOKKOS/pair_mace_kokkos.h
index e2a40fe1902..8aa0a8e62b4 100644
--- a/src/KOKKOS/pair_mace_kokkos.h
+++ b/src/KOKKOS/pair_mace_kokkos.h
@@ -57,6 +57,11 @@ class PairMACEKokkos : public PairMACE {
   typedef Kokkos::View<F_FLOAT**, DeviceType> t_fparams;
   t_fparams d_cutsq;
 
+  // new
+  Kokkos::View<int64_t*,DeviceType> k_lammps_atomic_numbers;
+  Kokkos::View<int64_t*,DeviceType> k_mace_atomic_numbers;
+  int mace_atomic_numbers_size;
+
 };
 }    // namespace LAMMPS_NS
 

From 11150d52f87897445b459b28804bfbb0d0a1c802 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Wed, 26 Apr 2023 11:30:11 +0100
Subject: [PATCH 31/51] add labels to kokkos kernels.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index c8e33988f30..2152a122fff 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -127,7 +127,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
     n_nodes = atom->nlocal;
   }
   auto k_positions = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_positions", n_nodes);
-  Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA (const int i) {
+  Kokkos::parallel_for("PairMACEKokkos: Fill k_positions.", n_nodes, KOKKOS_LAMBDA (const int i) {
     k_positions(i,0) = x(i,0);
     k_positions(i,1) = x(i,1);
     k_positions(i,2) = x(i,2);
@@ -153,7 +153,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   // ----- edge_index and unit_shifts -----
   // count total number of edges
   auto k_n_edges_vec = Kokkos::View<int64_t*,DeviceType>("k_n_edges_vec", n_nodes);
-  Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA (const int ii) {
+  Kokkos::parallel_for("PairMACEKokkos: Fill k_n_edges_vec.", n_nodes, KOKKOS_LAMBDA (const int ii) {
     const int i = d_ilist(ii);
     const double xtmp = x(i,0);
     const double ytmp = x(i,1);
@@ -173,13 +173,13 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   // WARNING: if n_edges remains 0 (e.g., because atoms too far apart)
   // strange things happen on gpu
   int64_t n_edges = 0;
-  Kokkos::parallel_reduce(n_nodes, KOKKOS_LAMBDA(const int ii, int64_t& n_edges) {
+  Kokkos::parallel_reduce("PairMACEKokkos: Determine n_edges.", n_nodes, KOKKOS_LAMBDA(const int ii, int64_t& n_edges) {
     n_edges += k_n_edges_vec(ii);
   }, n_edges);
   // make first_edge vector to help with parallelizing following loop
   auto k_first_edge = Kokkos::View<int64_t*,DeviceType>("k_first_edge", n_nodes);  // initialized to zero
   // TODO: this is serial to avoid race ... is there something better?
-  Kokkos::parallel_for(1, KOKKOS_LAMBDA(const int i) {
+  Kokkos::parallel_for("PairMACEKokkos: Fill k_first_edge.", 1, KOKKOS_LAMBDA(const int i) {
     for (int ii=0; ii<n_nodes-1; ++ii) {
       k_first_edge(ii+1) = k_first_edge(ii) + k_n_edges_vec(ii);
     }
@@ -190,7 +190,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 
   if (domain_decomposition) {
 
-    Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
+    Kokkos::parallel_for("PairMACEKokkos: Fill edge_index (using domain decomposition).", n_nodes, KOKKOS_LAMBDA(const int ii) {
       const int i = d_ilist(ii);
       const double xtmp = x(i,0);
       const double ytmp = x(i,1);
@@ -213,7 +213,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 
   } else {
 
-    Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
+    Kokkos::parallel_for("PairMACEKokkos: Fill edge_index (no domain decomposition).", n_nodes, KOKKOS_LAMBDA(const int ii) {
       const int i = d_ilist(ii);
       const double xtmp = x(i,0);
       const double ytmp = x(i,1);
@@ -264,7 +264,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   // node_attrs is one-hot encoding for atomic numbers
   int n_node_feats = _mace_atomic_numbers_size;
   auto k_node_attrs = Kokkos::View<double**,Kokkos::LayoutRight,DeviceType>("k_node_attrs", n_nodes, n_node_feats);
-  Kokkos::parallel_for(n_nodes, KOKKOS_LAMBDA(const int ii) {
+  Kokkos::parallel_for("PairMACEKokkos: Fill k_node_attrs.", n_nodes, KOKKOS_LAMBDA(const int ii) {
     const int i = d_ilist(ii);
     const int lammps_type = type(i);
     int t = -1;
@@ -282,7 +282,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
 
   // ----- mask for ghost -----
   Kokkos::View<bool*,DeviceType> k_mask("k_mask", n_nodes);
-  Kokkos::parallel_for(nlocal, KOKKOS_LAMBDA(const int ii) {
+  Kokkos::parallel_for("PairMACEKokkos: Fill k_mask.", nlocal, KOKKOS_LAMBDA(const int ii) {
     const int i = d_ilist(ii);
     k_mask(i) = true;
   });
@@ -338,7 +338,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
     auto node_energy_ptr = static_cast<double*>(node_energy.data_ptr());
     auto k_node_energy = Kokkos::View<double*,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(node_energy_ptr,n_nodes);
     eng_vdwl = 0.0;
-    Kokkos::parallel_reduce(nlocal, KOKKOS_LAMBDA(const int ii, double &eng_vdwl) {
+    Kokkos::parallel_reduce("PairMACEKokkos: Accumulate site energies.", nlocal, KOKKOS_LAMBDA(const int ii, double &eng_vdwl) {
       const int i = d_ilist(ii);
       eng_vdwl += k_node_energy(i);
     }, eng_vdwl);
@@ -349,7 +349,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   forces = output.at("forces").toTensor();
   auto forces_ptr = static_cast<double*>(forces.data_ptr());
   auto k_forces = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(forces_ptr,n_nodes);
-  Kokkos::parallel_for(nlocal, KOKKOS_LAMBDA(const int ii) {
+  Kokkos::parallel_for("PairMACEKokkos: Extract k_forces.", nlocal, KOKKOS_LAMBDA(const int ii) {
     const int i = d_ilist(ii);
     f(i,0) = k_forces(i,0);
     f(i,1) = k_forces(i,1);

From 4195b189d9bb1c6f85e6b359d82f8e1311750b06 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 30 May 2023 08:11:19 +0100
Subject: [PATCH 32/51] improvements.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 46 ++++++++++++---------------------
 1 file changed, 17 insertions(+), 29 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index 2152a122fff..37ed8716ce4 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -31,7 +31,6 @@
 #include "update.h"
 
 #include <algorithm>
-#include <iostream>
 #include <stdexcept>
 
 using namespace LAMMPS_NS;
@@ -74,14 +73,12 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   auto d_neighbors = k_list->d_neighbors;
   auto d_ilist = k_list->d_ilist;
 
-  if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
-  if (domain_decomposition) {
-    if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
-  }
-
-  if (eflag_atom || vflag_atom) {
-    error->all(FLERR, "ERROR: kokkos eflag_atom not implemented.");
-  }
+  if (atom->nlocal != list->inum)
+    error->all(FLERR, "ERROR: nlocal != inum.");
+  if (domain_decomposition && (atom->nghost != list->gnum))
+    error->all(FLERR, "ERROR: nghost != gnum.");
+  if (eflag_atom || vflag_atom)
+    error->all(FLERR, "ERROR: mace/kokkos eflag_atom and/or vflag_atom not implemented.");
 
   int nlocal = atom->nlocal;
   auto r_max_squared = this->r_max_squared;
@@ -122,8 +119,8 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   } else {
     // normally, ghost atoms are included in the graph as independent
     // nodes, as required when the local domain does not have PBC.
-    // however, in no_domain_decomposition mode, ghost atoms are known to
-    // be shifted versions of local atoms.
+    // however, in no_domain_decomposition mode, ghost atoms are simply
+    // shifted versions of local atoms.
     n_nodes = atom->nlocal;
   }
   auto k_positions = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType>("k_positions", n_nodes);
@@ -314,22 +311,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-
-//std::cout << "batch" << batch.to("cpu") << std::endl;
-//std::cout << "cell" << cell.to("cpu") << std::endl;
-//std::cout << "edge_index" << edge_index.to("cpu") << std::endl;
-//std::cout << "node_attrs" << node_attrs.to("cpu") << std::endl;
-//std::cout << "positions" << positions.to("cpu") << std::endl;
-//std::cout << "ptr" << ptr.to("cpu") << std::endl;
-//std::cout << "shifts" << shifts.to("cpu") << std::endl;
-//std::cout << "unit_shifts" << unit_shifts.to("cpu") << std::endl;
-//std::cout << "weight" << weight.to("cpu") << std::endl;
-//std::cout << "mask" << mask.to("cpu") << std::endl;
-
-  auto output = model.forward({input, mask, true, true, false}).toGenericDict();
-
-//std::cout << "energy: " << output.at("energy").toTensor().to("cpu") << std::endl;
-//std::cout << "node_energy: " << output.at("node_energy").toTensor().to("cpu") << std::endl;
+  auto output = model.forward({input, mask, true, bool(vflag_global), false}).toGenericDict();
 
   // mace energy
   //   -> sum of site energies of local atoms
@@ -359,14 +341,20 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   // mace virials (local atoms only)
   //   -> derivatives of sum of site energies of local atoms
   if (vflag_global) {
+    // TODO: is this cpu transfer necessary?
     auto vir = output.at("virials").toTensor().to("cpu");
+    // caution: lammps does not use voigt ordering
     virial[0] = vir[0][0][0].item<double>();
     virial[1] = vir[0][1][1].item<double>();
     virial[2] = vir[0][2][2].item<double>();
-    virial[3] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+    virial[3] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
     virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
-    virial[5] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
+    virial[5] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
   }
+
+  // TODO: investigate this
+  // Appears to be important for dumps and probably more
+  atomKK->modified(execution_space,F_MASK);
 }
 
 /* ---------------------------------------------------------------------- */

From eb867b146010b2f1a9607f7297ff0472187eac9d Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 30 May 2023 08:26:01 +0100
Subject: [PATCH 33/51] mergefix.

---
 cmake/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 66963a7a566..e3fc660d17a 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -521,7 +521,7 @@ else()
 endif()
 
 foreach(PKG_WITH_INCL KSPACE PYTHON ML-IAP VORONOI COLVARS ML-HDNNP MDI MOLFILE NETCDF
-        PLUMED QMMM ML-QUIP SCAFACOS MACHDYN VTK KIM LATTE MSCG COMPRESS ML-PACE ML-MACE LEPTON)
+        PLUMED QMMM ML-QUIP SCAFACOS MACHDYN VTK KIM MSCG COMPRESS ML-PACE ML-MACE LEPTON)
   if(PKG_${PKG_WITH_INCL})
     include(Packages/${PKG_WITH_INCL})
   endif()

From 22eb83687ce56da8d9db26a78533219827f1496f Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 30 May 2023 09:22:31 +0100
Subject: [PATCH 34/51] delete debug code.

---
 src/ML-MACE/pair_mace.cpp | 102 ++------------------------------------
 1 file changed, 5 insertions(+), 97 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index dd6821c6397..001bc8213a2 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -30,10 +30,6 @@
 #include <iostream>
 #include <stdexcept>
 
-#ifdef MACE_DEBUG
-#include <chrono>
-#endif
-
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
@@ -55,16 +51,6 @@ void PairMACE::compute(int eflag, int vflag)
 {
   ev_init(eflag, vflag);
 
-std::cout << " " << std::endl;
-
-  #ifdef MACE_DEBUG
-  std::chrono::time_point<std::chrono::high_resolution_clock> t0, t1;
-  #endif
-
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
-
   if (atom->nlocal != list->inum) error->all(FLERR, "ERROR: nlocal != inum.");
   if (domain_decomposition) {
     if (atom->nghost != list->gnum) error->all(FLERR, "ERROR: nghost != gnum.");
@@ -102,15 +88,6 @@ std::cout << " " << std::endl;
   cell[2][1] = domain->h[3];
   cell[2][2] = domain->h[2];
 
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "positions+cell: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
-
   // ----- edge_index and unit_shifts -----
   // count total number of edges
   int n_edges = 0;
@@ -142,13 +119,6 @@ std::cout << " " << std::endl;
   for (int ii=0; ii<n_nodes-1; ++ii) {
     first_edge[ii+1] = first_edge[ii] + n_edges_vec[ii];
   }
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "edge count: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
   // fill edge_index and unit_shifts tensors
   auto edge_index = torch::empty({2,n_edges}, torch::dtype(torch::kInt64));
   auto unit_shifts = torch::zeros({n_edges,3}, torch_float_dtype);
@@ -193,14 +163,6 @@ std::cout << " " << std::endl;
       }
     }
   }
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "edge fill: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
 
   // ----- node_attrs -----
   // node_attrs is one-hot encoding for atomic numbers
@@ -221,15 +183,6 @@ std::cout << " " << std::endl;
     node_attrs[i][mace_type(atom->type[i])-1] = 1.0;
   }
 
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "node attrs: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
-
   // ----- mask for ghost -----
   auto mask = torch::zeros(n_nodes, torch::dtype(torch::kBool));
   #pragma omp parallel for
@@ -247,15 +200,7 @@ std::cout << " " << std::endl;
   ptr[1] = n_nodes;
   weight[0] = 1.0;
 
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "other setup: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-
   // transfer data to device
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
   batch = batch.to(device);
   cell = cell.to(device);
   edge_index = edge_index.to(device);
@@ -267,15 +212,8 @@ std::cout << " " << std::endl;
   shifts = shifts.to(device);
   unit_shifts = unit_shifts.to(device);
   weight = weight.to(device);
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "transfer: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-
-  // pack the input, call the model, extract the output
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
+
+  // pack the input, call the model
   c10::Dict<std::string, torch::Tensor> input;
   input.insert("batch", batch);
   input.insert("cell", cell);
@@ -288,38 +226,12 @@ std::cout << " " << std::endl;
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "pack: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
   auto output = model.forward({input, mask.to(device), true, true, false}).toGenericDict();
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "model: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
-  auto node_energy = output.at("node_energy").toTensor().cpu();
-  forces = output.at("forces").toTensor().cpu();
-  auto vir = output.at("virials").toTensor().cpu();
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "transfer back: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-
-  #ifdef MACE_DEBUG
-  t0 = std::chrono::high_resolution_clock::now();
-  #endif
 
   // mace energy
   //   -> sum of site energies of local atoms
   if (eflag_global) {
+    auto node_energy = output.at("node_energy").toTensor().cpu();
     eng_vdwl = 0.0;
     #pragma omp parallel for reduction(+:eng_vdwl)
     for (int ii=0; ii<list->inum; ++ii) {
@@ -330,6 +242,7 @@ std::cout << " " << std::endl;
 
   // mace forces
   //   -> derivatives of total mace energy
+  forces = output.at("forces").toTensor().cpu();
   #pragma omp parallel for
   for (int ii=0; ii<list->inum; ++ii) {
     int i = list->ilist[ii];
@@ -351,6 +264,7 @@ std::cout << " " << std::endl;
   // mace virials (local atoms only)
   //   -> derivatives of sum of site energies of local atoms
   if (vflag_global) {
+    auto vir = output.at("virials").toTensor().cpu();
     virial[0] = vir[0][0][0].item<double>();
     virial[1] = vir[0][1][1].item<double>();
     virial[2] = vir[0][2][2].item<double>();
@@ -364,12 +278,6 @@ std::cout << " " << std::endl;
   if (vflag_atom) {
     error->all(FLERR, "ERROR: pair_mace does not support vflag_atom.");
   }
-
-  #ifdef MACE_DEBUG
-  t1 = std::chrono::high_resolution_clock::now();
-  std::cout << "unpack: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1-t0).count() << std::endl;
-  #endif
-
 }
 
 /* ---------------------------------------------------------------------- */

From 7d0543f3598a0cdc44960af5eec983915c97c624 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Tue, 30 May 2023 15:13:00 +0100
Subject: [PATCH 35/51] tidy.

---
 cmake/Modules/Packages/ML-MACE.cmake | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cmake/Modules/Packages/ML-MACE.cmake b/cmake/Modules/Packages/ML-MACE.cmake
index 4d04b762a1a..a1624a370ef 100644
--- a/cmake/Modules/Packages/ML-MACE.cmake
+++ b/cmake/Modules/Packages/ML-MACE.cmake
@@ -5,4 +5,3 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 
 target_link_libraries(lammps PRIVATE "${TORCH_LIBRARIES}")
 set_property(TARGET lammps PROPERTY CXX_STANDARD 14)
-add_compile_definitions(MACE_DEBUG)

From f96283e0d362d61dd25d93828018bb5bb6d71dcd Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Wed, 31 May 2023 17:29:41 +0100
Subject: [PATCH 36/51] mergefix.

---
 src/ML-MACE/pair_mace.cpp | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 001bc8213a2..20bdd287849 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -231,13 +231,8 @@ void PairMACE::compute(int eflag, int vflag)
   // mace energy
   //   -> sum of site energies of local atoms
   if (eflag_global) {
-    auto node_energy = output.at("node_energy").toTensor().cpu();
-    eng_vdwl = 0.0;
-    #pragma omp parallel for reduction(+:eng_vdwl)
-    for (int ii=0; ii<list->inum; ++ii) {
-      int i = list->ilist[ii];
-      eng_vdwl += node_energy[i].item<double>();
-    }
+    energy = output.at("total_energy_local").toTensor().cpu();
+    eng_vdwl += energy.item<double>();
   }
 
   // mace forces
@@ -254,6 +249,7 @@ void PairMACE::compute(int eflag, int vflag)
   // mace site energies
   //   -> local atoms only
   if (eflag_atom) {
+    auto node_energy = output.at("node_energy").toTensor().cpu();
     #pragma omp parallel for
     for (int ii=0; ii<list->inum; ++ii) {
       int i = list->ilist[ii];

From 1350adb93cc56eb2e1e12db1766cc61e504e8e45 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Wed, 31 May 2023 18:00:37 +0100
Subject: [PATCH 37/51] detect gpu automatically.

---
 src/ML-MACE/pair_mace.cpp | 15 ++++-----------
 src/ML-MACE/pair_mace.h   |  1 -
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 20bdd287849..614f0c1ed47 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -280,18 +280,12 @@ void PairMACE::compute(int eflag, int vflag)
 
 void PairMACE::settings(int narg, char **arg)
 {
-  if (narg > 2) {
+  if (narg > 1) {
     error->all(FLERR, "Too many pair_style arguments for pair_style mace.");
   }
 
-  if (narg >= 1) {
-    if (strcmp(arg[0], "gpu") == 0) {
-      device_type = "gpu";
-    }
-  }
-
-  if (narg >= 2) {
-    if (strcmp(arg[1], "no_domain_decomposition") == 0) {
+  if (narg == 1) {
+    if (strcmp(arg[0], "no_domain_decomposition") == 0) {
       domain_decomposition = false;
       // TODO: add check against MPI rank
     }
@@ -306,12 +300,11 @@ void PairMACE::coeff(int narg, char **arg)
 
   if (!allocated) allocate();
 
-  if (device_type == "cpu") {
+  if (!torch::cuda::is_available()) {
     device = c10::Device(torch::kCPU);
   } else {
     int rank;
     MPI_Comm_rank(world, &rank);
-    std::cout << "MPI rank: " << rank << std::endl;
     device = c10::Device(torch::kCUDA,rank);
   }
 
diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
index 9616d18286e..57b60e2bb17 100644
--- a/src/ML-MACE/pair_mace.h
+++ b/src/ML-MACE/pair_mace.h
@@ -47,7 +47,6 @@ class PairMACE : public Pair {
 
  protected:
 
-  std::string device_type = "cpu";
   bool domain_decomposition = true;
   torch::Device device = torch::kCPU;
   torch::jit::script::Module model;

From 38500ff5c5adeb47fafa2b367cf6777b3167928f Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Fri, 2 Jun 2023 10:30:09 +0100
Subject: [PATCH 38/51] bugfix.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 2 +-
 src/ML-MACE/pair_mace.cpp       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index 37ed8716ce4..27e509164b6 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -311,7 +311,7 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-  auto output = model.forward({input, mask, true, bool(vflag_global), false}).toGenericDict();
+  auto output = model.forward({input, mask, bool(vflag_global)}).toGenericDict();
 
   // mace energy
   //   -> sum of site energies of local atoms
diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 614f0c1ed47..de97351c8ef 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -226,7 +226,7 @@ void PairMACE::compute(int eflag, int vflag)
   input.insert("shifts", shifts);
   input.insert("unit_shifts", unit_shifts);
   input.insert("weight", weight);
-  auto output = model.forward({input, mask.to(device), true, true, false}).toGenericDict();
+  auto output = model.forward({input, mask.to(device), bool(vflag_global)}).toGenericDict();
 
   // mace energy
   //   -> sum of site energies of local atoms

From 5dbc52d8a3e9b5ef321d2f157c052bb3812814fb Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Wed, 7 Jun 2023 23:23:41 +0100
Subject: [PATCH 39/51] add cuda message.

---
 src/ML-MACE/pair_mace.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index de97351c8ef..e44645bdd4a 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -301,8 +301,10 @@ void PairMACE::coeff(int narg, char **arg)
   if (!allocated) allocate();
 
   if (!torch::cuda::is_available()) {
+    std::cout << "CUDA unavailable, setting device type to torch::kCPU." << std::endl;
     device = c10::Device(torch::kCPU);
   } else {
+    std::cout << "CUDA found, setting device type to torch::kCUDA." << std::endl;
     int rank;
     MPI_Comm_rank(world, &rank);
     device = c10::Device(torch::kCUDA,rank);

From 4995a4023ee71819bb90b7699c1e1849dc7128df Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 10 Jul 2023 15:15:15 +0100
Subject: [PATCH 40/51] update for domain decomposition with new lammps
 wrapper.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index 27e509164b6..c9408c54e45 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -331,11 +331,11 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
   forces = output.at("forces").toTensor();
   auto forces_ptr = static_cast<double*>(forces.data_ptr());
   auto k_forces = Kokkos::View<double*[3],Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(forces_ptr,n_nodes);
-  Kokkos::parallel_for("PairMACEKokkos: Extract k_forces.", nlocal, KOKKOS_LAMBDA(const int ii) {
+  Kokkos::parallel_for("PairMACEKokkos: Extract k_forces.", n_nodes, KOKKOS_LAMBDA(const int ii) {
     const int i = d_ilist(ii);
-    f(i,0) = k_forces(i,0);
-    f(i,1) = k_forces(i,1);
-    f(i,2) = k_forces(i,2);
+    f(i,0) += k_forces(i,0);
+    f(i,1) += k_forces(i,1);
+    f(i,2) += k_forces(i,2);
   });
 
   // mace virials (local atoms only)
@@ -344,12 +344,12 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
     // TODO: is this cpu transfer necessary?
     auto vir = output.at("virials").toTensor().to("cpu");
     // caution: lammps does not use voigt ordering
-    virial[0] = vir[0][0][0].item<double>();
-    virial[1] = vir[0][1][1].item<double>();
-    virial[2] = vir[0][2][2].item<double>();
-    virial[3] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
-    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
-    virial[5] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+    virial[0] += vir[0][0][0].item<double>();
+    virial[1] += vir[0][1][1].item<double>();
+    virial[2] += vir[0][2][2].item<double>();
+    virial[3] += 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
+    virial[4] += 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
+    virial[5] += 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
   }
 
   // TODO: investigate this

From 8b3243853a0790a3edab3142dd3a7f4c20640003 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 10 Jul 2023 15:29:51 +0100
Subject: [PATCH 41/51] same update for non-kokkos style.

---
 src/ML-MACE/pair_mace.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index e44645bdd4a..e5bd0b8bbef 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -239,11 +239,11 @@ void PairMACE::compute(int eflag, int vflag)
   //   -> derivatives of total mace energy
   forces = output.at("forces").toTensor().cpu();
   #pragma omp parallel for
-  for (int ii=0; ii<list->inum; ++ii) {
+  for (int ii=0; ii<n_nodes; ++ii) {
     int i = list->ilist[ii];
-    atom->f[i][0] = forces[i][0].item<double>();
-    atom->f[i][1] = forces[i][1].item<double>();
-    atom->f[i][2] = forces[i][2].item<double>();
+    atom->f[i][0] += forces[i][0].item<double>();
+    atom->f[i][1] += forces[i][1].item<double>();
+    atom->f[i][2] += forces[i][2].item<double>();
   }
 
   // mace site energies
@@ -261,12 +261,12 @@ void PairMACE::compute(int eflag, int vflag)
   //   -> derivatives of sum of site energies of local atoms
   if (vflag_global) {
     auto vir = output.at("virials").toTensor().cpu();
-    virial[0] = vir[0][0][0].item<double>();
-    virial[1] = vir[0][1][1].item<double>();
-    virial[2] = vir[0][2][2].item<double>();
-    virial[3] = 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
-    virial[4] = 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
-    virial[5] = 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+    virial[0] += vir[0][0][0].item<double>();
+    virial[1] += vir[0][1][1].item<double>();
+    virial[2] += vir[0][2][2].item<double>();
+    virial[3] += 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
+    virial[4] += 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
+    virial[5] += 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
   }
 
   // mace site virials

From dd850990aa29f23925909940412c8040fd1a10fd Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 10 Jul 2023 17:26:23 +0100
Subject: [PATCH 42/51] enable multi-node.

---
 src/ML-MACE/pair_mace.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index e5bd0b8bbef..a9290034254 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -305,9 +305,13 @@ void PairMACE::coeff(int narg, char **arg)
     device = c10::Device(torch::kCPU);
   } else {
     std::cout << "CUDA found, setting device type to torch::kCUDA." << std::endl;
-    int rank;
-    MPI_Comm_rank(world, &rank);
-    device = c10::Device(torch::kCUDA,rank);
+    //int worldrank;
+    //MPI_Comm_rank(world, &worldrank);
+    MPI_Comm local;
+    MPI_Comm_split_type(world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local);
+    int localrank;
+    MPI_Comm_rank(local, &localrank);
+    device = c10::Device(torch::kCUDA,localrank);
   }
 
   std::cout << "Loading MACE model from \"" << arg[2] << "\" ...";

From 11ef3c412ff1c394eefe81aa8a2ff9f00546f091 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 20 Nov 2023 19:46:34 +0000
Subject: [PATCH 43/51] check pair_style input.

---
 src/ML-MACE/pair_mace.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index a9290034254..d7ccfdbfeff 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -288,6 +288,8 @@ void PairMACE::settings(int narg, char **arg)
     if (strcmp(arg[0], "no_domain_decomposition") == 0) {
       domain_decomposition = false;
       // TODO: add check against MPI rank
+    } else {
+      error->all(FLERR, "Unrecognized argument for pair_style mace.");
     }
   }
 }

From 344780a91dc05ac6514cd71708aae3e76ab4bbba Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 20 Nov 2023 20:25:50 +0000
Subject: [PATCH 44/51] bugfix affecting cases when lammps has fewer atom types
 than mace.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index c9408c54e45..8dc3c25c797 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -374,7 +374,7 @@ void PairMACEKokkos<DeviceType>::coeff(int narg, char **arg)
   Kokkos::deep_copy(k_lammps_atomic_numbers, k_lammps_atomic_numbers_mirror);
   k_mace_atomic_numbers = Kokkos::View<int64_t*,DeviceType>("k_mace_atomic_numbers",mace_atomic_numbers.size());
   auto k_mace_atomic_numbers_mirror = Kokkos::create_mirror_view(k_mace_atomic_numbers);
-  for (int i=0; i<lammps_atomic_numbers.size(); ++i) {
+  for (int i=0; i<mace_atomic_numbers.size(); ++i) {
     k_mace_atomic_numbers_mirror(i) = mace_atomic_numbers[i];
   }
   Kokkos::deep_copy(k_mace_atomic_numbers, k_mace_atomic_numbers_mirror);

From f41a99ef6a4e5c837a6c5c614f14198e54141f7f Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 20 Nov 2023 21:03:09 +0000
Subject: [PATCH 45/51] delete outdated.

---
 src/ML-MACE/pair_mace.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index d7ccfdbfeff..1dd37a324ea 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -307,8 +307,6 @@ void PairMACE::coeff(int narg, char **arg)
     device = c10::Device(torch::kCPU);
   } else {
     std::cout << "CUDA found, setting device type to torch::kCUDA." << std::endl;
-    //int worldrank;
-    //MPI_Comm_rank(world, &worldrank);
     MPI_Comm local;
     MPI_Comm_split_type(world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local);
     int localrank;

From 2de384d48c9047ca2c9f3693c1fcb72397a1c74a Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 20 Nov 2023 23:26:21 +0000
Subject: [PATCH 46/51] print more info about type mapping.

---
 src/ML-MACE/pair_mace.cpp | 28 +++++++++++++++++-----------
 src/ML-MACE/pair_mace.h   |  1 +
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 1dd37a324ea..abfcba5bd76 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -165,16 +165,6 @@ void PairMACE::compute(int eflag, int vflag)
   }
 
   // ----- node_attrs -----
-  // node_attrs is one-hot encoding for atomic numbers
-  auto mace_type = [this](int lammps_type) {
-    for (int i=0; i<mace_atomic_numbers.size(); ++i) {
-      if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
-        return i+1;
-      }
-    }
-    error->all(FLERR, "ERROR: problem converting lammps_type to mace_type.");
-    return -1;
-  };
   int n_node_feats = mace_atomic_numbers.size();
   auto node_attrs = torch::zeros({n_nodes,n_node_feats}, torch_float_dtype);
   #pragma omp parallel for
@@ -343,7 +333,7 @@ void PairMACE::coeff(int narg, char **arg)
   for (int i=0; i<a_n.size(0); ++i) {
     mace_atomic_numbers.push_back(a_n[i].item<int64_t>());
   }
-  std::cout << "  - The model atomic numbers are: " << mace_atomic_numbers << "." << std::endl;
+  std::cout << "  - The MACE model atomic numbers are: " << mace_atomic_numbers << "." << std::endl;
 
   // extract atomic numbers from pair_coeff
   for (int i=3; i<narg; ++i) {
@@ -353,6 +343,11 @@ void PairMACE::coeff(int narg, char **arg)
   }
   std::cout << "  - The pair_coeff atomic numbers are: " << lammps_atomic_numbers << "." << std::endl;
 
+  for (int i=1; i<=lammps_atomic_numbers.size(); ++i) {
+    std::cout << "  - Mapping LAMMPS type " << i << " (" << periodic_table[i-1]
+      << ") to MACE type " << mace_type(i) << "." << std::endl;
+  }
+
   for (int i=1; i<atom->ntypes+1; i++)
     for (int j=i; j<atom->ntypes+1; j++)
       setflag[i][j] = 1;
@@ -396,3 +391,14 @@ void PairMACE::allocate()
 
   memory->create(cutsq, atom->ntypes+1, atom->ntypes+1, "pair:cutsq");
 }
+
+int PairMACE::mace_type(int lammps_type)
+{
+    for (int i=0; i<mace_atomic_numbers.size(); ++i) {
+      if (mace_atomic_numbers[i]==lammps_atomic_numbers[lammps_type-1]) {
+        return i+1;
+      }
+    }
+    error->all(FLERR, "Problem converting lammps_type to mace_type.");
+    return -1;
+ }
diff --git a/src/ML-MACE/pair_mace.h b/src/ML-MACE/pair_mace.h
index 57b60e2bb17..4c8eadf86d6 100644
--- a/src/ML-MACE/pair_mace.h
+++ b/src/ML-MACE/pair_mace.h
@@ -56,6 +56,7 @@ class PairMACE : public Pair {
   int64_t num_interactions;
   std::vector<int64_t> mace_atomic_numbers;
   std::vector<int64_t> lammps_atomic_numbers;
+  int mace_type(int lammps_type);
   const std::array<std::string,118> periodic_table =
     { "H", "He",
      "Li", "Be",                                                              "B",  "C",  "N",  "O",  "F", "Ne",

From 9988368f0e7fe7ed02cc8d3025794af3978b8073 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Sun, 10 Dec 2023 00:57:26 +0000
Subject: [PATCH 47/51] bugfix.

---
 src/ML-MACE/pair_mace.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index abfcba5bd76..146262fafff 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -344,7 +344,8 @@ void PairMACE::coeff(int narg, char **arg)
   std::cout << "  - The pair_coeff atomic numbers are: " << lammps_atomic_numbers << "." << std::endl;
 
   for (int i=1; i<=lammps_atomic_numbers.size(); ++i) {
-    std::cout << "  - Mapping LAMMPS type " << i << " (" << periodic_table[i-1]
+    std::cout << "  - Mapping LAMMPS type " << i
+      << " (" << periodic_table[lammps_atomic_numbers[i-1]-1]
       << ") to MACE type " << mace_type(i) << "." << std::endl;
   }
 

From 184b6fdeccd0c430fd259e7467ca37d616632b63 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Thu, 1 Feb 2024 05:36:26 +0000
Subject: [PATCH 48/51] remove hardcoded c++14. better to set at command line.

---
 cmake/Modules/Packages/ML-MACE.cmake | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cmake/Modules/Packages/ML-MACE.cmake b/cmake/Modules/Packages/ML-MACE.cmake
index a1624a370ef..e2956ae5984 100644
--- a/cmake/Modules/Packages/ML-MACE.cmake
+++ b/cmake/Modules/Packages/ML-MACE.cmake
@@ -4,4 +4,3 @@ find_package(Torch REQUIRED)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 
 target_link_libraries(lammps PRIVATE "${TORCH_LIBRARIES}")
-set_property(TARGET lammps PROPERTY CXX_STANDARD 14)

From 2a22b697874ce8ef4cbb316064ca4a95249d6b5a Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 28 Oct 2024 04:02:04 -0400
Subject: [PATCH 49/51] Add domain header.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index 8dc3c25c797..406e7ff4b45 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -20,6 +20,7 @@
 
 #include "atom_kokkos.h"
 #include "atom_masks.h"
+#include "domain.h"
 #include "error.h"
 #include "force.h"
 #include "kokkos.h"

From f51963aada27c5df4a87a37faec5a4125d0f2e82 Mon Sep 17 00:00:00 2001
From: Chuck Witt <witt.chuck@gmail.com>
Date: Mon, 28 Oct 2024 04:03:44 -0400
Subject: [PATCH 50/51] Change item to template item.

---
 src/KOKKOS/pair_mace_kokkos.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/pair_mace_kokkos.cpp b/src/KOKKOS/pair_mace_kokkos.cpp
index 406e7ff4b45..b984b5fc210 100644
--- a/src/KOKKOS/pair_mace_kokkos.cpp
+++ b/src/KOKKOS/pair_mace_kokkos.cpp
@@ -345,12 +345,14 @@ void PairMACEKokkos<DeviceType>::compute(int eflag, int vflag)
     // TODO: is this cpu transfer necessary?
     auto vir = output.at("virials").toTensor().to("cpu");
     // caution: lammps does not use voigt ordering
-    virial[0] += vir[0][0][0].item<double>();
-    virial[1] += vir[0][1][1].item<double>();
-    virial[2] += vir[0][2][2].item<double>();
-    virial[3] += 0.5*(vir[0][1][0].item<double>() + vir[0][0][1].item<double>());
-    virial[4] += 0.5*(vir[0][2][0].item<double>() + vir[0][0][2].item<double>());
-    virial[5] += 0.5*(vir[0][2][1].item<double>() + vir[0][1][2].item<double>());
+    // also: it would be nice to get rid of the 'template item' stuff,
+    //       but some compilers seem to require it
+    virial[0] += vir[0][0][0].template item<double>();
+    virial[1] += vir[0][1][1].template item<double>();
+    virial[2] += vir[0][2][2].template item<double>();
+    virial[3] += 0.5*(vir[0][1][0].template item<double>() + vir[0][0][1].template item<double>());
+    virial[4] += 0.5*(vir[0][2][0].template item<double>() + vir[0][0][2].template item<double>());
+    virial[5] += 0.5*(vir[0][2][1].template item<double>() + vir[0][1][2].template item<double>());
   }
 
   // TODO: investigate this

From e55b3212da455af14f02772efe29b9e787e16eec Mon Sep 17 00:00:00 2001
From: Logan Ward <ward.logan.t@gmail.com>
Date: Thu, 24 Apr 2025 15:39:03 +0000
Subject: [PATCH 51/51] Rely on GPU pinning externally

Avoids the need to compile with MPI support
---
 src/ML-MACE/pair_mace.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/ML-MACE/pair_mace.cpp b/src/ML-MACE/pair_mace.cpp
index 146262fafff..9b7547653fc 100644
--- a/src/ML-MACE/pair_mace.cpp
+++ b/src/ML-MACE/pair_mace.cpp
@@ -297,11 +297,8 @@ void PairMACE::coeff(int narg, char **arg)
     device = c10::Device(torch::kCPU);
   } else {
     std::cout << "CUDA found, setting device type to torch::kCUDA." << std::endl;
-    MPI_Comm local;
-    MPI_Comm_split_type(world, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local);
-    int localrank;
-    MPI_Comm_rank(local, &localrank);
-    device = c10::Device(torch::kCUDA,localrank);
+    int localrank = 0;  // Assume GPU pinning occurs outside of LAMMPS
+    device = c10::Device(torch::kCUDA, localrank);
   }
 
   std::cout << "Loading MACE model from \"" << arg[2] << "\" ...";