SC-SGS · constracktor · Apr 15, 2025 · Apr 15, 2025 · Jul 11, 2025 · Jul 14, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -45,7 +45,11 @@ jobs:
 
     - name: Choose environment
       run: |
-        cp spack-repo/environments/spack_cpu_gcc.yaml spack.yaml
+        # gprat_cpu_gcc_dist (rather than gprat_cpu_gcc) builds HPX with networking=tcp instead
+        # of networking=none, which --hpx:localities (used by the distributed multi-locality
+        # tests, see test/CMakeLists.txt) requires. It has no MKL variant, so the rest of this
+        # job uses OpenBLAS (-DGPRAT_ENABLE_MKL=OFF / -DUSE_MKL=OFF below) instead.
+        cp spack-repo/environments/spack_cpu_gcc_dist.yaml spack.yaml
         cat spack-repo/environments/ci_env_settings.yaml.tpl >> spack.yaml
 
     - name: Concretize
@@ -61,7 +65,7 @@ jobs:
       shell: spack-bash {0}
       run: |
         spack env activate .
-        cmake "--preset=ci-${{ matrix.os }}"
+        cmake "--preset=ci-${{ matrix.os }}" -DGPRAT_ENABLE_MKL=OFF -DGPRAT_WITH_DISTRIBUTED=ON -DGPRAT_TEST_MULTI_LOCALITY=ON
 
     - name: Build
       run: cmake --build build --config Release
@@ -87,7 +91,7 @@ jobs:
       shell: spack-bash {0}
       run: |
         spack env activate .
-        cmake -G "Unix Makefiles" -S examples/gprat_cpp -B build_examples -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=$PWD/prefix -DUSE_MKL=ON
+        cmake -G "Unix Makefiles" -S examples/gprat_cpp -B build_examples -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=$PWD/prefix -DUSE_MKL=OFF
 
     - name: Build example project
       run: cmake --build build_examples --config Release

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -3,7 +3,6 @@ name: Code linting
 on:
   push:
     branches:
-      - main
   pull_request:
 
 jobs:

diff --git a/.gitignore b/.gitignore
@@ -204,5 +204,8 @@ compile_commands.json
 # Build files
 build*
 
+# CTest output
+Testing/
+
 # Ignore folder
 ignore
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -29,6 +29,8 @@ cmake_dependent_option(GPRAT_ENABLE_TESTS "Build unit and integration tests"
 
 cmake_dependent_option(GPRAT_ENABLE_MKL "Enable support for Intel oneMKL"
                        ${PROJECT_IS_TOP_LEVEL} "GPRAT_BUILD_CORE" OFF)
+option(GPRAT_ENABLE_BENCHMARK_CACHE_EVICTIONS
+       "Evict data from caches before running BLAS operations" ON)
 
 option(GPRAT_ENABLE_FORMAT_TARGETS "Enable clang-format / cmake-format targets"
        ${PROJECT_IS_TOP_LEVEL})
@@ -86,6 +88,7 @@ if(GPRAT_BUILD_CORE)
 
   # HPX
   find_package(HPX REQUIRED)
+  include(cmake/hpx-apex-compat.cmake)
 
   # Add core subdirectiory
   add_subdirectory(core)
@@ -122,6 +125,11 @@ if(NOT CMAKE_SKIP_INSTALL_RULES AND GPRAT_BUILD_CORE)
     RENAME "${package}Config.cmake"
     COMPONENT Development)
 
+  install(
+    FILES cmake/hpx-apex-compat.cmake
+    DESTINATION "${GPRat_INSTALL_CMAKEDIR}"
+    COMPONENT Development)
+
   install(
     FILES "${PROJECT_BINARY_DIR}/${package}ConfigVersion.cmake"
     DESTINATION "${GPRat_INSTALL_CMAKEDIR}"
@@ -142,6 +150,9 @@ endif()
 # ##############################################################################
 if(GPRAT_ENABLE_EXAMPLES)
   add_subdirectory(examples/gprat_cpp)
+  if(GPRAT_WITH_DISTRIBUTED)
+    add_subdirectory(examples/gprat_distributed)
+  endif()
 endif()
 
 # Tests

diff --git a/README.md b/README.md
@@ -6,6 +6,10 @@ Leveraging the asynchronous many-task runtime HPX, we aim to combine the perform
 with the ease of use of commonly available Python libraries.
 Thus, GPRat can be conveniently integrated into Python projects without binding overheads or used directly with pure C++
 code.
+Computations run on CPUs as well as NVIDIA GPUs (CUDA) and Intel/AMD GPUs (SYCL), in single (fp32) and double (fp64)
+precision.
+GPRat further provides a NUMA-aware allocator for tile data, performance counters, and optional distributed execution
+via HPX actions.
 
 ## Dependencies
 
@@ -21,6 +25,9 @@ A script to install and setup spack for `GPRat` is provided in [`spack-repo`](sp
 Spack environment configurations and setup scripts for CPU and GPU use are provided in
 [`spack-repo/environments`](spack-repo/environments).
 
+Since Spack is not available on Windows, we also support dependency installation using vcpkg.
+For now, vcpkg builds are only tested on Windows.
+
 ## How To Compile
 
 GPRat makes use of [CMake presets][1] to simplify the process of configuring the project.
@@ -35,8 +42,9 @@ ctest --preset=dev-linux
 
 As a developer, you may create a `CMakeUserPresets.json` file at the root of the project that contains additional
 presets local to your machine.
-In addition to the build configuration `dev-linux`, there are `release-linux`, `dev-linux-gpu`, `release-linux-gpu`, `dev-linux-sycl`, and `release-linux-sycl`.
-The configurations suffixed with `-gpu` build the library with CUDA for NVIDIA GPUs, and those suffixed with `-sycl` build it with SYCL support for Intel and AMD GPUs.
+In addition to the build configuration `dev-linux`, there are `release-linux`, `dev-linux-cuda`, `release-linux-cuda`, `dev-linux-sycl`, and `release-linux-sycl`.
+For Windows, we have similar presets called `dev-windows` and `release-windows`.
+The configurations suffixed with `-cuda` build the library with CUDA for NVIDIA GPUs, and those suffixed with `-sycl` build it with SYCL support for Intel and AMD GPUs.
 
 GPRat can be build with or without Python bindings.
 The following options can be set to include / exclude parts of the project:
@@ -45,14 +53,31 @@ The following options can be set to include / exclude parts of the project:
 |--------------------------------|--------------------------------------------------------------------------------------|-----------------|
 | GPRAT_BUILD_CORE               | Enable/Disable building of the core library                                          | ON              |
 | GPRAT_BUILD_BINDINGS           | Enable/Disable building of the Python bindings                                       | ON              |
-| GPRAT_ENABLE_FORMAT_TARGETS    | Enable/Disable code formatting helper targets                                        | ON if top-level |
 | GPRAT_ENABLE_EXAMPLES          | Enable/Disable example projects                                                      | ON if top-level |
-| GPRAT_USE_MKL                  | Enable/Disable usage of MKL library                                                  | OFF             |
+| GPRAT_ENABLE_TESTS             | Enable/Disable building of unit and integration tests                                | ON if top-level |
+| GPRAT_ENABLE_FORMAT_TARGETS    | Enable/Disable code formatting helper targets                                        | ON if top-level |
+| GPRAT_ENABLE_MKL               | Enable/Disable support for Intel oneMKL                                              | ON if top-level |
 | GPRAT_WITH_CUDA                | Enable/disable compilation with CUDA support (NVIDIA GPUs)                           | OFF             |
 | GPRAT_WITH_SYCL                | Enable/disable compilation with SYCL support (Intel and AMD GPUs via oneMath)        | OFF             |
+| GPRAT_WITH_DISTRIBUTED         | Enable/disable distributed GP support via HPX actions                                | OFF             |
 | GPRAT_APEX_STEPS               | Enable/disable compilation for steps duration measurement with APEX                  | OFF             |
 | GPRAT_APEX_CHOLESKY            | Enable/disable compilation for measuring cholesky assembly and computation with APEX | OFF             |
 
+A convenience script `compile_gprat.sh` is provided to configure, build, and install GPRat with a single command.
+It takes five parameters:
+
+```sh
+./compile_gprat.sh [python/cpp] [cpu/cuda/sycl] [release/dev] [mkl/none] [steps/cholesky/none]
+```
+
+- `$1`: build the Python bindings (`python`) or the C++ library (`cpp`)
+- `$2`: backend, CPU (`cpu`), CUDA for NVIDIA GPUs (`cuda`), or SYCL for Intel and AMD GPUs (`sycl`)
+- `$3`: build in `release` or `dev` mode
+- `$4`: enable Intel oneMKL (`mkl`) or use OpenBLAS (`none`)
+- `$5`: APEX profiling, measure step durations (`steps`), cholesky assembly and computation (`cholesky`), or disable profiling (`none`)
+
+Computations are supported in both single (fp32) and double (fp64) precision.
+
 Respective scripts can be found in this directory.
 
 We also provide a spack package for GPRat in [`spack-repo/packages`](spack-repo/packages) for portable and convenient compilation. When the repository is added to spack, GPRat can be installed with `spack install gprat~cuda~bindings~examples blas={mkl,openblas}`
@@ -68,23 +93,61 @@ implementations based on TensorFlow ([GPflow](https://github.com/GPflow/GPflow))
 - Go to [`examples/gprat_cpp`](examples/gprat_cpp/)
 - Set parameters in [`execute.cpp`](examples/gprat_cpp/src/execute.cpp)
 - The example is built as part of the main project.
-  - Go to `build/` and execute `./gprat_cpp [--use_gpu]` to run the example.
+  - Go to `build/` and execute `./gprat_cpp [--use-gpu]` to run the example.
   - If you want to use an installed GPRat version:
-    Run `./run_gprat_cpp.sh [cpu/gpu] [x86/arm/riscv]` to build and run the example.
+    Run `./run_gprat_cpp.sh [cpu/cuda/sycl] [nvidia/amd/intel]` to build and run the example.
+    The second parameter selects the SYCL device and is only required when GPRat was compiled with the SYCL backend.
 
 ### To run GPRat with Python
 
 - Go to [`examples/gprat_python`](examples/gprat_python/)
 - Set parameters in [`config.json`](examples/gprat_python/config.json)
-- Run `./run_gprat_python.sh [cpu/gpu]` to run the example
+- Run `./run_gprat_python.sh [cpu/cuda/sycl] [nvidia/amd/intel]` to run the example.
+  The second parameter selects the SYCL device and is only required when GPRat was compiled with the SYCL backend.
+
+### To run the distributed GPRat benchmark
+
+- Configure the main project with `-DGPRAT_WITH_DISTRIBUTED=ON` to build [`examples/gprat_distributed`](examples/gprat_distributed/).
+- The example is a CLI-driven scaling benchmark (no `config.json`): instead of running one fixed configuration,
+  it sweeps over a range of training-set sizes.
+- Go to `build/` and execute `./gprat_distributed [options]`, or run `./run_gprat_distributed.sh [options]` to
+  build and run it. Useful options:
+  - `--start`/`--end`/`--step`: training-set sizes to sweep over (e.g. `--start 128 --end 4096 --step 2`)
+  - `--tiles`, `--regressors`, `--n_test`, `--opt_iter`, `--loop`: problem size and repetition count
+  - `--enabled`: bitmask to select which of cholesky/optimize/predict/predict_with_uncertainty/predict_with_full_cov to run
+  - `--train_x_path`/`--train_y_path`/`--test_path`: point at a larger dataset (e.g. one generated via
+    [`data/generators`](data/generators/)) for a real scaling study; the defaults point at the small `data/data_1024`
+    correctness fixture
+  - `--output_csv`: where per-run timings are appended (defaults to `examples/gprat_distributed/output.csv`,
+    matching the other examples)
+- By default (`GPRAT_DIST_MULTI_LOCALITY=1`), `run_gprat_distributed.sh` runs across multiple localities on
+  one node. It builds a networking-enabled binary and, for each locality count `N` in `GPRAT_DIST_LOCALITIES`
+  (default `"1 2 4"`), launches `N` processes with `--hpx:localities=N` and `--hpx:node=0` through
+  `--hpx:node=N-1`, waiting for each round to finish before starting the next. Node `0` is the console
+  process: all arguments you pass are forwarded to it, while the other processes just join. Set
+  `GPRAT_DIST_MULTI_LOCALITY=0` before running the script to fall back to a single-locality run against the
+  default `gprat_cpu_gcc` build.
+  **Important:** HPX's TCP parcelport zero-copy path (`hpx.parcel.zero_copy_serialization_threshold`,
+  8192 bytes by default) reliably hangs once tile sizes exceed it in a multi-locality run, so the script
+  always raises it (`--hpx:ini=hpx.parcel.zero_copy_serialization_threshold=999999999`) for these runs.
+  Running across multiple actual nodes additionally requires cluster-specific network configuration
+  (AGAS bootstrap addresses, hostfile/job-scheduler integration) not set up here.
+  - The default Spack environment (`gprat_cpu_gcc`) builds HPX with `networking=none`, which rejects
+    `--hpx:localities` outright. `GPRAT_DIST_MULTI_LOCALITY=1` instead uses the `gprat_cpu_gcc_dist`
+    Spack environment (`networking=tcp`, OpenBLAS-only — see
+    `spack-repo/environments/setup_gprat_cpu_gcc_dist.sh`) and builds into a separate
+    `build/release-linux-dist` directory to avoid mixing the two toolchains.
+  - Enable `-DGPRAT_TEST_MULTI_LOCALITY=ON` (in addition to `-DGPRAT_WITH_DISTRIBUTED=ON`) to register
+    CTest smoke tests (`GPRat_test_distributed_multi_locality_{1,2,4}`) that launch `gprat_distributed`
+    across 1/2/4 localities; off by default since it needs the same networking-enabled HPX build.
 
 ### To run GPflow reference
 
 - Go to [`examples/gpflow_reference`](examples/gpflow_reference/)
 - Set parameters in [`config.json`](examples/gpflow_reference/config.json)
 - Run `./run_gpflow.sh [cpu/gpu/arm]` to run example
 
-### To run GPflow reference
+### To run GPyTorch reference
 
 - Go to [`examples/gpytorch_reference`](examples/gpytorch_reference/)
 - Set parameters in [`config.json`](examples/gpytorch_reference/config.json)
@@ -107,9 +170,12 @@ We specifically thank the follow contributors:
 - [Henrik Möllmann](https://www.linkedin.com/in/moellh/):
   [CUDA backend via cuBLAS/cuSOLVER](tbd.).
 
-- Marcel Graf:
+- [Marcel Graf](https://github.com/MarcelGraf0710):
   [SYCL backend via oneMath](tbd.).
 
+- [Tim Niederhausen](https://github.com/timniederhausen):
+  [Distributed GP via HPX actions](tbd.).
+
 ## How To Cite
 
 ```

diff --git a/Testing/Temporary/CTestCostData.txt b/Testing/Temporary/CTestCostData.txt
diff --git a/bindings/CMakeLists.txt b/bindings/CMakeLists.txt
@@ -1,5 +1,5 @@
 # try finding pybind11
-set(GPRat_pybind11_VERSION 2.10.3)
+set(GPRat_pybind11_VERSION 2.13.6)
 find_package(pybind11 ${GPRat_pybind11_VERSION} QUIET)
 if(pybind11_FOUND)
   message(STATUS "Found package pybind11.")

diff --git a/bindings/gprat_py.cpp b/bindings/gprat_py.cpp
@@ -1,4 +1,5 @@
-#include "gprat_c.hpp"
+#include "gprat/gprat.hpp"
+
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
@@ -31,19 +32,19 @@ void init_gprat(py::module &m)
     // Set hyperparameters to default values in `AdamParams` class, unless
     // specified. Python object has full access to each hyperparameter and a
     // string representation `__repr__`.
-    py::class_<gprat_hyper::AdamParams>(m, "AdamParams")
+    py::class_<gprat::AdamParams>(m, "AdamParams")
         .def(py::init<double, double, double, double, int>(),
              py::arg("learning_rate") = 0.001,
              py::arg("beta1") = 0.9,
              py::arg("beta2") = 0.999,
              py::arg("epsilon") = 1e-8,
              py::arg("opt_iter") = 0)
-        .def_readwrite("learning_rate", &gprat_hyper::AdamParams::learning_rate)
-        .def_readwrite("beta1", &gprat_hyper::AdamParams::beta1)
-        .def_readwrite("beta2", &gprat_hyper::AdamParams::beta2)
-        .def_readwrite("epsilon", &gprat_hyper::AdamParams::epsilon)
-        .def_readwrite("opt_iter", &gprat_hyper::AdamParams::opt_iter)
-        .def("__repr__", &gprat_hyper::AdamParams::repr);
+        .def_readwrite("learning_rate", &gprat::AdamParams::learning_rate)
+        .def_readwrite("beta1", &gprat::AdamParams::beta1)
+        .def_readwrite("beta2", &gprat::AdamParams::beta2)
+        .def_readwrite("epsilon", &gprat::AdamParams::epsilon)
+        .def_readwrite("opt_iter", &gprat::AdamParams::opt_iter)
+        .def("__repr__", &gprat::AdamParams::repr);
 
     // Initializes Gaussian Process with `GP` class. Sets default parameters for
     // squared exponential kernel, number of regressors and trainable, unless
@@ -136,6 +137,8 @@ n_units to a value enables computations on the GPU.
              py::arg("test_data"),
              py::arg("m_tiles"),
              py::arg("m_tile_size"))
+        .def(
+            "cholesky", &gprat::GP::cholesky, "Compute and return the Cholesky decomposition of the covariance matrix.")
         .def("optimize", &gprat::GP::optimize, py::arg("AdamParams"))
         .def("optimize_step", &gprat::GP::optimize_step, py::arg("AdamParams"), py::arg("iter"))
         .def("compute_loss", &gprat::GP::calculate_loss);

diff --git a/bindings/utils_py.cpp b/bindings/utils_py.cpp
@@ -1,5 +1,6 @@
-#include "target.hpp"
-#include "utils_c.hpp"
+#include "gprat/target.hpp"
+#include "gprat/utils.hpp"
+
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
@@ -32,7 +33,7 @@ void start_hpx_wrapper(std::vector<std::string> args, std::size_t n_cores)
     }
     argv.push_back(nullptr);
     int argc = static_cast<int>(args.size());
-    utils::start_hpx_runtime(argc, argv.data());
+    gprat::start_hpx_runtime(argc, argv.data());
 }
 
 /**
@@ -43,7 +44,7 @@ void start_hpx_wrapper(std::vector<std::string> args, std::size_t n_cores)
 void init_utils(py::module &m)
 {
     m.def("compute_train_tiles",
-          &utils::compute_train_tiles,
+          &gprat::compute_train_tiles,
           py::arg("n_samples"),
           py::arg("n_tile_size"),
           R"pbdoc(
@@ -58,7 +59,7 @@ void init_utils(py::module &m)
           )pbdoc");
 
     m.def("compute_train_tile_size",
-          &utils::compute_train_tile_size,
+          &gprat::compute_train_tile_size,
           py::arg("n_samples"),
           py::arg("n_tiles"),
           R"pbdoc(
@@ -73,7 +74,7 @@ void init_utils(py::module &m)
           )pbdoc");
 
     m.def("compute_test_tiles",
-          &utils::compute_test_tiles,
+          &gprat::compute_test_tiles,
           py::arg("m_samples"),
           py::arg("n_tiles"),
           py::arg("n_tile_size"),
@@ -90,20 +91,20 @@ void init_utils(py::module &m)
           )pbdoc");
 
     m.def("print_vector",
-          &utils::print_vector,
+          &gprat::print_vector,
           py::arg("vec"),
           py::arg("start") = 0,
           py::arg("end") = -1,
           py::arg("separator") = " ",
           "Print elements of a vector with optional start, end, and separator parameters");
 
     m.def("start_hpx", &start_hpx_wrapper, py::arg("args"), py::arg("n_cores"));  // Using the wrapper function
-    m.def("resume_hpx", &utils::resume_hpx_runtime);
-    m.def("suspend_hpx", &utils::suspend_hpx_runtime);
-    m.def("stop_hpx", &utils::stop_hpx_runtime);
+    m.def("resume_hpx", &gprat::resume_hpx_runtime);
+    m.def("suspend_hpx", &gprat::suspend_hpx_runtime);
+    m.def("stop_hpx", &gprat::stop_hpx_runtime);
 
-    m.def("compiled_with_cuda", &utils::compiled_with_cuda, "Check if the code was compiled with CUDA support");
-    m.def("compiled_with_sycl", &utils::compiled_with_sycl, "Check if the code was compiled with SYCL support");
+    m.def("compiled_with_cuda", &gprat::compiled_with_cuda, "Check if the code was compiled with CUDA support");
+    m.def("compiled_with_sycl", &gprat::compiled_with_sycl, "Check if the code was compiled with SYCL support");
 
     m.def("print_available_gpus", &gprat::print_available_gpus, "Print available GPUs with their properties");
     m.def("gpu_count", &gprat::gpu_count, "Return the number of available GPUs");

diff --git a/cmake/hpx-apex-compat.cmake b/cmake/hpx-apex-compat.cmake
@@ -0,0 +1,32 @@
+# HPX built with +static and instrumentation=apex embeds APEX's private
+# zlib/rapidjson/otf2 dependencies into HPXTargets.cmake by bare name instead of
+# as proper (exported) targets. Since no target with those names exists in a
+# consuming project, CMake falls back to raw "-l<name>" linker flags, which
+# fail: "-lzlib" has no matching library file (real zlib produces libz, not
+# libzlib) and "-lrapidjson" is header-only and never produces a library file at
+# all. Defining targets with these exact names satisfies
+# target_link_libraries()'s lookup before it degrades to a linker flag. The zlib
+# and otf2 shims are only created when the real library is found; the rapidjson
+# shim is always created as an empty INTERFACE target (nothing to link).
+if(NOT TARGET zlib)
+  find_package(ZLIB QUIET)
+  if(ZLIB_FOUND)
+    add_library(zlib INTERFACE IMPORTED)
+    target_link_libraries(zlib INTERFACE ZLIB::ZLIB)
+  endif()
+endif()
+
+if(NOT TARGET rapidjson)
+  add_library(rapidjson INTERFACE IMPORTED)
+endif()
+
+if(NOT TARGET otf2)
+  find_library(
+    GPRat_OTF2_LIBRARY
+    NAMES otf2
+    HINTS "${Otf2_ROOT}/lib")
+  if(GPRat_OTF2_LIBRARY)
+    add_library(otf2 INTERFACE IMPORTED)
+    target_link_libraries(otf2 INTERFACE "${GPRat_OTF2_LIBRARY}")
+  endif()
+endif()
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,7 +3,6 @@ name: Code linting @@
     on:
       push:
         branches:
-          - main
       pull_request:
     jobs:
@@ Expand Down @@