Changes from all commits (148 commits)
a3c9953
deepspeed moe integration
sfc-gh-sbekman Aug 29, 2025
49df058
add gpt-oss copy
sfc-gh-sbekman Sep 8, 2025
f59b929
Add the moe module layer + ParallelGroup
sfc-gh-reyazda Sep 10, 2025
a7ae5a2
Merge branch 'main' into stas/ds-moe
sfc-gh-sbekman Sep 10, 2025
aec47ec
fix format
sfc-gh-reyazda Sep 10, 2025
ffca4e1
Merge branch 'stas/ds-moe' of https://github.com/snowflakedb/ArcticTr…
sfc-gh-reyazda Sep 10, 2025
7e207a1
add cpu_adam_moe optimizer
sfc-gh-sbekman Sep 11, 2025
2476d73
moe utils
sfc-gh-sbekman Sep 11, 2025
1327c12
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Sep 16, 2025
8da7b5b
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Sep 16, 2025
e4b2657
add moe_gemm
sfc-gh-reyazda Sep 17, 2025
e0b8cc1
Merge branch 'stas/ds-moe' of https://github.com/snowflakedb/ArcticTr…
sfc-gh-reyazda Sep 17, 2025
4eb0825
fix typo
sfc-gh-reyazda Sep 18, 2025
27b1602
fix imports
sfc-gh-reyazda Sep 18, 2025
b5ad32b
fix imports
sfc-gh-reyazda Sep 18, 2025
03309c1
update
sfc-gh-sbekman Sep 19, 2025
a7aff40
hijack mlp/moe + optimizer groups setup
sfc-gh-sbekman Sep 23, 2025
25b881b
top_k
sfc-gh-sbekman Sep 23, 2025
1fd7e10
fix
sfc-gh-sbekman Sep 24, 2025
1de0ec6
add ep group creation + refactoring
sfc-gh-sbekman Sep 25, 2025
0c41479
gate+up->gate_up
sfc-gh-sbekman Sep 26, 2025
92361dd
fix indexing
sfc-gh-reyazda Sep 26, 2025
abb0d9d
Merge branch 'stas/ds-moe' of https://github.com/snowflakedb/ArcticTr…
sfc-gh-reyazda Sep 26, 2025
39fe701
rename some variables to better show their purpose
sfc-gh-reyazda Sep 26, 2025
513a2e4
add the local_expert_size and fix the counts for group-gemm
sfc-gh-reyazda Sep 26, 2025
d5cdd07
use only local experts
sfc-gh-sbekman Sep 26, 2025
c0e2d0e
fix the params and computation for the gated mlp
sfc-gh-reyazda Oct 1, 2025
06ec55d
use the new names
sfc-gh-reyazda Oct 1, 2025
4340857
change the gate and up weight order before concatenation
sfc-gh-reyazda Oct 1, 2025
1915c95
fix
sfc-gh-reyazda Oct 1, 2025
d73af51
fix gpt-oss
sfc-gh-reyazda Oct 1, 2025
0399592
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Oct 2, 2025
d587bf4
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Oct 2, 2025
da5ad46
switch to ds groups + refactor
sfc-gh-sbekman Oct 3, 2025
5629d8f
guard the triton import for the group-gemm
sfc-gh-reyazda Oct 6, 2025
2355da9
Merge branch 'stas/ds-moe' of https://github.com/snowflakedb/ArcticTr…
sfc-gh-reyazda Oct 6, 2025
4446fb1
Merge branch 'main' into stas/ds-moe
sfc-gh-sbekman Oct 6, 2025
0c409eb
format
sfc-gh-sbekman Oct 6, 2025
c1a3219
resync qwen3 and gpt-oss modeling code with the latest transformers
sfc-gh-sbekman Oct 6, 2025
faaeb98
wip
sfc-gh-sbekman Oct 7, 2025
6eced50
rework remapping
sfc-gh-sbekman Oct 8, 2025
2fcfd33
fix accuracy bugs
sfc-gh-reyazda Oct 9, 2025
1118589
Update debug.py
sfc-gh-sbekman Oct 9, 2025
ea8a538
add underflow_overflow debug module + cleanup/restore the original test
sfc-gh-sbekman Oct 9, 2025
def0fa4
update
sfc-gh-sbekman Oct 9, 2025
2382f9e
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Oct 9, 2025
49b10d5
fix missing router gate copy for gpt-oss
sfc-gh-sbekman Oct 10, 2025
01ed80c
cleanup
sfc-gh-sbekman Oct 10, 2025
0328f75
make router_gate a parameter
sfc-gh-sbekman Oct 10, 2025
8345772
add the normalization plus token permutation for the local-experts
sfc-gh-reyazda Oct 10, 2025
958348b
Merge branch 'stas/ds-moe' of https://github.com/snowflakedb/ArcticTr…
sfc-gh-reyazda Oct 10, 2025
0b29428
add router aux loss coefficient
sfc-gh-reyazda Oct 10, 2025
6267388
wandb improvement
sfc-gh-sbekman Oct 10, 2025
a585d16
use the same zero stage
sfc-gh-sbekman Oct 10, 2025
32682ae
improve Makefile
sfc-gh-sbekman Oct 10, 2025
6afcf49
debug-off
sfc-gh-sbekman Oct 10, 2025
389837d
fix gpt-oss use-case
sfc-gh-sbekman Oct 10, 2025
46192bf
cleaner debug
sfc-gh-sbekman Oct 10, 2025
f9a3365
link back modeling copies
sfc-gh-sbekman Oct 10, 2025
d345ce4
add small gpt-oss models
sfc-gh-sbekman Oct 14, 2025
24b0004
sync local gpt-oss copy
sfc-gh-sbekman Oct 14, 2025
bd8c8ef
cleanup
sfc-gh-sbekman Oct 14, 2025
75c920c
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Oct 14, 2025
0f4df65
sort config
sfc-gh-sbekman Oct 14, 2025
063da8b
sort config
sfc-gh-sbekman Oct 14, 2025
9a6360c
use .shape consistently
sfc-gh-sbekman Oct 14, 2025
d522c86
consistent torch.empty dim args
sfc-gh-sbekman Oct 14, 2025
724e3df
fix gpt-oss activation function
sfc-gh-sbekman Oct 14, 2025
b0afc66
exclude router_gate, use cpu_adam_moe
sfc-gh-sbekman Oct 14, 2025
f1d4eea
flatten moe optim group into 1
sfc-gh-sbekman Oct 14, 2025
38fbd58
renames
sfc-gh-sbekman Oct 15, 2025
95ef4b1
project's readme file
sfc-gh-sbekman Oct 15, 2025
06864b4
add the alltoallv function to explicitly define backward
sfc-gh-reyazda Oct 16, 2025
da23d81
Merge branch 'stas/ds-moe' of https://github.com/snowflakedb/ArcticTr…
sfc-gh-reyazda Oct 16, 2025
013618d
fix var name
sfc-gh-sbekman Oct 16, 2025
3e08b60
complete ds integration with EP>1
sfc-gh-sbekman Oct 16, 2025
8a18bc4
extend the test to test 3 iterations w/ bf16 tolerance
sfc-gh-sbekman Oct 16, 2025
166eaa7
add shared-expert; fix modeling qwen
sfc-gh-reyazda Oct 29, 2025
ce2734d
remove the transpose when ep=1
sfc-gh-reyazda Nov 4, 2025
ff1356b
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Nov 4, 2025
c4ab1fd
Merge branch 'stas/ds-moe' of https://github.com/snowflakedb/ArcticTr…
sfc-gh-sbekman Nov 4, 2025
790a675
add repr
sfc-gh-sbekman Nov 4, 2025
89b3867
qwen3-next weight import + test
sfc-gh-sbekman Nov 4, 2025
b038cdb
revert
sfc-gh-sbekman Nov 4, 2025
6978265
move the shared expert-computation after moe-combine
sfc-gh-reyazda Nov 4, 2025
1cb2254
fix repr
sfc-gh-sbekman Nov 5, 2025
b6ce5c7
some models have a different intermediate size for experts than norma…
sfc-gh-sbekman Nov 5, 2025
d6f4677
fix
sfc-gh-sbekman Nov 5, 2025
a5f597e
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Nov 8, 2025
944ca1b
missing git
sfc-gh-sbekman Nov 8, 2025
cc9b967
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Nov 10, 2025
450dcc8
fix
sfc-gh-sbekman Nov 10, 2025
370d1f6
fix
sfc-gh-sbekman Nov 10, 2025
339875d
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Nov 12, 2025
511638d
lots of updates
sfc-gh-sbekman Nov 14, 2025
b4e8e2f
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Nov 14, 2025
f3f23aa
fix transformers to a working version
sfc-gh-sbekman Nov 14, 2025
6080fc0
cleanup
sfc-gh-sbekman Nov 14, 2025
5d292c1
simplify making dense variants; start qwen3-next flops
sfc-gh-sbekman Nov 17, 2025
b9cd573
add support for linear gated delta net attention
sfc-gh-sbekman Nov 17, 2025
956e4cb
sync
sfc-gh-sbekman Nov 18, 2025
ebc7caa
fix
sfc-gh-sbekman Nov 18, 2025
80df63d
faster start with partial model
sfc-gh-sbekman Nov 19, 2025
f490f2d
fix flop counter
sfc-gh-sbekman Nov 19, 2025
9111699
add per component time and token profiler
sfc-gh-sbekman Nov 19, 2025
1676ca2
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Nov 19, 2025
2acb5db
fix
sfc-gh-sbekman Nov 19, 2025
e9e37f6
cleanup
sfc-gh-sbekman Nov 19, 2025
2e0301d
cleanup
sfc-gh-sbekman Nov 19, 2025
e896001
cleanup
sfc-gh-sbekman Nov 19, 2025
4b705d5
fix imports
sfc-gh-sbekman Nov 19, 2025
728165a
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Nov 20, 2025
87da435
adapt to changes
sfc-gh-sbekman Nov 20, 2025
487b435
add missing marker
sfc-gh-sbekman Nov 25, 2025
45bffd8
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Nov 25, 2025
65e5d3b
remap params back to original during ckpt save + test
sfc-gh-sbekman Nov 27, 2025
0aa1b12
remap params back to original during ckpt save + test
sfc-gh-sbekman Nov 27, 2025
0b16f94
add qwen3 next weights export support + test
sfc-gh-sbekman Dec 9, 2025
840c52a
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Dec 10, 2025
1cde944
test tolerance
sfc-gh-sbekman Dec 10, 2025
9eede36
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Dec 10, 2025
e3a2be9
checkpoint save+resume+export
sfc-gh-sbekman Dec 13, 2025
4a1712b
save-resume works
sfc-gh-sbekman Dec 16, 2025
3b505a6
continuous wandb runs
sfc-gh-sbekman Dec 16, 2025
094d1f4
fix
sfc-gh-sbekman Dec 16, 2025
586a2d9
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Dec 19, 2025
f6fd48a
Add the custom ops to optimize moe training performance
sfc-gh-reyazda Mar 27, 2026
242d3a8
Add the custom ops to optimize moe training performance
sfc-gh-reyazda Mar 27, 2026
531eb8d
Add the routing replay logic
sfc-gh-reyazda Mar 27, 2026
0cedc24
Merge remote-tracking branch 'origin/main' into stas/ds-moe
sfc-gh-sbekman Mar 30, 2026
8315805
cleanup
sfc-gh-sbekman Mar 31, 2026
5729a1c
cleanup
sfc-gh-sbekman Mar 31, 2026
bb4b942
fix docs breaking
sfc-gh-mwyatt Apr 1, 2026
5f8dd17
Add moe-training kernels (#367)
sfc-gh-reyazda Apr 3, 2026
cf3430d
Merge branch 'stas/ds-moe' into moe-custom-ops
sfc-gh-reyazda Apr 3, 2026
3e30bf5
Merge branch 'main' into moe-custom-ops
sfc-gh-reyazda Apr 3, 2026
c6fef9f
Add the custom Comm for creating new custom communication collectives
sfc-gh-reyazda Apr 3, 2026
8d2103b
fixes
sfc-gh-reyazda Apr 7, 2026
ee20cec
fix compile issue
sfc-gh-reyazda Apr 8, 2026
bb7d20f
add test_comm
sfc-gh-reyazda Apr 8, 2026
51aefab
add the counts for all2all op
sfc-gh-reyazda Apr 8, 2026
0b878d7
add count info/tensor for the alltoall-v functionality
sfc-gh-reyazda Apr 10, 2026
f74d305
Merge branch 'main' into comm-ops
sfc-gh-reyazda Apr 13, 2026
d0f1613
fix alltoall rcv count
sfc-gh-reyazda Apr 14, 2026
41fabf5
Merge branch 'comm-ops' of https://github.com/snowflakedb/ArcticTrain…
sfc-gh-reyazda Apr 14, 2026
6138225
fixes
Apr 24, 2026
c255f70
some tweaks to prevent OOM, need to work on fixing the gemm kernel
Apr 27, 2026
64789fa
using torch._grouped_mm
Apr 27, 2026
3 changes: 3 additions & 0 deletions arctic_training/config/trainer.py
@@ -406,6 +406,8 @@ def build_deepspeed_config(self) -> Self:
from transformers import AutoConfig

model_config = AutoConfig.from_pretrained(self.model.name_or_path)
model_config = model_config.text_config if hasattr(model_config, "text_config") else model_config

if hasattr(model_config, "hidden_size"):
hidden_size = model_config.hidden_size
elif hasattr(model_config, "hidden_sizes"):
@@ -543,6 +545,7 @@ def get_config(config_file_or_dict: Union[Path, Dict]) -> BaseConfig:

trainer_cls = get_registered_trainer(trainer_type)
config_cls = _get_class_attr_type_hints(trainer_cls, "config")[0]

config = config_cls(**config_dict)

return config
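
The added text_config fallback matters for composite checkpoints (e.g. vision-language models), where language-model hyperparameters such as hidden_size are nested one level down rather than sitting on the top-level config. A minimal sketch of the pattern, with an illustrative (not real) model id:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("some/composite-model")  # illustrative id only
# Composite configs nest the LM settings under text_config; plain LM configs
# expose hidden_size and friends at the top level, so fall through unchanged.
config = config.text_config if hasattr(config, "text_config") else config
hidden_size = getattr(config, "hidden_size", None)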
15 changes: 15 additions & 0 deletions arctic_training/kernels/comm/comm.cpp
@@ -0,0 +1,15 @@
#include "comm.h"

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
{
m.def("init_comm_group", &ds_create_comm, "create comm group");
m.def("barrier", &ds_barrier, "barrier");
m.def("broadcast", &ds_broadcast, "broadcast");
m.def("wait_comm", &wait_comm, "wait on communication event");
m.def("allReduce", &ds_allreduce, "AllReduce");
m.def("alltoall", &ds_alltoall, "AllToAll");
m.def("allGather", &ds_allgather, "AllGather");
m.def("get_nccl_uid", &ds_get_nccl_uid, "Get NCCL UID");
m.def("init_nccl_comm", &ds_create_nccl_comm, "Create NCCL Comm");
}
//////
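
These bindings mirror the declarations in comm.h. A hedged sketch of how the Python side could bootstrap the NCCL communicator with them, based only on the signatures bound above (the actual ArcticTraining call sites may differ): rank 0 fetches the NCCL unique id, shares it over an existing process group, and every rank then initializes the custom communicator.

import torch
import torch.distributed as dist
from arctic_training.op_builder import CommBuilder  # same builder comm.py uses

ds_comm = CommBuilder().load()
ranks = list(range(dist.get_world_size()))

if dist.get_rank() == 0:
    uid = ds_comm.get_nccl_uid()  # CPU uint8 tensor wrapping ncclUniqueId
else:
    uid = torch.empty(128, dtype=torch.uint8)  # sizeof(ncclUniqueId) is 128 bytes
dist.broadcast(uid, src=0)  # piggyback on an existing (e.g. gloo) process group
ds_comm.init_nccl_comm(ranks, dist.get_rank(), uid)
ds_comm.barrier()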
124 changes: 124 additions & 0 deletions arctic_training/kernels/comm/comm.cu
@@ -0,0 +1,124 @@
#include "comm.h"
#include <c10/cuda/CUDAStream.h>
#include <torch/extension.h>

#include <vector>
#include "context.h"

void ds_create_comm(std::vector<int>& comm_ranks, int rank)
{
CoMMContext::Instance().create_comm_group(comm_ranks, rank);
}

void ds_create_nccl_comm(std::vector<int>& comm_ranks, int rank, torch::Tensor& nccl_uid)
{
CoMMContext::Instance().create_nccl_comm(comm_ranks, rank, nccl_uid.data_ptr());
}

void ds_allreduce(torch::Tensor& send_buf, torch::Tensor& rcv_buf, int size, bool async_op)
{
if (async_op) CoMMContext::Instance().SynchComp();
ncclAllReduce(send_buf.data_ptr(),
rcv_buf.data_ptr(),
size,
(send_buf.scalar_type() == at::kFloat ? ncclFloat : (send_buf.scalar_type() == at::kHalf ? ncclHalf : ncclBfloat16)),
ncclSum,
CoMMContext::Instance().GetNCCLComm(),
CoMMContext::Instance().GetCommStream());
}

void ds_allgather(torch::Tensor& send_buf, torch::Tensor& rcv_buf, int size, bool async_op)
{
if (async_op) CoMMContext::Instance().SynchComp();
ncclAllGather(send_buf.data_ptr(),
rcv_buf.data_ptr(),
size,
(send_buf.scalar_type() == at::kFloat ? ncclFloat : (send_buf.scalar_type() == at::kHalf ? ncclHalf : ncclBfloat16)),
CoMMContext::Instance().GetNCCLComm(),
CoMMContext::Instance().GetCommStream());
}

void wait_comm() { CoMMContext::Instance().SynchComm(); }

void ds_broadcast(torch::Tensor& send_buf, torch::Tensor& rcv_buf, int size, bool async_op)
{
ncclBroadcast(send_buf.data_ptr(),
rcv_buf.data_ptr(),
size,
(send_buf.scalar_type() == at::kFloat ? ncclFloat : (send_buf.scalar_type() == at::kHalf ? ncclHalf : ncclBfloat16)),
0,
CoMMContext::Instance().GetNCCLComm(),
CoMMContext::Instance().GetCommStream());
}

void ds_barrier() { CoMMContext::Instance().barrier(); }

inline size_t wordSize(ncclDataType_t type) {
switch(type) {
case ncclChar:
case ncclUint8:
return 1;
case ncclHalf:
case ncclBfloat16:
return 2;
case ncclInt:
case ncclFloat:
case ncclUint32:
return 4;
case ncclInt64:
case ncclUint64:
case ncclDouble:
return 8;
default: return 0;
}
}

void ncclAlltoAll(void* sendbuff,
void* recvbuff,
int32_t *send_counts,
int32_t *recv_counts,
size_t max_count,
ncclDataType_t type,
const unsigned nRanks,
ncclComm_t comm,
cudaStream_t stream) {

size_t rankOffset = max_count * wordSize(type);

ncclGroupStart();
for (int r=0; r<nRanks; r++) {
ncclSend(((char*)sendbuff)+r*rankOffset, send_counts[r], type, r, comm, stream);
ncclRecv(((char*)recvbuff)+r*rankOffset, recv_counts[r], type, r, comm, stream);
}
ncclGroupEnd();
}

void ds_alltoall(torch::Tensor& send_buf, torch::Tensor& rcv_buf, torch::Tensor& send_counts, torch::Tensor& recv_counts, size_t max_count, bool async_op)
{
ncclAlltoAll(send_buf.data_ptr(),
rcv_buf.data_ptr(),
(int32_t*)send_counts.data_ptr(),
(int32_t*)recv_counts.data_ptr(),
max_count,
(send_buf.scalar_type() == at::kFloat ?
ncclFloat :
(send_buf.scalar_type() == at::kHalf ?
ncclHalf :
(send_buf.scalar_type() == torch::kInt8 ? ncclUint8 : ncclBfloat16))),
CoMMContext::Instance().GetNumRanks(),
CoMMContext::Instance().GetNCCLComm(),
CoMMContext::Instance().GetCommStream());
}

torch::Tensor ds_get_nccl_uid()
{

auto options = at::TensorOptions()
.dtype(torch::kUInt8)
.layout(torch::kStrided)
.device(torch::kCPU)
.requires_grad(false);
auto nccl_uid = CoMMContext::Instance().get_nccl_uid();
auto uid_tensor = torch::from_blob((void*)&nccl_uid, {sizeof(ncclUniqueId)}, options);
return uid_tensor;
}
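
Note the layout contract in ncclAlltoAll above: rankOffset = max_count * wordSize(type), so both buffers are partitioned into fixed slots of max_count elements per peer, and only send_counts[r] / recv_counts[r] elements of each slot actually cross the wire, i.e. a padded all-to-all-v. A small sketch of packing a send buffer under that contract (my reading of the kernel, counts in elements of the tensor dtype):

import torch

world, max_count = 4, 8
send_counts = [3, 8, 1, 5]  # elements destined for each peer; max is 8
send_buf = torch.zeros(world * max_count, device="cuda")
for r, n in enumerate(send_counts):
    # peer r's slot is [r * max_count, (r + 1) * max_count); the tail is padding
    send_buf[r * max_count : r * max_count + n] = torch.randn(n, device="cuda")
recv_buf = torch.empty_like(send_buf)
# ds_alltoall(send_buf, recv_buf, send_counts_t, recv_counts_t, max_count, False)
# then sends counts[r] elements out of each fixed-stride slot; the padding
# never crosses the wire.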
15 changes: 15 additions & 0 deletions arctic_training/kernels/comm/comm.h
@@ -0,0 +1,15 @@

#include <torch/extension.h>
#include <stdint.h>

#include "stdio.h"

void ds_barrier();
void wait_comm();
void ds_create_comm(std::vector<int>& comm_ranks, int rank);
void ds_allreduce(torch::Tensor& send_buf, torch::Tensor& rcv_buf, int size, bool async_op);
void ds_allgather(torch::Tensor& send_buf, torch::Tensor& rcv_buf, int size, bool async_op);
void ds_broadcast(torch::Tensor& send_buf, torch::Tensor& rcv_buf, int size, bool async_op);
void ds_alltoall(torch::Tensor& send_buf, torch::Tensor& rcv_buf, torch::Tensor& send_counts, torch::Tensor& recv_counts, size_t max_count, bool async_op);
torch::Tensor ds_get_nccl_uid();
void ds_create_nccl_comm(std::vector<int>& comm_ranks, int rank, torch::Tensor& nccl_uid);
138 changes: 138 additions & 0 deletions arctic_training/kernels/comm/comm.py
@@ -0,0 +1,138 @@
# Copyright 2025 Snowflake Inc.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import torch

from arctic_training.op_builder import CommBuilder

from .layout import Layout

ds_comm = None


class Comm:
current_comm = None

def __init__(self, layout: Layout, local_rank: int):
global ds_comm
if ds_comm is None:
ds_comm = CommBuilder().load()

self.ds_comm = ds_comm
self._layout = layout
self.my_rank = local_rank
self.global_rank = torch.distributed.get_rank()
self.group_size = layout._group_size

self.global_ranks = layout.sibling_ranks(self.global_rank)
self.local_ranks = list(range(self.group_size))
self.rank_map = dict(zip(self.global_ranks, self.local_ranks))
self.counts_pinned_data = torch.empty(1024, dtype=torch.int32, device="cpu", pin_memory=True)
self.recv_counts_pinned_data = torch.empty(1024, dtype=torch.int32, device="cpu", pin_memory=True)

print("Initializing comm ...")

def all_reduce(self, val, inplace=True, async_op=False):
val_sum = val if inplace else torch.empty_like(val)
op = communicate_op(val, val_sum, async_op, op_type="all_reduce")
return val_sum, op

def all_gather(self, val, inplace=True, async_op=False):
val_gather = torch.empty((self.group_size * val.size(0), *val.shape[1:]), device=val.device, dtype=val.dtype)
op = communicate_op(val, val_gather, async_op, op_type="all_gather")
return val_gather, op

def all_to_all(
self, val, counts=None, receive_counts=None, max_count=None, result=None, inplace=True, async_op=False
):
if counts is not None:
if receive_counts is None:
receive_counts = torch.empty_like(counts)
torch.distributed.all_to_all_single(receive_counts, counts)

self.counts_pinned_data[: receive_counts.numel()].copy_(counts)

if max_count is None:
max_count = counts.max()
torch.distributed.all_reduce(max_count, op=torch.distributed.ReduceOp.MAX)

max_count = max_count.item()

receive_counts = self.recv_counts_pinned_data[: receive_counts.numel()].copy_(receive_counts)
counts = self.counts_pinned_data[: counts.numel()]
else:
max_count = val.size(0) // self.group_size
counts = torch.full((self.group_size,), max_count, device="cpu", dtype=torch.int32)
receive_counts = counts

result = result if result is not None else torch.empty_like(val)
op = communicate_op(
val,
result,
async_op,
world_size=self.group_size,
op_type="all_to_all",
send_counts=counts,
recv_counts=receive_counts,
max_count=max_count,
)
return result, op

def broadcast(self, val, inplace=True, async_op=False):
val_bcst = torch.empty_like(val)
op = communicate_op(val, val_bcst, async_op, op_type="broadcast")
return val_bcst, op

def barrier(self):
ds_comm.wait_comm()
ds_comm.barrier()

@classmethod
def get_current_comm(cls):
if cls.current_comm is None:
from arctic_training.kernels.comm.nccl import NcclComm

cls.current_comm = NcclComm()
return cls.current_comm


class communicate_op:
def __init__(
self,
val,
result,
async_op,
world_size=None,
op_type="all_reduce",
send_counts=None,
recv_counts=None,
max_count=None,
):
if op_type == "all_reduce":
ds_comm.allReduce(val, result, val.numel(), async_op)
elif op_type == "all_gather":
ds_comm.allGather(val, result, val.numel(), async_op)
elif op_type == "all_to_all":
ds_comm.alltoall(val, result, send_counts, recv_counts, max_count, async_op)
elif op_type == "broadcast":
ds_comm.broadcast(val, result, val.numel(), async_op)

def wait(self):
ds_comm.wait_comm()


def get_default_comm():
return Comm.get_current_comm()
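
A hedged usage sketch of the wrapper above (construction via get_default_comm assumes NcclComm exposes the same interface as Comm; real call sites in the trainer may differ). Every collective returns a (tensor, handle) pair, and wait() drains the dedicated comm stream:

import torch

comm = get_default_comm()  # NcclComm() singleton per get_current_comm above
x = torch.randn(4096, device="cuda")

summed, op = comm.all_reduce(x, async_op=True)
op.wait()  # blocks until the comm stream has finished

# Variable-count path: per-peer counts are exchanged with all_to_all_single,
# the max is all-reduced, and the padded NCCL alltoall from comm.cu moves the data.
counts = torch.full((comm.group_size,), x.numel() // comm.group_size, dtype=torch.int32, device="cuda")
out, op = comm.all_to_all(x, counts=counts)
op.wait()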