diff --git a/runtime/cpp/include/fletcher/context.h b/runtime/cpp/include/fletcher/context.h index fa8851855..58a534869 100644 --- a/runtime/cpp/include/fletcher/context.h +++ b/runtime/cpp/include/fletcher/context.h @@ -49,7 +49,18 @@ enum class MemType { * Selecting CACHE may result in higher performance if there is data reuse by the kernel, but may result in lower * performance if the data is not reused by the kernel (for example fully streamable kernels). */ - CACHE + CACHE, + + /** + * @brief Allocate a buffer for the data on the on-board memory of the device, but don't copy the buffer contents to it. + * + * If available, this forces a buffer to be allocated on device on-board memory (e.g. some separate DRAM chips sitting + * on the accelerator PCB next to the FPGA, but it could be HBM on top of the FPGA fabric in the same chip, or + * BRAM. This depends on the platform). However, the data currently in the provided buffer is not copied to the device; + * the device memory is left uninitialized. This is useful for output buffers, or if you want to handle initialization + * manually. + */ + ALLOC_ONLY }; /// A buffer on the device diff --git a/runtime/cpp/src/fletcher/context.cc b/runtime/cpp/src/fletcher/context.cc index 7c49b22a8..58cfabb7b 100644 --- a/runtime/cpp/src/fletcher/context.cc +++ b/runtime/cpp/src/fletcher/context.cc @@ -67,6 +67,10 @@ Status Context::Enable() { device_buf.size); // Cache always allocates on device. 
device_buf.was_alloced = true; + } else if (type == MemType::ALLOC_ONLY) { + status = platform_->DeviceMalloc(&device_buf.device_address, + device_buf.size); + device_buf.was_alloced = true; } else { status = Status::ERROR("Invalid / unsupported MemType."); } diff --git a/runtime/python/pyfletcher/context.pxi b/runtime/python/pyfletcher/context.pxi index dcd8b0ed9..0b7d320ab 100644 --- a/runtime/python/pyfletcher/context.pxi +++ b/runtime/python/pyfletcher/context.pxi @@ -51,6 +51,7 @@ cdef class Context(): record_batch : Arrow RecordBatch to queue memtype (str): Memory type: - 'any' results in least effort to make data available to FPGA (depending on the platform implementation). - 'cache' force copy to accelerator on-board DRAM memory, if available. + - 'alloc_only' force allocation on accelerator on-board DRAM memory (if available), but don't copy the data; the device memory is left uninitialized. """ @@ -60,6 +61,8 @@ cdef class Context(): queue_mem_type = MemType.ANY elif mem_type == "cache": queue_mem_type = MemType.CACHE + elif mem_type == "alloc_only": + queue_mem_type = MemType.ALLOC_ONLY else: - raise ValueError("mem_type argument can be only 'any' or 'cache'") + raise ValueError("mem_type argument can be only 'any', 'cache' or 'alloc_only'") diff --git a/runtime/python/pyfletcher/includes/libfletcher.pxd b/runtime/python/pyfletcher/includes/libfletcher.pxd index 024a6fba8..e36ad62c1 100644 --- a/runtime/python/pyfletcher/includes/libfletcher.pxd +++ b/runtime/python/pyfletcher/includes/libfletcher.pxd @@ -37,8 +37,9 @@ cdef extern from "fletcher/arrow-utils.h" namespace "fletcher": cdef extern from "fletcher/context.h" namespace "fletcher": cdef enum MemType: - ANY "fletcher::MemType::ANY", - CACHE "fletcher::MemType::CACHE" + ANY "fletcher::MemType::ANY", + CACHE "fletcher::MemType::CACHE", + ALLOC_ONLY "fletcher::MemType::ALLOC_ONLY" cdef extern from "fletcher/api.h" namespace "fletcher" nogil: