diff --git a/.gitignore b/.gitignore
index 52c3f813..dab4c354 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ input/*
 !input/sphere1.obj
 !input/bunnyhead.obj
 build/
+build_debug/
 include/rxmesh/util/git_sha1.cpp
 .vscode/
-scripts/*.log
\ No newline at end of file
+scripts/*.log
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d036c085..4ed2dc31 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,22 @@ FetchContent_Declare(cereal
 )
 FetchContent_Populate(cereal)
 
+# Package Management
+# TODO: Consider using CPM for the various libraries above 
+include(cmake/CPM.cmake) 
+
+# Add cuCollections (cuco) from a fork that provides the priority queue,
+# pinned to a fixed commit. This should eventually come from NVIDIA.
+CPMAddPackage(
+    NAME cuco
+    GITHUB_REPOSITORY andrewbriand/cuCollections
+    GIT_TAG d58dd9fedde721a264c8ae960f7393a3a3b08c58
+    OPTIONS
+       "BUILD_TESTS OFF"
+       "BUILD_BENCHMARKS OFF"
+       "BUILD_EXAMPLES OFF"
+)
+
 # Auto-detect GPU architecture
 include("cmake/AutoDetectCudaArch.cmake")
 
@@ -133,7 +149,7 @@ set(cxx_flags
 
 set(MSVC_XCOMPILER_FLAGS "/openmp:experimental /MP /std:c++17 /Zi")
 set(cuda_flags
-    -Xcompiler=$<$<CXX_COMPILER_ID:GNU>:-Wall -fopenmp -O3 -Wno-unused-function>
+    -Xcompiler=$<$<CXX_COMPILER_ID:GNU>:-rdynamic -Wall -fopenmp -O3 -Wno-unused-function>
 	-Xcompiler=$<$<CXX_COMPILER_ID:MSVC>:${MSVC_XCOMPILER_FLAGS}>
     #Disables warning
     #177-D "function XXX was declared but never referenced"
diff --git a/apps/CMakeLists.txt b/apps/CMakeLists.txt
index 55d14245..4fc91b42 100644
--- a/apps/CMakeLists.txt
+++ b/apps/CMakeLists.txt
@@ -4,12 +4,12 @@ add_subdirectory(MCF)
 add_subdirectory(Geodesic)
 add_subdirectory(Delaunay)
 add_subdirectory(GaussianCurvature)
-add_subdirectory(XPBD )
+add_subdirectory(XPBD)
 #add_subdirectory(Simplification)
 add_subdirectory(ShortestEdgeCollapse)
 add_subdirectory(Remesh)
+add_subdirectory(SECPriority)
 add_subdirectory(SurfaceTracking)
 add_subdirectory(SCP)
 add_subdirectory(ARAP)
-add_subdirectory(Heat)
-
+add_subdirectory(Heat)
\ No newline at end of file
diff --git a/apps/SECPriority/CMakeLists.txt b/apps/SECPriority/CMakeLists.txt
new file mode 100644
index 00000000..4f391430
--- /dev/null
+++ b/apps/SECPriority/CMakeLists.txt
@@ -0,0 +1,41 @@
+add_executable(SECPriority)
+
+set(SOURCE_LIST
+    # main.cu is a standalone cuco priority-queue demo with its own main(); not built
+    secp.cu
+	secp_rxmesh.cuh
+	secp_kernels.cuh
+)
+
+set(COMMON_LIST    
+    ../common/openmesh_trimesh.h
+	../common/openmesh_report.h
+)
+
+target_sources(SECPriority 
+    PRIVATE
+    ${SOURCE_LIST} ${COMMON_LIST}
+)
+
+if (WIN32)
+    target_compile_definitions(SECPriority
+      PRIVATE _USE_MATH_DEFINES 
+      PRIVATE NOMINMAX
+      PRIVATE _CRT_SECURE_NO_WARNINGS)
+endif()
+
+set_target_properties(SECPriority PROPERTIES FOLDER "apps")
+
+set_property(TARGET SECPriority PROPERTY CUDA_SEPARABLE_COMPILATION ON)
+
+source_group(TREE ${CMAKE_CURRENT_LIST_DIR} PREFIX "SECPriority" FILES ${SOURCE_LIST})
+
+target_link_libraries(SECPriority     
+    PRIVATE RXMesh
+    PRIVATE gtest_main
+	PRIVATE OpenMeshCore
+    PRIVATE OpenMeshTools
+    PRIVATE cuco
+)
+
+#gtest_discover_tests( SECPriority )
\ No newline at end of file
diff --git a/apps/SECPriority/main.cu b/apps/SECPriority/main.cu
new file mode 100644
index 00000000..e6062fd6
--- /dev/null
+++ b/apps/SECPriority/main.cu
@@ -0,0 +1,96 @@
+#include <cuco/priority_queue.cuh>
+#include <cuco/detail/pair.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+#include <cooperative_groups.h>
+#include <cuda_runtime.h>
+
+#include <map>
+#include <vector>
+
+#include <iostream>
+#include <random>
+
+using namespace cuco;
+namespace cg = cooperative_groups;
+
+// grab some bits from priority queue tests and benchmarks
+
+// -- simulate reading the mesh, computing edge length
+// -- cuco:pair<float, uint32_t>
+//
+// setup pair_less template
+//
+// setup device function to pop items from queue
+//
+
+template <typename T>
+struct pair_less  // orders by `first` only; `second` is ignored
+{
+    __host__ __device__ bool operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs.first < rhs.first;
+    }
+};
+
+template <typename PairType, typename OutputIt>
+void generate_kv_pairs_uniform(OutputIt output_begin, OutputIt output_end)
+{
+  // first: raw mt19937 draw (engine seeded from random_device); second: 0, 1, 2, ...
+  using Key   = typename PairType::first_type;
+  using Value = typename PairType::second_type;
+  std::random_device seed_source;
+  std::mt19937       engine{seed_source()};
+  int index = 0;
+  for (auto out = output_begin; out != output_end; ++out, ++index) {
+    *out = {static_cast<Key>(engine()), static_cast<Value>(index)};
+  }
+}
+
+void sp_pair()
+{
+  // Demo of the host-side queue API: push 200 random (priority, id) pairs,
+  // then pop 100 of them and print both sets.
+  using PairType = cuco::pair<float, uint32_t>;
+  using Compare  = pair_less<PairType>;
+  const size_t insertion_size = 200;
+  const size_t deletion_size  = 100;
+
+  cuco::priority_queue<PairType, Compare> pq(insertion_size);
+
+  // Host-side input, echoed before it goes into the queue
+  std::vector<PairType> h_pairs(insertion_size);
+  generate_kv_pairs_uniform<PairType>(h_pairs.begin(), h_pairs.end());
+
+  for (const PairType& entry : h_pairs) {
+    std::cout << "Priority: " << entry.first
+              << "\tID: " << entry.second << "\n";
+  }
+
+  // Copy to the device and push everything
+  thrust::device_vector<PairType> d_pairs(h_pairs);
+  pq.push(d_pairs.begin(), d_pairs.end());
+  cudaDeviceSynchronize();
+
+  // Pop into a device buffer of deletion_size elements
+  thrust::device_vector<PairType> d_popped(deletion_size);
+  pq.pop(d_popped.begin(), d_popped.end());
+  cudaDeviceSynchronize();
+
+  // Copy the popped pairs back to the host and print them
+  std::cout << "-----After Pop-----\n";
+  thrust::host_vector<PairType> h_popped(d_popped);
+  for (const PairType& entry : h_popped) {
+    std::cout << "Priority: " << entry.first
+              << "\tID: " << entry.second << "\n";
+  }
+}
+
+int main(int argc, char* argv[])  // runs the demo; argc/argv are not used
+{
+  sp_pair();
+
+  return 0;
+}
\ No newline at end of file
diff --git a/apps/SECPriority/secp.cu b/apps/SECPriority/secp.cu
new file mode 100644
index 00000000..8da2cfe8
--- /dev/null
+++ b/apps/SECPriority/secp.cu
@@ -0,0 +1,102 @@
+#include "gtest/gtest.h"
+#include "rxmesh/util/log.h"
+#include "rxmesh/util/macros.h"
+#include "rxmesh/util/util.h"
+
+#include <filesystem>
+
+struct arg  // command-line options; the single global instance is Arg
+{
+    std::string obj_file_name = STRINGIFY(INPUT_DIR) "dragon.obj";  // -input: OBJ file
+    std::string output_folder = STRINGIFY(OUTPUT_DIR);  // -o: JSON file output folder
+    float       target        = 0.1;  // -target: fraction of output #vertices from the input
+    float       edgefrac = 0.1;  // -edgefrac: fraction of edges to collapse in a round
+    uint32_t    device_id     = 0;  // -device_id: GPU device ID
+    char**      argv;  // raw command line, stored by main()
+    int         argc;  // raw argument count, stored by main()
+} Arg;
+
+#include "secp_rxmesh.cuh"
+
+TEST(Apps, SECPriority)
+{
+    using namespace rxmesh;
+
+    // Select device
+    cuda_query(Arg.device_id);
+
+    // The mesh is built with a "<name>_patches" path under OUTPUT_DIR; that
+    // file is written after construction only if it does not exist yet.
+    const std::string patches_path =
+        STRINGIFY(OUTPUT_DIR) + extract_file_name(Arg.obj_file_name) + "_patches";
+    RXMeshDynamic rx(Arg.obj_file_name, patches_path);
+    if (std::filesystem::exists(patches_path) == false) {
+        rx.save(patches_path);
+    }
+
+    // the test stops here unless the input is edge-manifold and closed
+    ASSERT_TRUE(rx.is_edge_manifold());
+    ASSERT_TRUE(rx.is_closed());
+
+    // simplify until the vertex count is down to this fraction of the input
+    const uint32_t vertex_budget = Arg.target * rx.get_num_vertices();
+
+    secp_rxmesh(rx, vertex_budget, Arg.edgefrac);
+}
+
+
+// Parses the command-line options into the global Arg, then runs all gtest tests.
+int main(int argc, char** argv)
+{
+    using namespace rxmesh;
+    Log::init();
+
+    ::testing::InitGoogleTest(&argc, argv);
+    Arg.argv = argv;
+    Arg.argc = argc;
+
+    if (argc > 1) {
+        if (cmd_option_exists(argv, argv + argc, "-h")) {
+            // clang-format off
+            RXMESH_INFO("\nUsage: SECPriority.exe < -option X>\n"
+                        " -h:          Display this message and exit\n"
+                        " -input:      Input file. Input file should be under the input/ subdirectory\n"
+                        "              Default is {} \n"
+                        "              Hint: Only accept OBJ files\n"
+                        " -target:     The fraction of output #vertices from the input\n"
+                        " -edgefrac:   The fraction of edges to collapse in a round\n"
+                        " -o:          JSON file output folder. Default is {} \n"
+                        " -device_id:  GPU device ID. Default is {}",
+            Arg.obj_file_name, Arg.output_folder, Arg.device_id);
+            // clang-format on
+            exit(EXIT_SUCCESS);
+        }
+
+        if (cmd_option_exists(argv, argv + argc, "-input")) {
+            Arg.obj_file_name =
+                std::string(get_cmd_option(argv, argv + argc, "-input"));
+        }
+        if (cmd_option_exists(argv, argv + argc, "-o")) {
+            Arg.output_folder =
+                std::string(get_cmd_option(argv, argv + argc, "-o"));
+        }
+        if (cmd_option_exists(argv, argv + argc, "-device_id")) {
+            Arg.device_id =
+                atoi(get_cmd_option(argv, argv + argc, "-device_id"));
+        }
+        if (cmd_option_exists(argv, argv + argc, "-target")) {
+            Arg.target = atof(get_cmd_option(argv, argv + argc, "-target"));
+        }
+        if (cmd_option_exists(argv, argv + argc, "-edgefrac")) {
+            Arg.edgefrac = atof(get_cmd_option(argv, argv + argc, "-edgefrac"));
+        }
+    }
+
+    RXMESH_TRACE("input= {}", Arg.obj_file_name);
+    RXMESH_TRACE("output_folder= {}", Arg.output_folder);
+    RXMESH_TRACE("device_id= {}", Arg.device_id);
+    RXMESH_TRACE("target= {}", Arg.target);
+    RXMESH_TRACE("edgefrac= {}", Arg.edgefrac);
+
+    return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/apps/SECPriority/secp_kernels.cuh b/apps/SECPriority/secp_kernels.cuh
new file mode 100644
index 00000000..bd05b62d
--- /dev/null
+++ b/apps/SECPriority/secp_kernels.cuh
@@ -0,0 +1,225 @@
+#pragma once
+#include "rxmesh/cavity_manager.cuh"
+#include "../ShortestEdgeCollapse/link_condition.cuh"
+
+#include <cooperative_groups.h>
+#include <cuda_runtime.h>
+
+// Edge-collapse kernel. Edges flagged in e_pop_attr that are still set in
+// edge_mask after link_condition() each seed a cavity, which is re-filled
+// around a new vertex at the edge midpoint. reduce_threshold is not read.
+template <typename T, uint32_t blockThreads>
+__global__ static void secp(rxmesh::Context                   context,
+                            rxmesh::VertexAttribute<T>        coords,
+                            const int                         reduce_threshold,
+                            rxmesh::EdgeAttribute<bool>       e_pop_attr)
+{
+    using namespace rxmesh;
+    auto           block = cooperative_groups::this_thread_block();
+    ShmemAllocator shrd_alloc;
+    CavityManager<blockThreads, CavityOp::EV> cavity(
+        block, context, shrd_alloc, true);
+
+    const uint32_t pid = cavity.patch_id();
+
+    if (pid == INVALID32) {
+        return;
+    }
+
+    // we first use this mask to set the edge we want to collapse (and then
+    // filter them). Then after cavity.prologue, we reuse this bitmask to mark
+    // the newly added edges
+    Bitmask edge_mask(cavity.patch_info().edges_capacity[0], shrd_alloc);
+    edge_mask.reset(block);
+
+    // we use this bitmask to mark the other end of to-be-collapse edge during
+    // checking for the link condition
+    Bitmask v0_mask(cavity.patch_info().num_vertices[0], shrd_alloc);
+    Bitmask v1_mask(cavity.patch_info().num_vertices[0], shrd_alloc);
+
+    // Precompute EV
+    Query<blockThreads> ev_query(context, pid);
+    ev_query.prologue<Op::EV>(block, shrd_alloc);
+    block.sync();
+
+    // 1a) mark edge we want to collapse given e_pop_attr
+    for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
+        assert(eh.local_id() < cavity.patch_info().num_edges[0]);
+
+        if(true == e_pop_attr(eh))
+        {
+           edge_mask.set(eh.local_id(), true);
+        }
+
+    });
+    block.sync();
+
+    // 2a) check edge link condition.
+    link_condition(block, cavity.patch_info(), ev_query, 
+                   edge_mask, v0_mask, v1_mask);
+    block.sync();
+    // 3a) seed a cavity from every edge that is still marked in edge_mask
+    for_each_edge(cavity.patch_info(), [&](EdgeHandle eh) {
+        assert(eh.local_id() < cavity.patch_info().num_edges[0]);
+        if (edge_mask(eh.local_id())) {
+            cavity.create(eh);
+        } 
+    });
+    block.sync();
+
+    ev_query.epilogue(block, shrd_alloc);
+
+    // create the cavity
+    if (cavity.prologue(block, shrd_alloc, coords)) {
+        edge_mask.reset(block);
+        block.sync();
+
+        // fill in the cavities
+        cavity.for_each_cavity(block, [&](uint16_t c, uint16_t size) {
+            const EdgeHandle src = cavity.template get_creator<EdgeHandle>(c);
+
+            // TODO handle boundary edges
+
+            VertexHandle v0, v1;
+
+            cavity.get_vertices(src, v0, v1);
+
+            const VertexHandle new_v = cavity.add_vertex();
+
+            if (new_v.is_valid()) {
+                // place the new vertex at the midpoint of v0 and v1
+                coords(new_v, 0) = (coords(v0, 0) + coords(v1, 0)) * T(0.5);
+                coords(new_v, 1) = (coords(v0, 1) + coords(v1, 1)) * T(0.5);
+                coords(new_v, 2) = (coords(v0, 2) + coords(v1, 2)) * T(0.5);
+
+                DEdgeHandle e0 =
+                    cavity.add_edge(new_v, cavity.get_cavity_vertex(c, 0));
+
+                if (e0.is_valid()) {
+                    edge_mask.set(e0.local_id(), true);
+                    // keep the first new edge; the last face reuses its flip
+                    const DEdgeHandle e_init = e0;
+
+                    for (uint16_t i = 0; i < size; ++i) {
+                        const DEdgeHandle e = cavity.get_cavity_edge(c, i);
+
+                        const VertexHandle v_end =
+                            cavity.get_cavity_vertex(c, (i + 1) % size);
+
+                        const DEdgeHandle e1 =
+                            (i == size - 1) ?
+                                e_init.get_flip_dedge() :
+                                cavity.add_edge(
+                                    cavity.get_cavity_vertex(c, i + 1), new_v);
+
+                        if (!e1.is_valid()) {
+                            break;
+                        }
+                        // no edge is added on the last iteration, so nothing to mark
+                        if (i != size - 1) {
+                            edge_mask.set(e1.local_id(), true);
+                        }
+
+                        const FaceHandle new_f = cavity.add_face(e0, e, e1);
+
+                        if (!new_f.is_valid()) {
+                            break;
+                        }
+                        e0 = e1.get_flip_dedge();
+                    }
+                }
+            }
+        });
+    }
+
+    cavity.epilogue(block);
+    block.sync();
+}
+
+// For the patch handled by this block's Query, computes the squared length of
+// each edge the EV query visits and pushes {length^2, packed 32-bit edge id}
+// pairs into pq_view. pq_num_bytes is the shared-memory scratch size for push.
+template <typename T, uint32_t blockThreads>
+__global__ static void  compute_edge_priorities(
+    rxmesh::Context                  context,
+    const rxmesh::VertexAttribute<T> coords,
+    PQView_t                         pq_view,
+    size_t                           pq_num_bytes)
+{
+    using namespace rxmesh;
+    auto           block = cooperative_groups::this_thread_block();
+    ShmemAllocator shrd_alloc;
+
+    Query<blockThreads> query(context);
+    auto intermediatePairs = shrd_alloc.alloc<PriorityPair_t>(query.get_patch_info().num_edges[0]);
+    // Number of pairs staged so far. A single thread resets it and the
+    // barrier publishes the reset before any thread can atomicAdd to it.
+    __shared__ int pair_counter;
+    if (block.thread_rank() == 0) {
+        pair_counter = 0;
+    }
+    block.sync();
+
+    auto edge_len = [&](const EdgeHandle& eh, const VertexIterator& iter) {
+        const VertexHandle v0 = iter[0];
+        const VertexHandle v1 = iter[1];
+
+        const Vec3<T> p0(coords(v0, 0), coords(v0, 1), coords(v0, 2));
+        const Vec3<T> p1(coords(v1, 0), coords(v1, 1), coords(v1, 2));
+
+        T len2 = glm::distance2(p0, p1);
+
+        auto p_e = rxmesh::detail::unpack(eh.unique_id());
+
+        // repack the EdgeHandle into smaller 32 bits for
+        // use with priority queue. Need to check elsewhere
+        // that there are less than 2^16 patches.
+        auto id32 = unique_id32(p_e.second, (uint16_t)p_e.first);
+
+        PriorityPair_t p{len2, id32};
+        // claim a unique slot in the staging array
+        auto val_counter = atomicAdd(&pair_counter, 1);
+        intermediatePairs[val_counter] = p;
+    };
+
+    query.dispatch<Op::EV>(block, shrd_alloc, edge_len);
+    block.sync();
+
+    char * pq_shrd_mem = shrd_alloc.alloc(pq_num_bytes);
+    pq_view.push(block, intermediatePairs, intermediatePairs + pair_counter, pq_shrd_mem);
+}
+
+// Each block cooperatively pops blockThreads pairs from the queue into shared
+// memory; thread i then flags the edge encoded in pair i, for the first
+// pop_num_edges threads of the grid. Expects blockDim.x == blockThreads.
+template <uint32_t blockThreads>
+__global__ static void pop_and_mark_edges_to_collapse(
+    PQView_t pq_view,
+    rxmesh::EdgeAttribute<bool> marked_edges,
+    uint32_t pop_num_edges)
+{
+    namespace cg = cooperative_groups;
+    using namespace rxmesh;
+    ShmemAllocator      shrd_alloc;
+
+    // staging array for the popped pairs + scratch space for the queue
+    auto intermediatePairs = shrd_alloc.alloc<PriorityPair_t>(blockThreads);
+    char * pq_shrd_mem = shrd_alloc.alloc(pq_view.get_shmem_size(blockThreads));
+    cg::thread_block g = cg::this_thread_block();
+    pq_view.pop(g, intermediatePairs, intermediatePairs + blockThreads, pq_shrd_mem);
+    // a thread reads a slot that another thread may have filled in
+    g.sync();
+
+    const uint32_t tid       = blockIdx.x * blockDim.x + threadIdx.x;
+    const uint32_t local_tid = threadIdx.x;
+
+    // Make sure the index is within bounds
+    if(tid < pop_num_edges)
+    {
+        //unpack the uid to get the patch and edge ids
+        auto p_e = unpack32(intermediatePairs[local_tid].second);
+        rxmesh::EdgeHandle eh(p_e.first, rxmesh::LocalEdgeT(p_e.second));
+        //use the eh to index into a passed in edge attribute
+        marked_edges(eh) = true;
+    }
+}
diff --git a/apps/SECPriority/secp_rxmesh.cuh b/apps/SECPriority/secp_rxmesh.cuh
new file mode 100644
index 00000000..76270d97
--- /dev/null
+++ b/apps/SECPriority/secp_rxmesh.cuh
@@ -0,0 +1,358 @@
+#pragma once
+
+#define GLM_ENABLE_EXPERIMENTAL
+#include <glm/glm.hpp>
+#include <glm/gtx/norm.hpp>
+
+
+#include "rxmesh/query.cuh"
+#include "rxmesh/rxmesh_dynamic.h"
+
+// Priority Queue related includes
+#include <cuco/priority_queue.cuh>
+#include <cuco/detail/pair.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+/**
+ * @brief Return unique index of the local mesh element composed by the
+ * patch id and the local index
+ *
+ * @param local_id the local within-patch mesh element id
+ * @param patch_id the patch owning the mesh element
+ * @return 32-bit id: patch_id in the high 16 bits, local_id in the low 16 bits
+ */
+constexpr __device__ __host__ __forceinline__ uint32_t
+unique_id32(const uint16_t local_id, const uint16_t patch_id)
+{
+    // patch id occupies the high 16 bits, local id the low 16 bits
+    const uint32_t high_bits = static_cast<uint32_t>(patch_id) << 16;
+    const uint32_t low_bits  = local_id;
+    return high_bits | low_bits;
+}
+
+/**
+ * @brief Unpack a 32-bit unsigned integer into its high and low 16 bits.
+ * This is used to convert the unique id to its local id (16
+ * low bit) and patch id (high 16 bit)
+ * @param uid unique id
+ * @return a std::pair storing the patch id and local id
+ */
+constexpr __device__ __host__ __forceinline__ std::pair<uint16_t, uint16_t>
+                                              unpack32(uint32_t uid)
+{
+    // high half -> patch id (first), low half -> local id (second)
+    const uint16_t hi = static_cast<uint16_t>(uid >> 16);
+    const uint16_t lo = static_cast<uint16_t>(uid & 0xFFFFu);
+    return std::make_pair(hi, lo);
+}
+
+// Priority queue setup. Use 'pair_less' to prioritize smaller values.
+template <typename T>
+struct pair_less  // compares `first` only; `second` is ignored
+{
+    __host__ __device__ bool operator()(const T& lhs, const T& rhs) const
+    {
+        return lhs.first < rhs.first;
+    }
+};
+
+using PriorityPair_t        = cuco::pair<float, uint32_t>;
+using PriorityCompare       = pair_less<PriorityPair_t>;
+using PriorityQueue_t       = cuco::priority_queue<PriorityPair_t, PriorityCompare>;
+using PQView_t              = PriorityQueue_t::device_mutable_view;
+
+
+template <typename T>
+using Vec3 = glm::vec<3, T, glm::defaultp>;
+
+#include "secp_kernels.cuh"
+
+#include "rxmesh/util/report.h"
+
+// Debug helper: shows on the polyscope mesh which edges are flagged in
+// edge_attr (scalar 200) versus the rest (scalar = patch id).
+// The body is compiled only with USE_POLYSCOPE, like the other polyscope
+// calls in this file; without it the function does nothing.
+template <typename T>
+void render_edge_attr(rxmesh::RXMeshDynamic& rx,
+    const std::shared_ptr<rxmesh::EdgeAttribute<T>>& edge_attr)
+{
+#if USE_POLYSCOPE
+    using namespace rxmesh;
+    //make sure the attribute is on the HOST
+    edge_attr->move(DEVICE, HOST);
+
+    std::vector<float> edgeColors(rx.get_num_edges());
+    rx.for_each_edge(HOST, [&](EdgeHandle eh) {
+        edgeColors[rx.linear_id(eh)] =
+            (true == (*edge_attr)(eh)) ? 200.0f : float(eh.patch_id());
+    });
+
+    auto ps_mesh = rx.get_polyscope_mesh();
+    auto edge_colors =
+        ps_mesh->addEdgeScalarQuantity("Edges to Collapse", edgeColors);
+    edge_colors->setEnabled(true);
+#endif
+}
+
+/**
+ * @brief Shortest-edge collapse driven by a GPU priority queue. Every round
+ * rebuilds the queue from the current squared edge lengths, pops the shortest
+ * edges, marks them, and collapses the marked edges patch by patch; rounds
+ * repeat while the mesh has more than final_num_vertices vertices.
+ * @param rx mesh to simplify in place
+ * @param final_num_vertices vertex count at which simplification stops
+ * @param edge_reduce_ratio fraction of the current edges popped per round
+ */
+inline void secp_rxmesh(rxmesh::RXMeshDynamic& rx,
+                       const uint32_t         final_num_vertices,
+                       const float edge_reduce_ratio)
+{
+    EXPECT_TRUE(rx.validate());
+
+    using namespace rxmesh;
+    constexpr uint32_t blockThreads = 256;
+
+    rxmesh::Report report("SECP_RXMesh");
+    report.command_line(Arg.argc, Arg.argv);
+    report.device();
+    report.system();
+    report.model_data(Arg.obj_file_name + "_before", rx, "model_before");
+    report.add_member("method", std::string("RXMesh"));
+    report.add_member("blockThreads", blockThreads);
+
+    auto coords = rx.get_input_vertex_coordinates();
+
+    LaunchBox<blockThreads> launch_box;
+
+    float total_time   = 0;
+    float app_time     = 0;
+    float slice_time   = 0;
+    float cleanup_time = 0;
+    float pq_time      = 0;
+    float pop_mark_time      = 0;
+    float e_priority_time      = 0;
+
+    auto e_pop_attr = rx.add_edge_attribute<bool>("ePop", 1);
+
+    RXMESH_INFO("#Vertices {}", rx.get_num_vertices());
+    RXMESH_INFO("#Edges {}", rx.get_num_edges());
+    RXMESH_INFO("#Faces {}", rx.get_num_faces());
+    RXMESH_INFO("#Patches {}", rx.get_num_patches());
+
+    size_t   max_smem_bytes_dyn           = 0;
+    size_t   max_smem_bytes_static        = 0;
+    uint32_t max_num_registers_per_thread = 0;
+    uint32_t max_num_blocks               = 0;
+
+#if USE_POLYSCOPE
+    rx.render_vertex_patch();
+    rx.render_edge_patch();
+    rx.render_face_patch();
+    // polyscope::show();
+#endif
+
+    int num_passes = 0;
+
+    CUDA_ERROR(cudaProfilerStart());
+    GPUTimer timer;
+    timer.start();
+    while(rx.get_num_vertices(true) > final_num_vertices)
+    {
+        ++num_passes;
+
+        // Edge ids are packed into 32 bits with a 16-bit patch id (see
+        // unique_id32); more patches than that would mark the wrong edges.
+        ASSERT_LE(rx.get_num_patches(true), 1u << 16);
+
+        GPUTimer pq_timer;
+        pq_timer.start();
+
+        // rebuild every round? Not necessarily a great way to use a pq.
+        PriorityQueue_t pq(rx.get_num_edges());
+        e_pop_attr->reset(false, DEVICE);
+
+        rx.update_launch_box(
+            {Op::EV},
+            launch_box,
+            (void*)compute_edge_priorities<float, blockThreads>,
+            false, false, false, false,
+            [&](uint32_t v, uint32_t e, uint32_t f){
+              // Allocate enough additional memory
+              // for the priority queue and the intermediate
+              // array of PriorityPair_t.
+              return pq.get_shmem_size(blockThreads) + (e*sizeof(PriorityPair_t));
+            }
+        );
+
+        GPUTimer edge_priorities_timer;
+        edge_priorities_timer.start();
+        compute_edge_priorities<float, blockThreads>
+            <<<launch_box.blocks,
+               launch_box.num_threads,
+               launch_box.smem_bytes_dyn>>>( rx.get_context(), *coords, pq.get_mutable_device_view(), pq.get_shmem_size(blockThreads));
+        edge_priorities_timer.stop();
+        e_priority_time += edge_priorities_timer.elapsed_millis();
+
+        // Next kernel needs to pop some percentage of the top
+        // elements in the priority queue and store popped elements
+        // to be used by the next kernel that actually does the collapses
+
+        float reduce_ratio = edge_reduce_ratio;
+        const int num_edges_before = int(rx.get_num_edges());
+        const int reduce_threshold =
+            std::max(1, int(reduce_ratio * float(num_edges_before)));
+        // Mark the edge attributes to be collapsed
+        uint32_t pop_num_edges = reduce_threshold; //reduce_ratio * rx.get_num_edges();
+        //RXMESH_TRACE("pop_num_edges: {}", pop_num_edges);
+
+        constexpr uint32_t threads_per_block = 256;
+        uint32_t number_of_blocks = (pop_num_edges + threads_per_block - 1) / threads_per_block;
+        int shared_mem_bytes = pq.get_shmem_size(threads_per_block) +
+                               (threads_per_block * sizeof(PriorityPair_t));
+
+        GPUTimer pop_mark_timer;
+        pop_mark_timer.start();
+        pop_and_mark_edges_to_collapse<threads_per_block>
+            <<<number_of_blocks, threads_per_block, shared_mem_bytes>>>
+                (pq.get_mutable_device_view(),
+                 *e_pop_attr,
+                 pop_num_edges);
+
+        CUDA_ERROR(cudaDeviceSynchronize());
+        CUDA_ERROR(cudaGetLastError());
+        pop_mark_timer.stop();
+        pop_mark_time += pop_mark_timer.elapsed_millis();
+
+        pq_timer.stop();
+
+        pq_time += pq_timer.elapsed_millis();
+
+        // loop over the mesh, and try to collapse
+
+        rx.reset_scheduler();
+        while(!rx.is_queue_empty() &&
+              rx.get_num_vertices(true) > final_num_vertices)
+        {
+
+            //RXMESH_INFO(" Queue size = {}",
+            //            rx.get_context().m_patch_scheduler.size());
+
+            rx.update_launch_box(
+                {Op::EV},
+                launch_box,
+                (void*)secp<float, blockThreads>,
+                true, false, false, false,
+                [&](uint32_t v, uint32_t e, uint32_t f) {
+                    return detail::mask_num_bytes(e) +
+                           2 * detail::mask_num_bytes(v) +
+                           3 * ShmemAllocator::default_alignment;
+                }
+            );
+
+            max_smem_bytes_dyn =
+                std::max(max_smem_bytes_dyn, launch_box.smem_bytes_dyn);
+            max_smem_bytes_static =
+                std::max(max_smem_bytes_static, launch_box.smem_bytes_static);
+            max_num_registers_per_thread =
+                std::max(max_num_registers_per_thread,
+                         launch_box.num_registers_per_thread);
+            max_num_blocks =
+                std::max(max_num_blocks, DIVIDE_UP(launch_box.blocks, 8));
+            GPUTimer app_timer;
+
+            app_timer.start();
+            secp<float, blockThreads>
+                <<<DIVIDE_UP(launch_box.blocks, 8),
+                   launch_box.num_threads,
+                   launch_box.smem_bytes_dyn>>>(rx.get_context(),
+                                                *coords,
+                                                reduce_threshold,
+                                                *e_pop_attr);
+            // should we cudaDeviceSyn here? stopping timers too soon?
+            //CUDA_ERROR(cudaDeviceSynchronize());
+            //CUDA_ERROR(cudaGetLastError());
+
+            app_timer.stop();
+
+            GPUTimer cleanup_timer;
+            cleanup_timer.start();
+            rx.cleanup();
+            cleanup_timer.stop();
+
+            GPUTimer slice_timer;
+            slice_timer.start();
+            rx.slice_patches(*coords);
+            slice_timer.stop();
+
+            GPUTimer cleanup_timer2;
+            cleanup_timer2.start();
+            rx.cleanup();
+            cleanup_timer2.stop();
+
+
+            CUDA_ERROR(cudaDeviceSynchronize());
+            CUDA_ERROR(cudaGetLastError());
+
+            app_time += app_timer.elapsed_millis();
+            slice_time += slice_timer.elapsed_millis();
+            cleanup_time += cleanup_timer.elapsed_millis();
+            cleanup_time += cleanup_timer2.elapsed_millis();
+        }
+    }
+    timer.stop();
+    total_time += timer.elapsed_millis();
+    CUDA_ERROR(cudaProfilerStop());
+
+    RXMESH_INFO("secp_rxmesh() RXMesh SEC took {} (ms), num_passes= {}",
+                total_time,
+                num_passes);
+    RXMESH_INFO("secp_rxmesh() PriorityQ time {} (ms)", pq_time);
+    RXMESH_INFO("secp_rxmesh() |-Edge priorities time {} (ms)", e_priority_time);
+    RXMESH_INFO("secp_rxmesh() |-Pop and Mark time {} (ms)", pop_mark_time);
+    RXMESH_INFO("secp_rxmesh() App time {} (ms)", app_time);
+    RXMESH_INFO("secp_rxmesh() Slice timer {} (ms)", slice_time);
+    RXMESH_INFO("secp_rxmesh() Cleanup timer {} (ms)", cleanup_time);
+
+    RXMESH_INFO("#Vertices {}", rx.get_num_vertices(true));
+    RXMESH_INFO("#Edges {}", rx.get_num_edges(true));
+    RXMESH_INFO("#Faces {}", rx.get_num_faces(true));
+    RXMESH_INFO("#Patches {}", rx.get_num_patches(true));
+
+    rx.update_host();
+
+    coords->move(DEVICE, HOST);
+
+    report.add_member("num_passes", num_passes);
+    report.add_member("max_smem_bytes_dyn", max_smem_bytes_dyn);
+    report.add_member("max_smem_bytes_static", max_smem_bytes_static);
+    report.add_member("max_num_registers_per_thread",
+                      max_num_registers_per_thread);
+    report.add_member("max_num_blocks", max_num_blocks);
+    report.add_member("secp_remesh_time", total_time);
+    report.add_member("priority_queue_time", pq_time);
+    report.add_member("edge_priorities_time", e_priority_time);
+    report.add_member("pop_mark_time", pop_mark_time);
+    report.add_member("app_time", app_time);
+    report.add_member("slice_time", slice_time);
+    report.add_member("cleanup_time", cleanup_time);
+    report.add_member("attributes_memory_mg", coords->get_memory_mg());
+    report.model_data(Arg.obj_file_name + "_after", rx, "model_after");
+
+#if USE_POLYSCOPE
+    rx.update_polyscope();
+
+    auto ps_mesh = rx.get_polyscope_mesh();
+    ps_mesh->updateVertexPositions(*coords);
+    ps_mesh->setEnabled(false);
+
+    rx.render_vertex_patch();
+    rx.render_edge_patch();
+    rx.render_face_patch();
+    polyscope::show();
+#endif
+
+    report.write(Arg.output_folder + "/rxmesh_secp",
+                 "SECP_RXMesh_" + extract_file_name(Arg.obj_file_name));
+}
\ No newline at end of file
diff --git a/cmake/CPM.cmake b/cmake/CPM.cmake
new file mode 100644
index 00000000..d0fd0e8e
--- /dev/null
+++ b/cmake/CPM.cmake
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: MIT
+#
+# SPDX-FileCopyrightText: Copyright (c) 2019-2023 Lars Melchior and contributors
+
+set(CPM_DOWNLOAD_VERSION 0.39.0)
+set(CPM_HASH_SUM "66639bcac9dd2907b2918de466783554c1334446b9874e90d38e3778d404c2ef")
+
+if(CPM_SOURCE_CACHE)
+  set(CPM_DOWNLOAD_LOCATION "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+elseif(DEFINED ENV{CPM_SOURCE_CACHE})
+  set(CPM_DOWNLOAD_LOCATION "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+else()
+  set(CPM_DOWNLOAD_LOCATION "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
+endif()
+
+# Expand relative path. This is important if the provided path contains a tilde (~)
+get_filename_component(CPM_DOWNLOAD_LOCATION ${CPM_DOWNLOAD_LOCATION} ABSOLUTE)
+
+file(DOWNLOAD
+     https://github.com/cpm-cmake/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
+     ${CPM_DOWNLOAD_LOCATION} EXPECTED_HASH SHA256=${CPM_HASH_SUM}
+)
+
+include(${CPM_DOWNLOAD_LOCATION})
diff --git a/sweep_edgefrac_test.sh b/sweep_edgefrac_test.sh
new file mode 100755
index 00000000..9ffb5195
--- /dev/null
+++ b/sweep_edgefrac_test.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Start value
+start=0.001
+# End value
+end=0.9
+# Step value
+step=0.031
+
+# Command path
+command="./build/bin/SECPriority"
+# Input file
+input_file="./input/rocker-arm.obj"
+
+# Sweep -edgefrac over the range (C locale so seq prints '.' decimals)
+for edgefrac in $(LC_ALL=C seq "$start" "$step" "$end")
+do
+    echo "Running with edgefrac = $edgefrac"
+    "$command" -input "$input_file" -edgefrac "$edgefrac"
+done
+
diff --git a/sweep_test.sh b/sweep_test.sh
new file mode 100755
index 00000000..6d7ee81e
--- /dev/null
+++ b/sweep_test.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Start value
+start=0.001
+# End value
+end=0.9
+# Step value
+step=0.031
+
+# Command path
+command="./build/bin/SECPriority"
+# Input file
+input_file="./input/rocker-arm.obj"
+
+# Sweep -target over the range (C locale so seq prints '.' decimals)
+for target in $(LC_ALL=C seq "$start" "$step" "$end")
+do
+    echo "Running with target = $target"
+    "$command" -input "$input_file" -target "$target"
+done
+