diff --git a/src/driver/amdxdna/ve2_host_queue.h b/src/driver/amdxdna/ve2_host_queue.h
index f7feb7f99..25f5f90c7 100644
--- a/src/driver/amdxdna/ve2_host_queue.h
+++ b/src/driver/amdxdna/ve2_host_queue.h
@@ -117,8 +117,9 @@ struct ve2_hsa_queue {
 	struct hsa_queue *hsa_queue_p;
 	struct ve2_mem hsa_queue_mem;
 	struct ve2_hq_complete hq_complete;
-	/* hq_lock protects hsa_queue_p->hq_header->[read | write]_index */
+	/* hq_lock protects hq_header read/write_index and reserved_write_index */
 	struct mutex hq_lock;
+	u64 reserved_write_index;
 };
 
 /* handshake */
@@ -193,14 +194,16 @@ struct handshake {
 	u32 last_ddr_dm2mm_addr_low;	// 94
 	u32 last_ddr_mm2dm_addr_high;	// 98
 	u32 last_ddr_mm2dm_addr_low;	// 9c
-	struct { /* Hardware sync required */ // a0
+	/* Hardware sync required - offset 0xa0 */
+	struct {
 		u32 fw_state;
 		u32 abs_page_index; //absolute index of page where current control code are in
 		u32 ppc; // previous pc(relative to current page) drives current_job_context to NULL
 	} vm;
-	struct { /* Hardware sync required */ // ac
-		u32 ear; //exception address
+	/* Hardware sync required - offset 0xac */
+	struct {
+		u32 ear; /* exception address */
 		u32 esr; //exception status
 		u32 pc; //exception pc
 	}
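For review context: with reserved_write_index added, the ring is driven by three free-running u64 counters with the invariant read_index <= write_index <= reserved_write_index, and fullness becomes a plain subtraction because the counters never wrap in practice. The standalone C model below is illustrative only (the type and helper names are invented; this is not driver code). It also records one point worth confirming during review: the driver mixes "% capacity" in the new reserve/commit paths with "& (capacity - 1)" in hsa_queue_get_pkt(), and the two agree only when capacity is a power of two.

#include <stdbool.h>
#include <stdint.h>

struct ring_model {
	uint64_t read_index;		/* advanced by the consumer (CERT) */
	uint64_t write_index;		/* committed, visible to CERT */
	uint64_t reserved_write_index;	/* driver-private reservations */
	uint32_t capacity;
};

/* Full when every slot is either committed or merely reserved. */
static bool ring_full(const struct ring_model *r)
{
	return r->reserved_write_index - r->read_index >= r->capacity;
}

/* Map a free-running counter to a slot; matches the "% capacity" form. */
static uint32_t ring_slot(const struct ring_model *r, uint64_t idx)
{
	return (uint32_t)(idx % r->capacity);
}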
diff --git a/src/driver/amdxdna/ve2_hwctx.c b/src/driver/amdxdna/ve2_hwctx.c
index 4764ee204..af83683c6 100644
--- a/src/driver/amdxdna/ve2_hwctx.c
+++ b/src/driver/amdxdna/ve2_hwctx.c
@@ -65,39 +65,163 @@ static inline struct ve2_dpu_data *get_ve2_dpu_data_next(struct ve2_dpu_data *dp
 	return dpu_data + 1;
 }
 
-static int hsa_queue_reserve_slot(struct amdxdna_dev *xdna, struct amdxdna_ctx_priv *priv,
-				  u64 *slot)
+/*
+ * ve2_check_slot_available - Check if a queue slot is available
+ * @hwctx: Hardware context
+ *
+ * Returns true if the next slot to be reserved is free, false otherwise.
+ * This is used as the condition for wait_event_interruptible_timeout.
+ */
+static bool ve2_check_slot_available(struct amdxdna_ctx *hwctx)
 {
+	struct amdxdna_ctx_priv *priv = hwctx->priv;
 	struct ve2_hsa_queue *queue = &priv->hwctx_hsa_queue;
 	struct host_queue_header *header = &queue->hsa_queue_p->hq_header;
 	u32 capacity = header->capacity;
+	enum ert_cmd_state state;
+	u64 outstanding;
+	bool available;
 	u32 slot_idx;
 
 	mutex_lock(&queue->hq_lock);
-	if (header->write_index < header->read_index) {
-		XDNA_ERR(xdna, "HSA Queue read %llx before write %llx",
-			 header->read_index, header->write_index);
+	outstanding = queue->reserved_write_index - header->read_index;
+	if (outstanding >= capacity) {
 		mutex_unlock(&queue->hq_lock);
-		return -EINVAL;
-	} else if ((header->write_index - header->read_index) < capacity) {
-		slot_idx = header->write_index % capacity;
-		enum ert_cmd_state state = queue->hq_complete.hqc_mem[slot_idx];
+		return false;
+	}
 
-		if (state != ERT_CMD_STATE_INVALID) {
-			/* Slot is still active */
-			mutex_unlock(&queue->hq_lock);
-			return -EAGAIN;
-		}
-		*slot = header->write_index++;
-		XDNA_DBG(xdna, "slot %lld", *slot);
-	} else {
-		/* Queue is full - return EAGAIN without error print (expected during retries) */
+	/*
+	 * Also check that the next slot to be reserved is actually available.
+	 * A slot is available when in INVALID state (set by ve2_hwctx_job_release
+	 * after job completion, or zero-initialized for fresh slots).
+	 */
+	slot_idx = queue->reserved_write_index % capacity;
+	state = queue->hq_complete.hqc_mem[slot_idx];
+	available = (state == ERT_CMD_STATE_INVALID);
 	mutex_unlock(&queue->hq_lock);
+
+	return available;
+}
+
+/*
+ * ve2_wait_for_retry_slot - Wait for a queue slot to become available
+ * @hwctx: Hardware context
+ * @timeout_ms: Maximum time to wait in milliseconds
+ *
+ * This function uses wait_event_interruptible_timeout to sleep until
+ * a slot becomes available. The IRQ handler will wake us up when
+ * commands complete and slots are freed.
+ *
+ * Returns:
+ * 0 on success (slot available)
+ * -ETIMEDOUT if the timeout expired
+ * negative error code if interrupted
+ */
+static int ve2_wait_for_retry_slot(struct amdxdna_ctx *hwctx, u32 timeout_ms)
+{
+	struct amdxdna_ctx_priv *priv = hwctx->priv;
+	unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
+	int ret;
+
+	ret = wait_event_interruptible_timeout(priv->waitq,
+					       ve2_check_slot_available(hwctx),
+					       timeout_jiffies);
+
+	if (ret == 0)
+		return -ETIMEDOUT;
+	if (ret < 0)
+		return ret; /* Interrupted */
+
+	return 0;
+}
+
+static struct host_queue_packet *
+hsa_queue_reserve_slot(struct amdxdna_dev *xdna, struct amdxdna_ctx_priv *priv, u64 *slot)
+{
+	struct ve2_hsa_queue *queue = &priv->hwctx_hsa_queue;
+	struct host_queue_header *header = &queue->hsa_queue_p->hq_header;
+	u32 capacity = header->capacity;
+	enum ert_cmd_state state;
+	u32 slot_idx;
+	u64 outstanding;
+
+	mutex_lock(&queue->hq_lock);
+
+	/* Check against reserved_write_index to account for in-flight reservations. */
+	if (queue->reserved_write_index < header->read_index) {
+		XDNA_ERR(xdna, "HSA Queue: reserved_write_index(%llu) < read_index(%llu)",
+			 queue->reserved_write_index, header->read_index);
+		mutex_unlock(&queue->hq_lock);
+		return NULL;
+	}
+
+	outstanding = queue->reserved_write_index - header->read_index;
+	if (outstanding >= capacity) {
+		/* Use DBG level - expected during high queue utilization */
+		XDNA_DBG(xdna, "HSA Queue full: outstanding=%llu >= capacity=%u",
+			 outstanding, capacity);
 		mutex_unlock(&queue->hq_lock);
-		return -EAGAIN;
+		return ERR_PTR(-EBUSY);
+	}
+
+	slot_idx = queue->reserved_write_index % capacity;
+	state = queue->hq_complete.hqc_mem[slot_idx];
+
+	/*
+	 * Slot can only be reused when it's in INVALID state, which is set by
+	 * ve2_hwctx_job_release() after the job is fully released from the
+	 * pending array. Note: ERT_CMD_STATE_INVALID == 0, so this also covers
+	 * zero-initialized slots. This ensures the pending array slot is free
+	 * before we reserve the HSA queue slot.
+	 */
+	if (state != ERT_CMD_STATE_INVALID) {
+		XDNA_DBG(xdna, "Slot %u is still in use with state %u", slot_idx, state);
+		mutex_unlock(&queue->hq_lock);
+		return ERR_PTR(-EBUSY);
+	}
+
+	/* Reserve this slot by incrementing reserved_write_index. */
+	*slot = queue->reserved_write_index++;
+	queue->hq_complete.hqc_mem[slot_idx] = ERT_CMD_STATE_NEW;
+
+	mutex_unlock(&queue->hq_lock);
+
+	/* Return the packet pointer. The caller can now prepare the packet in parallel. */
+	return &queue->hsa_queue_p->hq_entry[slot_idx];
+}
+
+/*
+ * Commit the prepared packet by updating write_index once all prior slots
+ * are ready. This ensures CERT sees packets in order even if they were
+ * prepared out-of-order.
+ */
+static void hsa_queue_commit_slot(struct amdxdna_dev *xdna, struct amdxdna_ctx_priv *priv,
+				  u64 slot)
+{
+	struct ve2_hsa_queue *queue = &priv->hwctx_hsa_queue;
+	struct host_queue_header *header = &queue->hsa_queue_p->hq_header;
+	u32 capacity = header->capacity;
+	u32 slot_idx = slot % capacity;
+	struct host_queue_packet *pkt = &queue->hsa_queue_p->hq_entry[slot_idx];
+
+	mutex_lock(&queue->hq_lock);
+	/* Set the packet type to valid so CERT can process it */
+	pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC;
+
+	/* Mark this slot as ready in driver tracking */
+	queue->hq_complete.hqc_mem[slot_idx] = ERT_CMD_STATE_SUBMITTED;
+
+	/* Advance write_index as far as possible through all ready slots. */
+	while (header->write_index < queue->reserved_write_index) {
+		u32 next_idx = header->write_index % capacity;
+		enum ert_cmd_state state = queue->hq_complete.hqc_mem[next_idx];
+
+		if (state != ERT_CMD_STATE_SUBMITTED)
+			break;
+
+		header->write_index++;
 	}
 	mutex_unlock(&queue->hq_lock);
-	return 0;
 }
 
 static void ve2_job_release(struct kref *ref)
@@ -221,45 +345,28 @@ static inline struct host_queue_packet *hsa_queue_get_pkt(struct hsa_queue *queu
 	return &queue->hq_entry[slot & (queue->hq_header.capacity - 1)];
 }
 
-static inline int hsa_queue_pkt_is_valid(struct host_queue_packet *pkt)
-{
-	return pkt->xrt_header.common_header.type == HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC;
-}
-
-static void *get_host_queue_pkt(struct amdxdna_ctx *hwctx, u64 *seq)
+static void *get_host_queue_pkt(struct amdxdna_ctx *hwctx, u64 *seq, int *err)
 {
 	struct amdxdna_dev *xdna = hwctx->client->xdna;
-	struct hsa_queue *queue = NULL;
 	struct host_queue_packet *pkt;
-	int ret;
 
-	ret = hsa_queue_reserve_slot(xdna, hwctx->priv, seq);
-	if (ret) {
+	pkt = hsa_queue_reserve_slot(xdna, hwctx->priv, seq);
+	if (IS_ERR(pkt)) {
+		*err = PTR_ERR(pkt);
 		/* Expected during retry - use DBG level */
-		XDNA_DBG(xdna, "No slot available in Host queue");
+		XDNA_DBG(xdna, "No slot available in Host queue (err=%d)", *err);
 		return NULL;
 	}
-
-	queue = (struct hsa_queue *)hwctx->priv->hwctx_hsa_queue.hsa_queue_p;
-	if (!queue) {
-		XDNA_ERR(xdna, "Invalid Host queue");
-		return NULL;
-	}
-
-	pkt = hsa_queue_get_pkt(queue, *seq);
-	if (hsa_queue_pkt_is_valid(pkt)) {
-		XDNA_ERR(xdna, "pkt of slot %llx is already selected", *seq);
+	if (!pkt) {
+		*err = -EINVAL;
+		XDNA_ERR(xdna, "Failed to get host queue packet");
 		return NULL;
 	}
 
+	*err = 0;
 	return pkt;
 }
 
-static inline void hsa_queue_pkt_set_valid(struct host_queue_packet *pkt)
-{
-	pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_VENDOR_SPECIFIC;
-}
-
 static inline void hsa_queue_pkt_set_invalid(struct host_queue_packet *pkt)
 {
 	pkt->xrt_header.common_header.type = HOST_QUEUE_PACKET_TYPE_INVALID;
@@ -386,6 +493,8 @@ static int ve2_create_host_queue(struct amdxdna_dev *xdna, struct ve2_hsa_queue
 
 	/* Initialize mutex here */
 	mutex_init(&queue->hq_lock);
+	/* Initialize reserved_write_index to track slot reservations */
+	queue->reserved_write_index = 0;
 
 	/* Set the base DMA address for hsa queue */
 	queue->hsa_queue_mem.dma_addr = dma_handle;
@@ -437,20 +546,20 @@ static int submit_command_indirect(struct amdxdna_ctx *hwctx, void *cmd_data, u6
 	struct ve2_dpu_data *dpu;
 	struct hsa_queue *queue;
 	u64 slot_id = 0;
-	int ret;
 
 	dpu = (struct ve2_dpu_data *)cmd_data;
-	ret = hsa_queue_reserve_slot(xdna, ve2_ctx, &slot_id);
-	if (ret)
-		return ret;
+	pkt = hsa_queue_reserve_slot(xdna, ve2_ctx, &slot_id);
+	if (IS_ERR(pkt)) {
+		XDNA_DBG(xdna, "No slot available in Host queue");
+		return PTR_ERR(pkt);
+	}
+	if (!pkt) {
+		XDNA_ERR(xdna, "Failed to reserve slot");
+		return -EINVAL;
+	}
 
 	hq_queue = (struct ve2_hsa_queue *)&ve2_ctx->hwctx_hsa_queue;
 	queue = (struct hsa_queue *)hq_queue->hsa_queue_p;
-	pkt = hsa_queue_get_pkt(queue, slot_id);
-	if (hsa_queue_pkt_is_valid(pkt)) {
-		XDNA_ERR(xdna, "pkt of slot %llx is already selected", slot_id);
-		return -EINVAL;
-	}
 	*seq = slot_id;
 	XDNA_DBG(xdna, "slot %llx is selected", slot_id);
@@ -514,7 +623,8 @@ static int submit_command_indirect(struct amdxdna_ctx *hwctx, void *cmd_data, u6
 	if (verbosity >= VERBOSITY_LEVEL_DBG)
 		packet_dump(xdna, queue, slot_id);
 
-	hsa_queue_pkt_set_valid(pkt);
+	/* Commit the slot - this sets hqc_mem to SUBMITTED and advances write_index */
+	hsa_queue_commit_slot(xdna, ve2_ctx, *seq);
 
 	return 0;
 }
@@ -529,27 +639,24 @@ static int submit_command(struct amdxdna_ctx *hwctx, void *cmd_data, u64 *seq, b
 	struct host_queue_packet *pkt;
 	struct exec_buf *ebp;
 	u64 slot_id = 0;
+	int err;
 
 	if (!cmd_data) {
 		XDNA_ERR(xdna, "Invalid command requested");
 		return -EINVAL;
 	}
 
-	pkt = (struct host_queue_packet *)get_host_queue_pkt(hwctx, &slot_id);
+	pkt = (struct host_queue_packet *)get_host_queue_pkt(hwctx, &slot_id, &err);
 	if (!pkt) {
 		/* Expected during retry - use DBG level */
-		XDNA_DBG(xdna, "Getting host queue packet failed");
-		return -EINVAL;
+		XDNA_DBG(xdna, "Getting host queue packet failed (err=%d)", err);
+		return err;
	}
 
 	*seq = slot_id;
 	XDNA_DBG(xdna, "pkt %p of slot %llx is selected", (void *)pkt, slot_id);
 	slot_id = slot_id & (hq_queue->hsa_queue_p->hq_header.capacity - 1);
 
-	mutex_lock(&hq_queue->hq_lock);
-	hq_queue->hq_complete.hqc_mem[slot_id] = ERT_CMD_STATE_NEW;
-	mutex_unlock(&hq_queue->hq_lock);
-
 	hdr = &pkt->xrt_header;
 	hdr->common_header.opcode = HOST_QUEUE_PACKET_EXEC_BUF;
 	hdr->common_header.chain_flag = last_cmd ? LAST_CMD : NOT_LAST_CMD;
@@ -572,10 +679,8 @@ static int submit_command(struct amdxdna_ctx *hwctx, void *cmd_data, u64 *seq, b
 	ebp->args_host_addr_high = 0;
 
 	XDNA_DBG(xdna, "dpu instruction addr: 0x%llx", dpu_cmd->instruction_buffer);
-	mutex_lock(&hq_queue->hq_lock);
-	hq_queue->hq_complete.hqc_mem[slot_id] = ERT_CMD_STATE_SUBMITTED;
-	mutex_unlock(&hq_queue->hq_lock);
-	hsa_queue_pkt_set_valid(pkt);
+	/* Commit the slot - this sets hqc_mem to SUBMITTED and advances write_index */
+	hsa_queue_commit_slot(xdna, ve2_ctx, *seq);
 
 	return 0;
 }
@@ -594,26 +699,61 @@ static int ve2_submit_cmd_single(struct amdxdna_ctx *hwctx, struct amdxdna_sched
 		return -EINVAL;
 	}
 
-	if (get_ve2_dpu_data_next(cmd_data))
-		ret = submit_command_indirect(hwctx, cmd_data, seq, true);
-	else
-		ret = submit_command(hwctx, cmd_data, seq, true);
+	while (true) {
+		if (get_ve2_dpu_data_next(cmd_data))
+			ret = submit_command_indirect(hwctx, cmd_data, seq, true);
+		else
+			ret = submit_command(hwctx, cmd_data, seq, true);
+
+		if (ret != -EBUSY)
+			break;
+
+		XDNA_DBG(xdna, "Queue full, waiting for slot to become available (IRQ-driven)");
+
+		ret = ve2_wait_for_retry_slot(hwctx, VE2_RETRY_TIMEOUT_MS);
+		if (ret == -ETIMEDOUT) {
+			XDNA_DBG(xdna, "Submit timeout: no slot available after %ums",
+				 VE2_RETRY_TIMEOUT_MS);
+			return -EAGAIN;
+		} else if (ret < 0) {
+			XDNA_ERR(xdna, "Submit interrupted while waiting for slot");
+			return ret;
+		}
+
+		XDNA_DBG(xdna, "Slot available, retrying single command submission");
+	}
+
 	if (ret) {
-		/* Expected during retry - use DBG level */
-		XDNA_DBG(xdna, "Submit single command failed, error %d", ret);
+		XDNA_ERR(xdna, "Submit single command failed, error %d", ret);
 		return ret;
 	}
 
 	return ve2_hwctx_add_job(hwctx, job, *seq, 1);
 }
 
-static int ve2_submit_cmd_chain(struct amdxdna_ctx *hwctx, struct amdxdna_sched_job *job, u64 *seq)
+/*
+ * ve2_submit_cmd_chain_partial - Submit commands from a chain starting at start_idx
+ * @hwctx: Hardware context
+ * @job: Job containing the command chain
+ * @start_idx: Index to start submitting from
+ * @seq: Output sequence number (set to the last submitted command's slot)
+ * @submitted_count: Output count of successfully submitted commands
+ *
+ * Returns:
+ * 0 on success (all remaining commands submitted)
+ * -EBUSY if the queue became full (partial submission, check submitted_count)
+ * Other negative error codes on failure
+ */
+static int ve2_submit_cmd_chain_partial(struct amdxdna_ctx *hwctx, struct amdxdna_sched_job *job,
+					u32 start_idx, u64 *seq, u32 *submitted_count)
 {
 	struct amdxdna_dev *xdna = hwctx->client->xdna;
 	struct amdxdna_gem_obj *cmd_bo = job->cmd_bo;
 	struct amdxdna_cmd_chain *cmd_chain;
 	u32 cmd_chain_len;
-	int ret;
+	int ret = 0;
+
+	*submitted_count = 0;
 
 	cmd_chain = amdxdna_cmd_get_payload(cmd_bo, &cmd_chain_len);
 	if (!cmd_chain || cmd_chain_len < struct_size(cmd_chain, data, cmd_chain->command_count)) {
@@ -621,7 +761,7 @@ static int ve2_submit_cmd_chain(struct amdxdna_ctx *hwctx, struct amdxdna_sched_
 		return -EINVAL;
 	}
 
-	for (int i = 0; i < cmd_chain->command_count; i++) {
+	for (u32 i = start_idx; i < cmd_chain->command_count; i++) {
 		u32 boh = (u32)(cmd_chain->data[i]);
 		struct amdxdna_gem_obj *abo;
 		bool last_cmd = false;
@@ -647,15 +787,83 @@ static int ve2_submit_cmd_chain(struct amdxdna_ctx *hwctx, struct amdxdna_sched_
 			ret = submit_command_indirect(hwctx, cmd_data, seq, last_cmd);
 		else
 			ret = submit_command(hwctx, cmd_data, seq, last_cmd);
-		if (ret) {
-			/* Expected during retry - use DBG level */
-			XDNA_DBG(xdna, "Submit chain command(%d/%d) failed, error %d", i,
+
+		amdxdna_gem_put_obj(abo);
+
+		if (ret == -EBUSY) {
+			/* Queue full - return with partial count for retry */
+			XDNA_DBG(xdna, "Queue full at cmd %u/%u", i, cmd_chain->command_count);
+			return -EBUSY;
+		} else if (ret) {
+			XDNA_ERR(xdna, "Submit chain command(%u/%u) failed, error %d", i,
 				 cmd_chain->command_count, ret);
-			amdxdna_gem_put_obj(abo);
 			return ret;
 		}
-		amdxdna_gem_put_obj(abo);
+		(*submitted_count)++;
+	}
+
+	return 0;
+}
+
+static int ve2_submit_cmd_chain(struct amdxdna_ctx *hwctx, struct amdxdna_sched_job *job, u64 *seq)
+{
+	struct amdxdna_dev *xdna = hwctx->client->xdna;
+	struct amdxdna_gem_obj *cmd_bo = job->cmd_bo;
+	struct amdxdna_cmd_chain *cmd_chain;
+	u32 total_submitted = 0;
+	u32 submitted_count = 0;
+	u32 start_idx = 0;
+	int ret;
+
+	cmd_chain = amdxdna_cmd_get_payload(cmd_bo, NULL);
+	if (!cmd_chain) {
+		XDNA_ERR(xdna, "Invalid command chain");
+		return -EINVAL;
+	}
+
+	while (start_idx < cmd_chain->command_count) {
+		ret = ve2_submit_cmd_chain_partial(hwctx, job, start_idx, seq, &submitted_count);
+
+		if (ret == 0) {
+			total_submitted += submitted_count;
+			break;
+		} else if (ret == -EBUSY) {
+			total_submitted += submitted_count;
+			start_idx += submitted_count;
+
+			XDNA_DBG(xdna,
+				 "Queue full at cmd %u/%u, waiting for slot (IRQ-driven)",
+				 start_idx, cmd_chain->command_count);
+
+			ret = ve2_wait_for_retry_slot(hwctx, VE2_RETRY_TIMEOUT_MS);
+			if (ret == -ETIMEDOUT) {
+				XDNA_DBG(xdna,
+					 "Submit chain timeout: no slot available after %ums (%u/%u cmds done)",
+					 VE2_RETRY_TIMEOUT_MS, total_submitted,
+					 cmd_chain->command_count);
+				if (total_submitted > 0) {
+					ve2_hwctx_add_job(hwctx, job, *seq, total_submitted);
+					amdxdna_cmd_set_state(cmd_bo, ERT_CMD_STATE_TIMEOUT);
+				}
+				return -EAGAIN;
+			} else if (ret < 0) {
+				XDNA_ERR(xdna, "Submit chain interrupted while waiting for slot");
+				if (total_submitted > 0)
+					ve2_hwctx_add_job(hwctx, job, *seq, total_submitted);
+				return ret;
+			}
+
+			XDNA_DBG(xdna,
+				 "Slot available, retrying chain submission from cmd %u/%u",
+				 start_idx, cmd_chain->command_count);
+		} else {
+			XDNA_ERR(xdna, "Submit chain failed with error %d (%u/%u cmds done)",
+				 ret, total_submitted, cmd_chain->command_count);
+			if (total_submitted > 0)
+				ve2_hwctx_add_job(hwctx, job, *seq, total_submitted);
+			return ret;
+		}
 	}
 
 	return ve2_hwctx_add_job(hwctx, job, *seq, cmd_chain->command_count);
@@ -686,9 +894,9 @@ int ve2_cmd_submit(struct amdxdna_ctx *hwctx, struct amdxdna_sched_job *job, u32
 
 	ret = ve2_submit_cmd_single(hwctx, job, seq);
 	if (ret) {
-		/* Caller expecting this return value for retry. */
+		/* Return -ERESTARTSYS for -EAGAIN so userspace can retry */
 		if (ret == -EAGAIN) {
-			XDNA_DBG(xdna, "Failed to submit a command (retry expected)\n");
+			XDNA_ERR(xdna, "Failed to submit a command (timed out waiting for a slot)\n");
 			return -ERESTARTSYS;
 		}
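One cross-file dependency worth calling out: ve2_wait_for_retry_slot() only wakes early if the completion path both recycles the slot state and wakes priv->waitq. The comments above reference ve2_hwctx_job_release() for this, but neither that function nor the IRQ handler is part of this diff, so the sketch below shows only the expected shape; the signature and the choice of wake primitive are assumptions, not the driver's actual code.

static void ve2_hwctx_job_release(struct amdxdna_ctx *hwctx, u64 slot)
{
	struct ve2_hsa_queue *queue = &hwctx->priv->hwctx_hsa_queue;
	u32 slot_idx = slot % queue->hsa_queue_p->hq_header.capacity;

	mutex_lock(&queue->hq_lock);
	/* ERT_CMD_STATE_INVALID (== 0) is what ve2_check_slot_available() tests for */
	queue->hq_complete.hqc_mem[slot_idx] = ERT_CMD_STATE_INVALID;
	mutex_unlock(&queue->hq_lock);

	/* Wake all submitters sleeping in ve2_wait_for_retry_slot() */
	wake_up_interruptible_all(&hwctx->priv->waitq);
}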
diff --git a/src/driver/amdxdna/ve2_of.h b/src/driver/amdxdna/ve2_of.h
index b60b0218a..fc47fb5d5 100644
--- a/src/driver/amdxdna/ve2_of.h
+++ b/src/driver/amdxdna/ve2_of.h
@@ -17,6 +17,16 @@
 
 #define VERBOSITY_LEVEL_DBG 2
 
+/*
+ * VE2_RETRY_TIMEOUT_MS - Total timeout for command retry attempts
+ *
+ * This is the maximum time we'll wait for a queue slot to become available
+ * when the hardware queue is full. The wait is event-driven (IRQ wakes us
+ * when a slot frees up), not polling-based, so this timeout is only hit
+ * if the hardware becomes unresponsive.
+ */
+#define VE2_RETRY_TIMEOUT_MS 5000
+
 #define aie_calc_part_id(start_col, num_col) \
 	(((start_col) << AIE_PART_ID_START_COL_SHIFT) + \
 	 ((num_col) << AIE_PART_ID_NUM_COLS_SHIFT))
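Finally, the user-visible contract after this series: a full queue now blocks inside the kernel for up to VE2_RETRY_TIMEOUT_MS, and only a timeout (surfaced as -EAGAIN via the -ERESTARTSYS mapping above) or a signal reaches the caller. The userspace sketch below is illustrative only; the fd, request code, and args stand in for the real DRM submit ioctl, and the retry policy shown is an example rather than anything mandated by the driver.

#include <errno.h>
#include <sys/ioctl.h>

static int submit_with_retry(int fd, unsigned long request, void *args)
{
	int ret;

	do {
		ret = ioctl(fd, request, args);
		/*
		 * -ERESTARTSYS surfaces as EINTR when a signal handler ran
		 * (or the kernel restarts the call transparently); EAGAIN
		 * means the in-kernel slot wait timed out after
		 * VE2_RETRY_TIMEOUT_MS.
		 */
	} while (ret == -1 && (errno == EINTR || errno == EAGAIN));

	return ret;
}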