From 04b008024684377973c64573c717e89c1daa8641 Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sat, 25 Nov 2023 18:21:19 +0300 Subject: [PATCH 01/12] jtagtap: Extract platform_delay_busy() * Copy the snippet verbatim from 10 identical places * Ask GCC to please inline it back (so codegen does not change) compromising 116 bytes of flash for less jumping around --- src/platforms/common/jtagtap.c | 38 ++++++++++++++++------------------ 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/platforms/common/jtagtap.c b/src/platforms/common/jtagtap.c index 16cf5f6edfb..468b1cc39a7 100644 --- a/src/platforms/common/jtagtap.c +++ b/src/platforms/common/jtagtap.c @@ -31,6 +31,7 @@ jtag_proc_s jtag_proc; static void jtagtap_reset(void); +static inline void platform_delay_busy(const uint32_t loops) __attribute__((always_inline)); static void jtagtap_tms_seq(uint32_t tms_states, size_t ticks); static void jtagtap_tdi_tdo_seq(uint8_t *data_out, bool final_tms, const uint8_t *data_in, size_t clock_cycles); static void jtagtap_tdi_seq(bool final_tms, const uint8_t *data_in, size_t clock_cycles); @@ -69,15 +70,20 @@ static void jtagtap_reset(void) jtagtap_soft_reset(); } +/* Busy-looping delay snippet for GPIO bitbanging (rely on inlining) */ +static inline void platform_delay_busy(const uint32_t loops) +{ + for (volatile uint32_t counter = loops; counter > 0; --counter) + continue; +} + static bool jtagtap_next_clk_delay() { gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); const uint16_t result = gpio_get(TDO_PORT, TDO_PIN); gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); return result != 0; } @@ -105,12 +111,10 @@ static void jtagtap_tms_seq_clk_delay(uint32_t tms_states, const size_t clock_cy const bool state = tms_states & 1U; gpio_set_val(TMS_PORT, TMS_PIN, state); gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); tms_states >>= 1U; gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } } @@ -153,8 +157,7 @@ static void jtagtap_tdi_tdo_seq_clk_delay( gpio_set_val(TDI_PORT, TDI_PIN, data_in[byte] & (1U << bit)); /* Start the clock cycle */ gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); /* If TDO is high, store a 1 in the appropriate position in the value being accumulated */ if (gpio_get(TDO_PORT, TDO_PIN)) value |= 1U << bit; @@ -164,8 +167,7 @@ static void jtagtap_tdi_tdo_seq_clk_delay( } /* Finish the clock cycle */ gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } /* If clock_cycles is not divisible by 8, we have some extra data to write back here. */ if (clock_cycles & 7U) { @@ -241,12 +243,10 @@ static void jtagtap_tdi_seq_clk_delay(const uint8_t *const data_in, const bool f /* Set up the TDI pin and start the clock cycle */ gpio_set_val(TDI_PORT, TDI_PIN, data_in[byte] & (1U << bit)); gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); /* Finish the clock cycle */ gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } } @@ -294,11 +294,9 @@ static void jtagtap_cycle_clk_delay(const size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_set(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); gpio_clear(TCK_PORT, TCK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } } From ea3e132db6c209ecf4c5df74f58927f6fd5e3b3f Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sat, 25 Nov 2023 18:37:08 +0300 Subject: [PATCH 02/12] swdptap: Extract platform_delay_busy() * Copy the snippet verbatim from 10 similar places, preserving the +1 in argument (turnaround/parity) for automatic 0 clock delay instead of checking against UINT32_MAX * Mark as static for GCC to inline it back (so codegen does not change) compromising 96 bytes of flash for less jumping around --- src/platforms/common/swdptap.c | 38 ++++++++++++++++------------------ 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c index 6742ba9fbe8..7256de916c7 100644 --- a/src/platforms/common/swdptap.c +++ b/src/platforms/common/swdptap.c @@ -40,6 +40,7 @@ typedef enum swdio_status_e { swd_proc_s swd_proc; +static inline void platform_delay_busy(const uint32_t loops) __attribute__((always_inline, optimize(3))); static void swdptap_turnaround(swdio_status_t dir) __attribute__((optimize(3))); static uint32_t swdptap_seq_in(size_t clock_cycles) __attribute__((optimize(3))); static bool swdptap_seq_in_parity(uint32_t *ret, size_t clock_cycles) __attribute__((optimize(3))); @@ -54,6 +55,13 @@ void swdptap_init(void) swd_proc.seq_out_parity = swdptap_seq_out_parity; } +/* Busy-looping delay snippet for GPIO bitbanging (rely on inlining) */ +static inline void platform_delay_busy(const uint32_t loops) +{ + for (volatile uint32_t counter = loops; counter > 0; --counter) + continue; +} + static void swdptap_turnaround(const swdio_status_t dir) { static swdio_status_t olddir = SWDIO_STATUS_FLOAT; @@ -71,12 +79,10 @@ static void swdptap_turnaround(const swdio_status_t dir) } else gpio_clear(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider + 1); gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider + 1); if (dir == SWDIO_STATUS_DRIVE) { gpio_clear(SWCLK_PORT, SWCLK_PIN); @@ -92,11 +98,9 @@ static uint32_t swdptap_seq_in_clk_delay(const size_t clock_cycles) for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_clear(SWCLK_PORT, SWCLK_PIN); value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U; - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } gpio_clear(SWCLK_PORT, SWCLK_PIN); return value; @@ -129,15 +133,13 @@ static uint32_t swdptap_seq_in(size_t clock_cycles) static bool swdptap_seq_in_parity(uint32_t *ret, size_t clock_cycles) { const uint32_t result = swdptap_seq_in(clock_cycles); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider + 1); const bool parity = calculate_odd_parity(result); const bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN); gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider + 1); *ret = result; /* Terminate the read cycle now */ @@ -152,11 +154,9 @@ static void swdptap_seq_out_clk_delay(const uint32_t tms_states, const size_t cl for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_clear(SWCLK_PORT, SWCLK_PIN); gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1 << cycle)); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider); } gpio_clear(SWCLK_PORT, SWCLK_PIN); } @@ -187,10 +187,8 @@ static void swdptap_seq_out_parity(const uint32_t tms_states, const size_t clock const bool parity = calculate_odd_parity(tms_states); swdptap_seq_out(tms_states, clock_cycles); gpio_set_val(SWDIO_PORT, SWDIO_PIN, parity); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider + 1); gpio_set(SWCLK_PORT, SWCLK_PIN); - for (volatile uint32_t counter = target_clk_divider + 1; counter > 0; --counter) - continue; + platform_delay_busy(target_clk_divider + 1); gpio_clear(SWCLK_PORT, SWCLK_PIN); } From 682349cb91c8d375c643997ee87b1b0c7cd68ed5 Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sat, 25 Nov 2023 18:13:55 +0300 Subject: [PATCH 03/12] jtagtap, swdptap: Replace platform_delay_busy() with a faster version * Use predecrement and check against zero, as before * Stop touching the stack with `volatile` variables, a register is sufficient * Use normal SysTick platform_delay() in jtagtap_reset(), which is both longer duration and does not depend on platform HCLK. --- src/platforms/common/jtagtap.c | 8 ++++---- src/platforms/common/swdptap.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/platforms/common/jtagtap.c b/src/platforms/common/jtagtap.c index 468b1cc39a7..251fb169e47 100644 --- a/src/platforms/common/jtagtap.c +++ b/src/platforms/common/jtagtap.c @@ -62,8 +62,8 @@ static void jtagtap_reset(void) #ifdef TRST_PORT if (platform_hwversion() == 0) { gpio_clear(TRST_PORT, TRST_PIN); - for (volatile size_t i = 0; i < 10000U; i++) - continue; + /* Hold low for approximately 1.5 ms */ + platform_delay(1U); /* Requires SysTick interrupt to be unblocked */ gpio_set(TRST_PORT, TRST_PIN); } #endif @@ -73,8 +73,8 @@ static void jtagtap_reset(void) /* Busy-looping delay snippet for GPIO bitbanging (rely on inlining) */ static inline void platform_delay_busy(const uint32_t loops) { - for (volatile uint32_t counter = loops; counter > 0; --counter) - continue; + for (register uint32_t counter = loops; --counter > 0U;) + __asm__(""); } static bool jtagtap_next_clk_delay() diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c index 7256de916c7..9bc74866377 100644 --- a/src/platforms/common/swdptap.c +++ b/src/platforms/common/swdptap.c @@ -58,8 +58,8 @@ void swdptap_init(void) /* Busy-looping delay snippet for GPIO bitbanging (rely on inlining) */ static inline void platform_delay_busy(const uint32_t loops) { - for (volatile uint32_t counter = loops; counter > 0; --counter) - continue; + for (register uint32_t counter = loops; --counter > 0U;) + __asm__(""); } static void swdptap_turnaround(const swdio_status_t dir) From e70d536ab5de242e7753521e7a80ecb0ea15c24c Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sat, 25 Nov 2023 19:45:54 +0300 Subject: [PATCH 04/12] timing_stm32: Copy the platform_delay_busy() here --- src/platforms/common/stm32/timing_stm32.c | 26 +++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/platforms/common/stm32/timing_stm32.c b/src/platforms/common/stm32/timing_stm32.c index d9bc5bcca40..6c4e3f392e6 100644 --- a/src/platforms/common/stm32/timing_stm32.c +++ b/src/platforms/common/stm32/timing_stm32.c @@ -199,3 +199,29 @@ uint32_t platform_max_frequency_get(void) return result; #endif } + +/* Busy-looping delay for GPIO bitbanging operations. SUBS+BNE.N take 4 cycles. */ +void platform_delay_busy(const uint32_t loops) +{ + /* Avoid using `volatile` variables which incur stack accesses */ +#if 0 + register uint32_t i = loops; + do { + /* + * A "tactical" single NOP takes 0-1 cycles on Cortex-M0/M3/M4/M7 + * and avoids DCE, but consumes 2 bytes of flash, amplified by static/inline. + * A normal `continue` in for/while/do-loops with no side-effects + * makes the whole loop disappear to DCE at higher than -O1. + */ + __asm__("nop"); + } while (--i > 0U); +#else + /* + * Another version which still assembles to SUBS+BNE.N; the NOP can be eliminated + * in favor of an empty inline asm/volatile block if it's enough to suppress DCE. + * Note that the predecrement has to be merged into the condition expression. + */ + for (register uint32_t i = loops; --i > 0U;) + __asm__(""); +#endif +} From 39673a078e6a546ca1c3e9347f4b61e67e4574dc Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sat, 25 Nov 2023 20:04:18 +0300 Subject: [PATCH 05/12] stm32/gpio: Drop double writes on set/reset since F4Discovery days --- src/platforms/common/stm32/gpio.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/platforms/common/stm32/gpio.h b/src/platforms/common/stm32/gpio.h index cd0f9707b40..fd078f1ceb6 100644 --- a/src/platforms/common/stm32/gpio.h +++ b/src/platforms/common/stm32/gpio.h @@ -29,11 +29,6 @@ static inline void bmp_gpio_set(const uint32_t gpioport, const uint16_t gpios) { /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ GPIO_BSRR(gpioport) = gpios; -#if defined(STM32F4) || defined(STM32F7) - /* FIXME: Check if doubling is still needed */ - /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ - GPIO_BSRR(gpioport) = gpios; -#endif } #define gpio_set bmp_gpio_set @@ -44,10 +39,6 @@ static inline void bmp_gpio_clear(const uint32_t gpioport, const uint16_t gpios) /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ GPIO_BRR(gpioport) = gpios; #else -#if defined(STM32F4) || defined(STM32F7) - /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ - GPIO_BSRR(gpioport) = gpios << 16U; -#endif /* NOLINTNEXTLINE(clang-diagnostic-int-to-pointer-cast) */ GPIO_BSRR(gpioport) = gpios << 16U; #endif From 1b4bbd9bb30791b34454c6e3b54e37d904c7a8fc Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sat, 25 Nov 2023 19:41:08 +0300 Subject: [PATCH 06/12] timing_stm32: Do not default clk_divider to 0; fix frequency_get() * Avoid a zero loops delay, which underflows in `platform_delay_busy()`, hanging the adapter --- src/platforms/common/stm32/timing_stm32.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/platforms/common/stm32/timing_stm32.c b/src/platforms/common/stm32/timing_stm32.c index 6c4e3f392e6..24eea82d38f 100644 --- a/src/platforms/common/stm32/timing_stm32.c +++ b/src/platforms/common/stm32/timing_stm32.c @@ -30,7 +30,7 @@ bool running_status = false; static volatile uint32_t time_ms = 0; -uint32_t target_clk_divider = 0; +uint32_t target_clk_divider = UINT32_MAX; static size_t morse_tick = 0; #if defined(PLATFORM_HAS_POWER_SWITCH) && defined(STM32F1) @@ -172,6 +172,11 @@ void platform_max_frequency_set(const uint32_t frequency) target_clk_divider = UINT32_MAX; return; } + /* A zero loops delay will underflow and hang in platform_delay_busy() */ + if (divisor == 0U) { + target_clk_divider = UINT32_MAX; + return; + } divisor /= 2U; target_clk_divider = divisor / (CYCLES_PER_CNT * frequency); if (target_clk_divider * (CYCLES_PER_CNT * frequency) < divisor) @@ -194,8 +199,10 @@ uint32_t platform_max_frequency_get(void) const uint32_t ratio = (target_clk_divider * BITBANG_DIVIDER_FACTOR) + BITBANG_DIVIDER_OFFSET; return rcc_ahb_frequency / ratio; #else + if (target_clk_divider == UINT32_MAX) + return rcc_ahb_frequency / USED_SWD_CYCLES; uint32_t result = rcc_ahb_frequency; - result /= USED_SWD_CYCLES + CYCLES_PER_CNT * target_clk_divider; + result /= USED_SWD_CYCLES + CYCLES_PER_CNT * target_clk_divider * 2U; return result; #endif } From ad0f9b36ec3b7e4b761bdec6526f6b8e0965f6a8 Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sat, 25 Nov 2023 19:46:25 +0300 Subject: [PATCH 07/12] timing_stm32: Update factor and offset for uncalibrated platforms * For STM32F4, dial in 12 & 4 as measured by LA and MSO. This became faster thanks to dropping double writes and volatile on-stack counters. * For STM32F0, dial in slightly bigger 16 & 6 because armv6-m codegen is different. * For STM32F1, dial in 18 & 4: less than 22 thanks to no volatile on-stack counters, and similar busy delay impact (armv7-m). * For everything else, keep the old numbers in hope someone measures it later. --- src/platforms/common/stm32/timing_stm32.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/platforms/common/stm32/timing_stm32.c b/src/platforms/common/stm32/timing_stm32.c index 24eea82d38f..69f6eae58cb 100644 --- a/src/platforms/common/stm32/timing_stm32.c +++ b/src/platforms/common/stm32/timing_stm32.c @@ -140,9 +140,23 @@ uint32_t platform_time_ms(void) * per delay loop count with 2 delay loops per clock */ +#if defined(STM32F4) +/* Values for STM32F411 at 96 MHz */ +#define USED_SWD_CYCLES 12 +#define CYCLES_PER_CNT 4 +#elif defined(STM32F1) /* Values for STM32F103 at 72 MHz */ +#define USED_SWD_CYCLES 18 +#define CYCLES_PER_CNT 4 +#elif defined(STM32F0) +/* Values for STM32F072 at 48 MHz */ +#define USED_SWD_CYCLES 16 +#define CYCLES_PER_CNT 6 +#else +/* Inherit defaults for other platforms (F3, F7) */ #define USED_SWD_CYCLES 22 #define CYCLES_PER_CNT 10 +#endif void platform_max_frequency_set(const uint32_t frequency) { From 12c40bcc2727d2b22e3eb4902f6d565a31e64579 Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sat, 25 Nov 2023 20:18:26 +0300 Subject: [PATCH 08/12] swdptap: Tweak `_no_delay()` timings for a more even duty cycle * Reference: jtagtap_tms_seq_no_delay(). Also add comments. * Use appropriate type for TMS state output. --- src/platforms/common/swdptap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c index 9bc74866377..2bee118b8fc 100644 --- a/src/platforms/common/swdptap.c +++ b/src/platforms/common/swdptap.c @@ -114,8 +114,10 @@ static uint32_t swdptap_seq_in_no_delay(const size_t clock_cycles) for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_clear(SWCLK_PORT, SWCLK_PIN); value |= gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN) ? 1U << cycle : 0U; + /* Reordering barrier (in place of a delay) to retain timings */ + __asm__("" ::: "memory"); gpio_set(SWCLK_PORT, SWCLK_PIN); - __asm__("nop"); + __asm__("" ::: "memory"); } gpio_clear(SWCLK_PORT, SWCLK_PIN); return value; @@ -166,9 +168,14 @@ static void swdptap_seq_out_no_delay(uint32_t tms_states, size_t clock_cycles) _ static void swdptap_seq_out_no_delay(const uint32_t tms_states, const size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { + const bool state = tms_states & (1U << cycle); + /* Block the compiler from re-ordering the TMS states calculation to preserve timings */ + __asm__("" ::: "memory"); gpio_clear(SWCLK_PORT, SWCLK_PIN); - gpio_set_val(SWDIO_PORT, SWDIO_PIN, tms_states & (1 << cycle)); + gpio_set_val(SWDIO_PORT, SWDIO_PIN, state); + __asm__("" ::: "memory"); gpio_set(SWCLK_PORT, SWCLK_PIN); + __asm__("" ::: "memory"); } gpio_clear(SWCLK_PORT, SWCLK_PIN); } From 4bff6d19c57a58e970963c57d97557203e7ccc37 Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sat, 25 Nov 2023 22:23:13 +0300 Subject: [PATCH 09/12] swdptap: Handle no_delay parity/turnaround as well * This allows dropping the 0-loops check from delay_busy --- src/platforms/common/swdptap.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/platforms/common/swdptap.c b/src/platforms/common/swdptap.c index 2bee118b8fc..91fcffa9450 100644 --- a/src/platforms/common/swdptap.c +++ b/src/platforms/common/swdptap.c @@ -79,10 +79,12 @@ static void swdptap_turnaround(const swdio_status_t dir) } else gpio_clear(SWCLK_PORT, SWCLK_PIN); - platform_delay_busy(target_clk_divider + 1); + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); gpio_set(SWCLK_PORT, SWCLK_PIN); - platform_delay_busy(target_clk_divider + 1); + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); if (dir == SWDIO_STATUS_DRIVE) { gpio_clear(SWCLK_PORT, SWCLK_PIN); @@ -135,13 +137,15 @@ static uint32_t swdptap_seq_in(size_t clock_cycles) static bool swdptap_seq_in_parity(uint32_t *ret, size_t clock_cycles) { const uint32_t result = swdptap_seq_in(clock_cycles); - platform_delay_busy(target_clk_divider + 1); + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); const bool parity = calculate_odd_parity(result); const bool bit = gpio_get(SWDIO_IN_PORT, SWDIO_IN_PIN); gpio_set(SWCLK_PORT, SWCLK_PIN); - platform_delay_busy(target_clk_divider + 1); + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); *ret = result; /* Terminate the read cycle now */ @@ -194,8 +198,12 @@ static void swdptap_seq_out_parity(const uint32_t tms_states, const size_t clock const bool parity = calculate_odd_parity(tms_states); swdptap_seq_out(tms_states, clock_cycles); gpio_set_val(SWDIO_PORT, SWDIO_PIN, parity); - platform_delay_busy(target_clk_divider + 1); + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); + gpio_set(SWCLK_PORT, SWCLK_PIN); - platform_delay_busy(target_clk_divider + 1); + if (target_clk_divider != UINT32_MAX) + platform_delay_busy(target_clk_divider); + gpio_clear(SWCLK_PORT, SWCLK_PIN); } From 1a39f13ccbcf75de101eb0f3182eedc3e0b098b5 Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sun, 26 Nov 2023 00:07:34 +0300 Subject: [PATCH 10/12] jtagtap: optimize _no_delay() paths at -O3 like swdptap * Also update signature of jtagtap_tms_seq() (s/ticks/clock_cycles/) --- src/platforms/common/jtagtap.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/platforms/common/jtagtap.c b/src/platforms/common/jtagtap.c index 251fb169e47..0d317efd189 100644 --- a/src/platforms/common/jtagtap.c +++ b/src/platforms/common/jtagtap.c @@ -32,11 +32,12 @@ jtag_proc_s jtag_proc; static void jtagtap_reset(void); static inline void platform_delay_busy(const uint32_t loops) __attribute__((always_inline)); -static void jtagtap_tms_seq(uint32_t tms_states, size_t ticks); -static void jtagtap_tdi_tdo_seq(uint8_t *data_out, bool final_tms, const uint8_t *data_in, size_t clock_cycles); -static void jtagtap_tdi_seq(bool final_tms, const uint8_t *data_in, size_t clock_cycles); -static bool jtagtap_next(bool tms, bool tdi); -static void jtagtap_cycle(bool tms, bool tdi, size_t clock_cycles); +static void jtagtap_tms_seq(uint32_t tms_states, size_t clock_cycles) __attribute__((optimize(3))); +static void jtagtap_tdi_tdo_seq(uint8_t *data_out, bool final_tms, const uint8_t *data_in, size_t clock_cycles) + __attribute__((optimize(3))); +static void jtagtap_tdi_seq(bool final_tms, const uint8_t *data_in, size_t clock_cycles) __attribute__((optimize(3))); +static bool jtagtap_next(bool tms, bool tdi) __attribute__((optimize(3))); +static void jtagtap_cycle(bool tms, bool tdi, size_t clock_cycles) __attribute__((optimize(3))); void jtagtap_init(void) { @@ -87,6 +88,8 @@ static bool jtagtap_next_clk_delay() return result != 0; } +static bool jtagtap_next_no_delay() __attribute__((optimize(3))); + static bool jtagtap_next_no_delay() { gpio_set(TCK_PORT, TCK_PIN); @@ -118,6 +121,9 @@ static void jtagtap_tms_seq_clk_delay(uint32_t tms_states, const size_t clock_cy } } +static void jtagtap_tms_seq_no_delay(uint32_t tms_states, const size_t clock_cycles) + __attribute__((noinline, optimize(3))); + static void jtagtap_tms_seq_no_delay(uint32_t tms_states, const size_t clock_cycles) { bool state = tms_states & 1U; @@ -134,13 +140,13 @@ static void jtagtap_tms_seq_no_delay(uint32_t tms_states, const size_t clock_cyc } } -static void jtagtap_tms_seq(const uint32_t tms_states, const size_t ticks) +static void jtagtap_tms_seq(const uint32_t tms_states, const size_t clock_cycles) { gpio_set(TDI_PORT, TDI_PIN); if (target_clk_divider != UINT32_MAX) - jtagtap_tms_seq_clk_delay(tms_states, ticks); + jtagtap_tms_seq_clk_delay(tms_states, clock_cycles); else - jtagtap_tms_seq_no_delay(tms_states, ticks); + jtagtap_tms_seq_no_delay(tms_states, clock_cycles); } static void jtagtap_tdi_tdo_seq_clk_delay( @@ -176,6 +182,9 @@ static void jtagtap_tdi_tdo_seq_clk_delay( } } +static void jtagtap_tdi_tdo_seq_no_delay(const uint8_t *const data_in, uint8_t *const data_out, const bool final_tms, + const size_t clock_cycles) __attribute__((optimize(3))); + static void jtagtap_tdi_tdo_seq_no_delay( const uint8_t *const data_in, uint8_t *const data_out, const bool final_tms, const size_t clock_cycles) { @@ -250,6 +259,9 @@ static void jtagtap_tdi_seq_clk_delay(const uint8_t *const data_in, const bool f } } +static void jtagtap_tdi_seq_no_delay(const uint8_t *const data_in, const bool final_tms, size_t clock_cycles) + __attribute__((optimize(3))); + static void jtagtap_tdi_seq_no_delay(const uint8_t *const data_in, const bool final_tms, size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles;) { @@ -300,6 +312,8 @@ static void jtagtap_cycle_clk_delay(const size_t clock_cycles) } } +static void jtagtap_cycle_no_delay(const size_t clock_cycles) __attribute__((optimize(3))); + static void jtagtap_cycle_no_delay(const size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { From d425df22da4a52eb8076645233bcdecb127727a8 Mon Sep 17 00:00:00 2001 From: ALTracer Date: Sun, 26 Nov 2023 00:47:22 +0300 Subject: [PATCH 11/12] jtagtap: Equalize _next & _cycle, replace line reset * SWD line reset needs 50 clocks with high TMS/SWDIO, this was achieved by looping over jtagtap_next() which incurs function call delays; * Use jtagtap_cycle() instead which seems appropriate here, and discards TDO anyways; * The adiv5_swd.c:swd_line_reset_sequence() counterpart seems less fitting to duplicate in jtagtap. * Annotate any new nops in hot no_delay path. --- src/platforms/common/jtagtap.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/platforms/common/jtagtap.c b/src/platforms/common/jtagtap.c index 0d317efd189..ab1bf81de15 100644 --- a/src/platforms/common/jtagtap.c +++ b/src/platforms/common/jtagtap.c @@ -53,9 +53,8 @@ void jtagtap_init(void) jtag_proc.tap_idle_cycles = 1; /* Ensure we're in JTAG mode */ - for (size_t i = 0; i <= 50U; ++i) - jtagtap_next(true, false); /* 50 + 1 idle cycles for SWD reset */ - jtagtap_tms_seq(0xe73cU, 16U); /* SWD to JTAG sequence */ + jtagtap_cycle(true, false, 51U); /* 50 + 1 idle cycles for SWD reset */ + jtagtap_tms_seq(0xe73cU, 16U); /* SWD to JTAG sequence */ } static void jtagtap_reset(void) @@ -93,8 +92,12 @@ static bool jtagtap_next_no_delay() __attribute__((optimize(3))); static bool jtagtap_next_no_delay() { gpio_set(TCK_PORT, TCK_PIN); + /* Stretch the clock high time */ + __asm__("nop" ::: "memory"); const uint16_t result = gpio_get(TDO_PORT, TDO_PIN); gpio_clear(TCK_PORT, TCK_PIN); + /* Stretch the clock low time */ + __asm__("nop" ::: "memory"); return result != 0; } @@ -318,8 +321,11 @@ static void jtagtap_cycle_no_delay(const size_t clock_cycles) { for (size_t cycle = 0; cycle < clock_cycles; ++cycle) { gpio_set(TCK_PORT, TCK_PIN); + /* Stretch the clock high time */ __asm__ volatile("nop" ::: "memory"); gpio_clear(TCK_PORT, TCK_PIN); + /* Stretch the clock low time */ + __asm__ volatile("nop" ::: "memory"); } } From 58a242508e969da9a8de459624f4ff05ad3a7275 Mon Sep 17 00:00:00 2001 From: ALTracer Date: Mon, 27 Nov 2023 23:35:23 +0300 Subject: [PATCH 12/12] jtagtap: Protect tdi_tdo_seq from writing into NULL data_out ...by redirecting to faster tdi_seq * Do not use nullability attributes yet --- src/platforms/common/jtagtap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/platforms/common/jtagtap.c b/src/platforms/common/jtagtap.c index ab1bf81de15..23866cbabb9 100644 --- a/src/platforms/common/jtagtap.c +++ b/src/platforms/common/jtagtap.c @@ -237,6 +237,9 @@ static void jtagtap_tdi_tdo_seq_no_delay( static void jtagtap_tdi_tdo_seq( uint8_t *const data_out, const bool final_tms, const uint8_t *const data_in, size_t clock_cycles) { + /* In case the callsite passes NULL for data_out, don't bother sampling TDO */ + if (!data_out) + return jtagtap_tdi_seq(final_tms, data_in, clock_cycles); gpio_clear(TMS_PORT, TMS_PIN); gpio_clear(TDI_PORT, TDI_PIN); if (target_clk_divider != UINT32_MAX)