Optimizing IRQ latency on the STM32H743 @ 480 MHz, perhaps for NES ROM emulation... Best result so far: 100 nanoseconds input-to-output latency when the vector table and the IRQ handler are relocated to Tightly-Coupled Memory without making HAL calls. Not bad, but the GPIO controller (several buses away) looks like the real performance killer here.

STM32 development board with an STM32H743VI microcontroller. Oscilloscope trace of a rising-edge trigger and the STM32's GPIO pulse (25 ns); the latency is 97.60 ns.

/* Define output sections */
SECTIONS
{
  /* The .isr_vector input section (interrupt vector table) is linked to run
     from ITCMRAM but is stored in FLASH; relocate_to_itcm() copies it from
     its load address to its run address. */
  .isr_vector :
  {
    . = ALIGN(4);
    __isr_vector_start = .; /* run address of the first byte of the section */
    KEEP(*(.isr_vector)) /* KEEP: never garbage-collect this section */
    . = ALIGN(4);
    __isr_vector_end = .; /* run address one past the last byte */
  } >ITCMRAM AT> FLASH

  /* Code placed in .itcm_text (via __attribute__((section(".itcm_text"))))
     is linked to run from ITCMRAM but is stored in FLASH;
     relocate_to_itcm() copies it from its load address to its run address. */
  .itcm_text :
  {
    . = ALIGN(4);
    __itcm_text_start = .; /* run address of the first byte of the section */
    *(.itcm_text)
    *(.itcm_text*)
    . = ALIGN(4);
    __itcm_text_end = .; /* run address one past the last byte */
  } > ITCMRAM AT> FLASH

  /* used by the startup to initialize data */
  _si_isr_vector = LOADADDR(.isr_vector);
  _si_itcm_text = LOADADDR(".itcm_text");

# Basic compiler flags
# Optimize at -O2 with link-time optimization (-flto), enable the common
# warning sets, and compile as strict ISO C99 (-std=c99 -pedantic).
CFLAGS  = -O2 -flto -Wall -Wextra -std=c99 -pedantic
# Preprocessor macros defined for every translation unit; presumably they
# select the STM32H743 device headers and enable the ST HAL -- confirm
# against the HAL headers.
CFLAGS += -DSTM32H743xx -DUSE_HAL_DRIVER

# debug symbols? Uncomment the next line to build with them.
#CFLAGS += -g -ggdb

# If this macro is set, system_stm32h7xx.c overwrites the interrupt
# vector table address (VTOR register) to a custom address.
CFLAGS += -DUSER_VECT_TAB_ADDRESS=0x00000000

/*
 * Copy the interrupt vector table (.isr_vector) and the code placed in
 * .itcm_text from their load addresses to their run addresses.
 *
 * All addresses come from linker-script symbols:
 *   _si_isr_vector / _si_itcm_text          load address of each section
 *   __isr_vector_start / __isr_vector_end   run-address range of .isr_vector
 *   __itcm_text_start / __itcm_text_end     run-address range of .itcm_text
 *
 * The copies are byte-wise loops through volatile pointers rather than
 * memcpy().
 * NOTE(review): the destination is presumably ITCM at address 0x00000000
 * (see USER_VECT_TAB_ADDRESS), where a compiler may treat the pointer as
 * NULL and mis-optimize a memcpy() -- confirm before changing the loops.
 *
 * With SEMIHOSTING defined, each relocation is logged with printf().
 */
void relocate_to_itcm(void)
{
        extern volatile char _si_isr_vector;
        extern volatile char __isr_vector_start, __isr_vector_end;

        const volatile char *flash_isr_vector_start = &_si_isr_vector;
        volatile char *ram_isr_vector_start = &__isr_vector_start;
        volatile char *ram_isr_vector_end = &__isr_vector_end;
        size_t len = (size_t)(ram_isr_vector_end - ram_isr_vector_start);

        for (size_t i = 0; i < len; i++) {
                ram_isr_vector_start[i] = flash_isr_vector_start[i];
        }

#ifdef SEMIHOSTING
        /*
         * %p requires a void pointer and the byte count is a size_t, so cast
         * explicitly; %lu is used instead of %zu because reduced C libraries
         * may not implement the C99 'z' length modifier.
         */
        printf("relocate %p-%p to %p, %lu bytes\n",
               (void *)flash_isr_vector_start,
               (void *)(flash_isr_vector_start + len),
               (void *)ram_isr_vector_start,
               (unsigned long)len);
#endif

        extern volatile char _si_itcm_text;
        extern volatile char __itcm_text_start, __itcm_text_end;

        const volatile char *flash_itcm_text_start = &_si_itcm_text;
        volatile char *ram_itcm_text_start = &__itcm_text_start;
        volatile char *ram_itcm_text_end = &__itcm_text_end;
        len = (size_t)(ram_itcm_text_end - ram_itcm_text_start);

        for (size_t i = 0; i < len; i++) {
                ram_itcm_text_start[i] = flash_itcm_text_start[i];
        }

#ifdef SEMIHOSTING
        printf("relocate %p-%p to %p, %lu bytes\n",
               (void *)flash_itcm_text_start,
               (void *)(flash_itcm_text_start + len),
               (void *)ram_itcm_text_start,
               (unsigned long)len);
#endif
}
__attribute__((section (".itcm_text")))
void HAL_GPIO_EXTI_Callback(uint16_t GPIO_Pin)
{
	/*
	 * EXTI callback: emit a short pulse on PA2 when the event came from
	 * pin 0; events from any other pin are ignored.
	 */
	if (GPIO_Pin == GPIO_PIN_0) {
		const uint32_t pulse_mask = (uint32_t) GPIO_PIN_2;

		/* BSRR: low half-word sets the pin, high half-word resets it. */
		GPIOA->BSRR = pulse_mask;
		GPIOA->BSRR = pulse_mask << 16U;
	}
}
__attribute__((section (".itcm_text")))
void SysTick_Handler(void)
{
	/* SysTick interrupt handler: only forwards to the HAL tick increment. */
	HAL_IncTick();
}

/******************************************************************************/
/*                 STM32H7xx Peripherals Interrupt Handlers                   */
/*  Add here the Interrupt Handler for the used peripheral(s) (PPP), for the  */
/*  available peripheral interrupt handler's name please refer to the startup */
/*  file (startup_stm32h7xx.s).                                               */
/******************************************************************************/
/*
 * EXTI line 0 interrupt handler, placed in the .itcm_text section.
 *
 * The callback is invoked directly and first, so the output pulse is emitted
 * before any HAL code runs; the interrupt is acknowledged afterwards.
 */
__attribute__((section (".itcm_text")))
void EXTI0_IRQHandler(void)
{
	HAL_GPIO_EXTI_Callback(GPIO_PIN_0);

	// Still make a dummy HAL call to ACK the IRQ
	// TODO: ACK it manually
	// NOTE(review): 0xFFFF presumably makes the HAL acknowledge every EXTI
	// line 0-15 rather than only line 0, and the HAL may invoke
	// HAL_GPIO_EXTI_Callback(0xFFFF) again (ignored there because the pin
	// is not GPIO_PIN_0) -- confirm against the HAL source.
	HAL_GPIO_EXTI_IRQHandler(0xFFFF);
}
0

If you have a fediverse account, you can quote this note from your own instance. Search https://mk.absturztau.be/notes/ajv8mjcw305b017a on your instance and quote it. (Note that quoting is not supported in Mastodon.)