Performance Optimization

Guide to optimizing performance in the Nexus Embedded Platform.

Overview

Performance optimization in embedded systems requires balancing:

  • Speed: Execution time and throughput

  • Memory: RAM and flash usage

  • Power: Energy consumption

  • Code Size: Flash footprint

This guide covers optimization techniques for the Nexus platform.

Profiling and Measurement

Timing Measurement

Cycle Counter

/* Enable DWT cycle counter */
CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
DWT->CYCCNT = 0;
DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;

/* Measure execution time */
uint32_t start = DWT->CYCCNT;
function_to_measure();
uint32_t cycles = DWT->CYCCNT - start;
uint32_t microseconds = cycles / (SystemCoreClock / 1000000);

Timing Macros

#define TIMING_START() \
    uint32_t _timing_start = DWT->CYCCNT

#define TIMING_END(name) \
    do { \
        uint32_t _cycles = DWT->CYCCNT - _timing_start; \
        LOG_INFO("%s: %u cycles", name, _cycles); \
    } while(0)

/* Usage */
TIMING_START();
process_data();
TIMING_END("process_data");

Memory Profiling

Stack Usage

/* Fill stack with pattern */
extern uint32_t _sstack;
extern uint32_t _estack;

void stack_fill(void) {
    uint32_t* p = &_sstack;
    while (p < &_estack) {
        *p++ = 0xDEADBEEF;
    }
}

/* Check stack usage */
uint32_t stack_get_usage(void) {
    uint32_t* p = &_sstack;
    uint32_t count = 0;
    while (p < &_estack && *p == 0xDEADBEEF) {
        p++;
        count++;
    }
    return (&_estack - &_sstack - count) * sizeof(uint32_t);
}

Heap Usage

/* Track heap allocations */
static size_t heap_allocated = 0;
static size_t heap_peak = 0;

void* tracked_malloc(size_t size) {
    void* ptr = malloc(size);
    if (ptr) {
        heap_allocated += size;
        if (heap_allocated > heap_peak) {
            heap_peak = heap_allocated;
        }
    }
    return ptr;
}

void tracked_free(void* ptr, size_t size) {
    free(ptr);
    heap_allocated -= size;
}

Code Optimization

Compiler Optimization

Optimization Levels

# Debug build - no optimization
set(CMAKE_C_FLAGS_DEBUG "-O0 -g")

# Release build - optimize for speed
set(CMAKE_C_FLAGS_RELEASE "-O2")

# Size-optimized build
set(CMAKE_C_FLAGS_MINSIZEREL "-Os")

# Maximum optimization
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g")

Function Inlining

/* Force inline for small, frequently called functions */
static inline __attribute__((always_inline))
uint32_t fast_multiply(uint32_t a, uint32_t b) {
    return a * b;
}

/* Prevent inlining for large functions */
__attribute__((noinline))
void large_function(void) {
    /* Large function body */
}

Algorithm Optimization

Use Efficient Algorithms

/* Bad: O(n²) search */
bool find_duplicate_slow(int* array, size_t length) {
    for (size_t i = 0; i < length; i++) {
        for (size_t j = i + 1; j < length; j++) {
            if (array[i] == array[j]) {
                return true;
            }
        }
    }
    return false;
}

/* Good: O(n) with hash set */
bool find_duplicate_fast(int* array, size_t length) {
    hash_set_t* seen = hash_set_create();
    for (size_t i = 0; i < length; i++) {
        if (hash_set_contains(seen, array[i])) {
            hash_set_destroy(seen);
            return true;
        }
        hash_set_add(seen, array[i]);
    }
    hash_set_destroy(seen);
    return false;
}

Loop Optimization

/* Bad: Function call in loop condition */
for (size_t i = 0; i < strlen(str); i++) {
    process(str[i]);
}

/* Good: Cache length */
size_t len = strlen(str);
for (size_t i = 0; i < len; i++) {
    process(str[i]);
}

/* Better: Loop unrolling for small fixed sizes */
#define PROCESS_4(base) \
    process(str[base]); \
    process(str[base+1]); \
    process(str[base+2]); \
    process(str[base+3])

Memory Optimization

Reduce Memory Usage

Use Bit Fields

/* Bad: Wastes memory */
typedef struct {
    bool flag1;     /* 1 byte */
    bool flag2;     /* 1 byte */
    bool flag3;     /* 1 byte */
    bool flag4;     /* 1 byte */
} flags_wasteful_t;  /* 4 bytes total */

/* Good: Pack into single byte */
typedef struct {
    uint8_t flag1 : 1;
    uint8_t flag2 : 1;
    uint8_t flag3 : 1;
    uint8_t flag4 : 1;
    uint8_t reserved : 4;
} flags_packed_t;  /* 1 byte total */

Use Const for Read-Only Data

/* Bad: Uses RAM */
static uint8_t lookup_table[256] = { /* ... */ };

/* Good: Uses flash */
static const uint8_t lookup_table[256] = { /* ... */ };

Memory Pools

/* Fixed-size memory pool */
#define POOL_SIZE 10
#define BLOCK_SIZE 64

static uint8_t memory_pool[POOL_SIZE][BLOCK_SIZE];
static bool pool_used[POOL_SIZE];

void* pool_alloc(void) {
    for (size_t i = 0; i < POOL_SIZE; i++) {
        if (!pool_used[i]) {
            pool_used[i] = true;
            return memory_pool[i];
        }
    }
    return NULL;
}

void pool_free(void* ptr) {
    for (size_t i = 0; i < POOL_SIZE; i++) {
        if (memory_pool[i] == ptr) {
            pool_used[i] = false;
            return;
        }
    }
}

DMA Optimization

Use DMA for Bulk Transfers

/* Bad: CPU-based transfer */
void uart_send_slow(const uint8_t* data, size_t length) {
    for (size_t i = 0; i < length; i++) {
        while (!(UART->SR & UART_SR_TXE));
        UART->DR = data[i];
    }
}

/* Good: DMA transfer */
void uart_send_fast(const uint8_t* data, size_t length) {
    /* Configure DMA */
    DMA_Stream->PAR = (uint32_t)&UART->DR;
    DMA_Stream->M0AR = (uint32_t)data;
    DMA_Stream->NDTR = length;
    DMA_Stream->CR |= DMA_SxCR_EN;

    /* CPU is free to do other work */
}

Cache Optimization

Data Cache

/* Align DMA buffers to cache line */
#define CACHE_LINE_SIZE 32

__attribute__((aligned(CACHE_LINE_SIZE)))
static uint8_t dma_buffer[256];

/* Clean cache before DMA TX */
void dma_tx_prepare(void* buffer, size_t size) {
    SCB_CleanDCache_by_Addr(buffer, size);
}

/* Invalidate cache after DMA RX */
void dma_rx_complete(void* buffer, size_t size) {
    SCB_InvalidateDCache_by_Addr(buffer, size);
}

Instruction Cache

/* Enable instruction cache */
void enable_icache(void) {
    SCB_EnableICache();
}

/* Place frequently called functions in fast memory */
__attribute__((section(".fast_code")))
void critical_function(void) {
    /* Time-critical code */
}

Power Optimization

Sleep Modes

/* Enter sleep mode when idle */
void idle_task(void) {
    while (1) {
        __WFI();  /* Wait for interrupt */
    }
}

/* Deep sleep mode */
void enter_deep_sleep(void) {
    SCB->SCR |= SCB_SCR_SLEEPDEEP_Msk;
    __WFI();
}

Clock Gating

/* Disable unused peripheral clocks */
void optimize_clocks(void) {
    /* Disable unused peripherals */
    RCC->AHB1ENR &= ~(RCC_AHB1ENR_GPIOCEN |
                      RCC_AHB1ENR_GPIODEN);

    /* Enable only when needed */
    if (need_spi) {
        RCC->APB2ENR |= RCC_APB2ENR_SPI1EN;
    }
}

See Also

Summary

Key optimization techniques:

  • Profiling: Measure before optimizing

  • Algorithms: Use efficient algorithms

  • Memory: Minimize RAM/flash usage

  • DMA: Offload bulk transfers

  • Cache: Optimize cache usage

  • Power: Use sleep modes and clock gating

Always measure the impact of optimizations to ensure they provide real benefits.