Performance Optimization¶
A guide to optimizing performance on the Nexus Embedded Platform.
Overview¶
Performance optimization in embedded systems requires balancing:
Speed: Execution time and throughput
Memory: RAM and flash usage
Power: Energy consumption
Code Size: Flash footprint
This guide covers optimization techniques for the Nexus platform.
Profiling and Measurement¶
Timing Measurement¶
Cycle Counter
/* Enable the DWT cycle counter: turn on trace (TRCENA), zero the counter,
 * then start it. */
CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk;
DWT->CYCCNT = 0;
DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk;

/* Measure execution time. The counter value is held in 32 bits, so the
 * unsigned subtraction stays correct across a single wrap-around. */
uint32_t start = DWT->CYCCNT;
function_to_measure();
uint32_t cycles = DWT->CYCCNT - start;

/* Convert cycles to microseconds. Multiplying first in 64 bits keeps the
 * result exact for clocks that are not a whole number of MHz and avoids the
 * divide-by-zero that (SystemCoreClock / 1000000) produces below 1 MHz.
 * Assumes SystemCoreClock is non-zero. */
uint32_t microseconds = (uint32_t)(((uint64_t)cycles * 1000000u) / SystemCoreClock);
Timing Macros
/*
 * Cycle-count timing helpers built on DWT->CYCCNT.
 *
 * TIMING_START() declares a local `_timing_start`, so it can be used at most
 * once per scope and must appear before the matching TIMING_END().
 * TIMING_END(name) logs the elapsed cycle count under the label `name`.
 *
 * The count is cast to unsigned long and printed with %lu because uint32_t
 * is not guaranteed to be `unsigned int`; pairing it with %u is a
 * format/argument mismatch on toolchains where it is `unsigned long`.
 * NOTE(review): assumes LOG_INFO takes printf-style format strings - confirm.
 */
#define TIMING_START() \
    uint32_t _timing_start = DWT->CYCCNT
#define TIMING_END(name) \
    do { \
        uint32_t _cycles = DWT->CYCCNT - _timing_start; \
        LOG_INFO("%s: %lu cycles", (name), (unsigned long)_cycles); \
    } while (0)

/* Usage */
TIMING_START();
process_data();
TIMING_END("process_data");
Memory Profiling¶
Stack Usage
/* Linker-provided stack bounds: _sstack is the lowest address, _estack the
 * end (one past the highest word) of the stack region. */
extern uint32_t _sstack;
extern uint32_t _estack;

/*
 * Fill the unused part of the stack with the 0xDEADBEEF pattern.
 *
 * Only the words from _sstack up to a guard gap below this function's own
 * frame are painted. Painting all the way to _estack would overwrite the
 * live stack contents (return addresses and locals of the callers) whenever
 * this is called after the stack is already in use.
 *
 * Assumes a descending stack, i.e. the in-use portion lies between the
 * current stack pointer and _estack.
 */
void stack_fill(void) {
    /* Words left untouched below the local marker so that this function's
     * own frame is never overwritten. */
    enum { GUARD_WORDS = 16 };

    /* The address of a local approximates the current stack pointer. */
    volatile uint32_t marker = 0;
    uint32_t* limit = (uint32_t*)&marker - GUARD_WORDS;
    uint32_t* p = &_sstack;

    /* If the caller runs on a different stack, never paint past the region. */
    if (limit > &_estack) {
        limit = &_estack;
    }

    while (p < limit) {
        *p++ = 0xDEADBEEF;
    }
}
/*
 * Report stack usage in bytes.
 *
 * Walks upward from _sstack counting words that still hold the 0xDEADBEEF
 * fill pattern; everything from the first overwritten word up to _estack is
 * reported as used.
 */
uint32_t stack_get_usage(void) {
    uint32_t untouched = 0;

    for (uint32_t* cursor = &_sstack; cursor < &_estack; cursor++) {
        if (*cursor != 0xDEADBEEF) {
            break;
        }
        untouched++;
    }

    return (&_estack - &_sstack - untouched) * sizeof(uint32_t);
}
Heap Usage
/* Track heap allocations */
static size_t heap_allocated = 0; /* bytes currently allocated via tracked_malloc */
static size_t heap_peak = 0;      /* highest value heap_allocated has reached */

/*
 * malloc() wrapper that records the requested size.
 *
 * Returns the pointer from malloc(); on failure (NULL) the counters are left
 * unchanged.
 */
void* tracked_malloc(size_t size) {
    void* ptr = malloc(size);
    if (ptr) {
        heap_allocated += size;
        if (heap_allocated > heap_peak) {
            heap_peak = heap_allocated;
        }
    }
    return ptr;
}

/*
 * free() wrapper that subtracts `size` from the running total.
 *
 * `size` must be the value originally passed to tracked_malloc() for `ptr`.
 * Freeing NULL is a no-op and does not touch the counters, and the running
 * total saturates at zero instead of wrapping around if `size` is larger than
 * what is currently recorded.
 */
void tracked_free(void* ptr, size_t size) {
    if (ptr == NULL) {
        return;
    }
    free(ptr);
    heap_allocated = (size > heap_allocated) ? 0 : heap_allocated - size;
}
Code Optimization¶
Compiler Optimization¶
Optimization Levels
# Per-configuration C compiler flags. Each set() replaces whatever value the
# variable held before.
# NOTE(review): CMake's built-in defaults for the non-debug configurations
# normally include -DNDEBUG; these overrides do not - confirm that assert()
# is meant to stay active in those builds.
# Debug build - no optimization, with debug info
set(CMAKE_C_FLAGS_DEBUG "-O0 -g")
# Release build - optimize for speed
set(CMAKE_C_FLAGS_RELEASE "-O2")
# Size-optimized build
set(CMAKE_C_FLAGS_MINSIZEREL "-Os")
# Highest optimization level used here (-O3), with debug info retained
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g")
Function Inlining
/*
 * Multiply two 32-bit unsigned values; the result wraps modulo 2^32.
 * Marked always_inline so that every call site gets the body expanded in
 * place - appropriate for small, frequently called helpers.
 */
static inline __attribute__((always_inline))
uint32_t fast_multiply(uint32_t a, uint32_t b) {
    const uint32_t product = a * b;
    return product;
}
/*
 * Prevent inlining for large functions.
 * The noinline attribute asks the compiler to keep this as a real call even
 * when it would otherwise inline it, so the body is emitted only once.
 */
__attribute__((noinline))
void large_function(void) {
/* Large function body */
}
Algorithm Optimization¶
Use Efficient Algorithms
/*
 * Bad: O(n^2) search.
 * Compares every element against every later element and returns true as
 * soon as two equal values are found, false if none are. `array` is not
 * read when `length` is 0.
 */
bool find_duplicate_slow(int* array, size_t length) {
    size_t outer = 0;

    while (outer < length) {
        const int candidate = array[outer];
        size_t inner = outer + 1;

        while (inner < length) {
            if (candidate == array[inner]) {
                return true;
            }
            inner++;
        }
        outer++;
    }

    return false;
}
/*
 * Good: single pass with a hash set.
 * Each element is looked up in the set of values seen so far; a hit means a
 * duplicate. The set is destroyed on every exit path before returning.
 * The pass is O(n) provided the hash set operations are constant time.
 */
bool find_duplicate_fast(int* array, size_t length) {
    bool found = false;
    hash_set_t* seen = hash_set_create();

    for (size_t i = 0; i < length && !found; i++) {
        if (hash_set_contains(seen, array[i])) {
            found = true;
        } else {
            hash_set_add(seen, array[i]);
        }
    }

    hash_set_destroy(seen);
    return found;
}
Loop Optimization
/* Bad: Function call in loop condition.
 * strlen(str) appears in the condition, so it is evaluated again before
 * every iteration. */
for (size_t i = 0; i < strlen(str); i++) {
process(str[i]);
}
/* Good: Cache length.
 * The length is computed once, before the loop, and the condition only
 * compares against the cached value. */
size_t len = strlen(str);
for (size_t i = 0; i < len; i++) {
process(str[i]);
}
/*
 * Better: Loop unrolling for small fixed sizes.
 *
 * Calls process() on str[base] .. str[base + 3]. Relies on `str` and
 * `process` being in scope at the point of use.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (e.g. as the body of an unbraced `if`), and `base` is
 * parenthesized so expressions such as `i << 2` index correctly.
 * `base` is evaluated four times - do not pass an expression with side
 * effects.
 */
#define PROCESS_4(base) \
    do { \
        process(str[(base)]); \
        process(str[(base) + 1]); \
        process(str[(base) + 2]); \
        process(str[(base) + 3]); \
    } while (0)
Memory Optimization¶
Reduce Memory Usage¶
Use Bit Fields
/* Bad: Wastes memory - every flag occupies a whole bool. */
typedef struct {
    bool flag1, flag2, flag3, flag4; /* 1 byte each */
} flags_wasteful_t; /* 4 bytes total */
/* Good: Pack into single byte - four 1-bit flags plus 4 spare bits. */
typedef struct {
    uint8_t flag1 : 1, flag2 : 1, flag3 : 1, flag4 : 1;
    uint8_t reserved : 4; /* unused padding bits */
} flags_packed_t; /* 1 byte total */
Use Const for Read-Only Data
/* Bad: Uses RAM.
 * A non-const initialized array is writable data, so it needs a RAM copy in
 * addition to its initial values. */
static uint8_t lookup_table[256] = { /* ... */ };
/* Good: Uses flash.
 * The const qualifier lets the toolchain keep the table in read-only storage.
 * NOTE(review): flash placement depends on the linker script mapping
 * read-only data to flash - confirm for the target. */
static const uint8_t lookup_table[256] = { /* ... */ };
Memory Pools
/*
 * Fixed-size memory pool: POOL_SIZE blocks of BLOCK_SIZE bytes each, with a
 * parallel in-use flag per block. No locking is performed here.
 */
#define POOL_SIZE 10
#define BLOCK_SIZE 64

/* Alignment guaranteed for every block returned by pool_alloc(). A plain
 * uint8_t array only guarantees byte alignment, which is not enough when a
 * caller stores wider types or structs in a block. */
#define POOL_ALIGNMENT 8
#if (BLOCK_SIZE % POOL_ALIGNMENT) != 0
#error "BLOCK_SIZE must be a multiple of POOL_ALIGNMENT"
#endif

__attribute__((aligned(POOL_ALIGNMENT)))
static uint8_t memory_pool[POOL_SIZE][BLOCK_SIZE];
static bool pool_used[POOL_SIZE];

/*
 * Allocate one block.
 *
 * Returns a pointer to the first free block (lowest index), aligned to
 * POOL_ALIGNMENT, or NULL when every block is in use. The block contents are
 * not cleared.
 */
void* pool_alloc(void) {
    for (size_t i = 0; i < POOL_SIZE; i++) {
        if (!pool_used[i]) {
            pool_used[i] = true;
            return memory_pool[i];
        }
    }
    return NULL;
}

/*
 * Release a block previously returned by pool_alloc().
 *
 * Pointers that do not match the start of a pool block (including NULL) are
 * ignored. The lookup compares against each block start with == rather than
 * using pointer arithmetic, which stays well defined for foreign pointers.
 */
void pool_free(void* ptr) {
    for (size_t i = 0; i < POOL_SIZE; i++) {
        if (memory_pool[i] == ptr) {
            pool_used[i] = false;
            return;
        }
    }
}
DMA Optimization¶
Use DMA for Bulk Transfers¶
/*
 * Bad: CPU-based transfer.
 * Sends `length` bytes one at a time, spinning on the TXE flag in the status
 * register before each write to the data register. The CPU is busy for the
 * whole transfer.
 */
void uart_send_slow(const uint8_t* data, size_t length) {
    const uint8_t* next = data;
    size_t remaining = length;

    while (remaining > 0) {
        /* Spin until the transmitter reports it can take another byte. */
        while ((UART->SR & UART_SR_TXE) == 0) {
        }
        UART->DR = *next;
        next++;
        remaining--;
    }
}
/*
 * Good: DMA transfer.
 *
 * Programs the DMA stream with the UART data register as the peripheral
 * address, `data` as the memory address and `length` as the item count, then
 * sets the stream enable bit and returns without waiting for completion.
 *
 * NOTE(review): because the function returns immediately, `data` presumably
 * has to stay valid and unmodified until the transfer finishes - confirm and
 * document for callers.
 * NOTE(review): `length` is a size_t but the NDTR register is likely
 * narrower; oversized lengths would be truncated - verify against the
 * reference manual.
 * NOTE(review): direction, memory increment and channel selection are not
 * set here, and the stream is not checked for being idle first; presumably
 * configured/handled elsewhere - confirm.
 */
void uart_send_fast(const uint8_t* data, size_t length) {
/* Configure DMA */
DMA_Stream->PAR = (uint32_t)&UART->DR;
DMA_Stream->M0AR = (uint32_t)data;
DMA_Stream->NDTR = length;
DMA_Stream->CR |= DMA_SxCR_EN;
/* CPU is free to do other work */
}
Cache Optimization¶
Data Cache¶
/* Align DMA buffers to cache line.
 * The buffer starts on a CACHE_LINE_SIZE boundary and its 256-byte size is a
 * whole multiple of CACHE_LINE_SIZE, so it does not share a cache line with
 * neighbouring data.
 * NOTE(review): 32 bytes is assumed to be the target's data cache line size -
 * confirm for the core in use. */
#define CACHE_LINE_SIZE 32
__attribute__((aligned(CACHE_LINE_SIZE)))
static uint8_t dma_buffer[256];
/* Clean cache before DMA TX.
 * Forwards `buffer` and `size` to SCB_CleanDCache_by_Addr so that data the
 * CPU has written is presumably flushed to memory before the DMA reads it.
 * NOTE(review): `size` is a size_t; check the parameter type the cache
 * routine expects and whether `buffer` must be cache-line aligned. */
void dma_tx_prepare(void* buffer, size_t size) {
SCB_CleanDCache_by_Addr(buffer, size);
}
/* Invalidate cache after DMA RX.
 * Forwards `buffer` and `size` to SCB_InvalidateDCache_by_Addr so that later
 * CPU reads presumably fetch the data the DMA wrote rather than stale cached
 * contents.
 * NOTE(review): if `buffer` or `size` is not cache-line aligned, invalidation
 * may also discard unrelated data sharing those lines - confirm callers only
 * pass aligned buffers. */
void dma_rx_complete(void* buffer, size_t size) {
SCB_InvalidateDCache_by_Addr(buffer, size);
}
Instruction Cache¶
/* Enable instruction cache.
 * Thin wrapper that calls SCB_EnableICache() and nothing else. */
void enable_icache(void) {
SCB_EnableICache();
}
/* Place frequently called functions in fast memory.
 * The section attribute emits this function into the ".fast_code" section.
 * NOTE(review): the linker script must map ".fast_code" to the intended fast
 * memory (and startup code must copy it there if it is RAM) - confirm. */
__attribute__((section(".fast_code")))
void critical_function(void) {
/* Time-critical code */
}
Power Optimization¶
Sleep Modes¶
/*
 * Idle loop: never returns. Each pass executes WFI, so the core waits for
 * the next interrupt instead of spinning.
 */
void idle_task(void) {
    for (;;) {
        __WFI(); /* Wait for interrupt */
    }
}
/*
 * Deep sleep mode.
 *
 * Sets SLEEPDEEP in the System Control Register, waits for an interrupt, and
 * clears SLEEPDEEP again once execution resumes. The bit is not cleared by
 * waking up, so leaving it set would turn every later __WFI() - such as the
 * one in idle_task() - into a deep sleep as well.
 */
void enter_deep_sleep(void) {
    SCB->SCR |= SCB_SCR_SLEEPDEEP_Msk;
    __WFI();
    /* Back to normal sleep for subsequent WFI instructions. */
    SCB->SCR &= ~SCB_SCR_SLEEPDEEP_Msk;
}
Clock Gating¶
/*
 * Gate peripheral clocks: clears the GPIOC and GPIOD enable bits in
 * RCC->AHB1ENR, and sets the SPI1 enable bit in RCC->APB2ENR only when
 * `need_spi` is set.
 */
void optimize_clocks(void) {
    /* Clock-enable bits of the peripherals treated as unused */
    const uint32_t unused_gpio_clocks = RCC_AHB1ENR_GPIOCEN |
                                        RCC_AHB1ENR_GPIODEN;

    RCC->AHB1ENR &= ~unused_gpio_clocks;

    /* Enable only when needed */
    if (need_spi) {
        RCC->APB2ENR |= RCC_APB2ENR_SPI1EN;
    }
}
See Also¶
Debugging Guide - Profiling and debugging
Testing - Performance testing
Architecture Design - System architecture
Summary¶
Key optimization techniques:
Profiling: Measure before optimizing
Algorithms: Use efficient algorithms
Memory: Minimize RAM/flash usage
DMA: Offload bulk transfers
Cache: Optimize cache usage
Power: Use sleep modes and clock gating
Always measure the impact of optimizations to ensure they provide real benefits.