Performance

Latency Optimization

Sub-microsecond trading latency with kernel bypass and lock-free algorithms

Achieving sub-microsecond latency requires optimization at every layer, from network stack to CPU cache.

Latency Breakdown

┌───────────────────────────────────────────────────────────────────────┐
│                        ORDER LIFECYCLE LATENCY                         │
├───────────────────────────────────────────────────────────────────────┤
│                                                                        │
│   Network Receive       ██████████                   0.5-2 us         │
│   Protocol Decode       ██                           0.1-0.3 us       │
│   Validation            █                            0.05-0.1 us      │
│   Risk Check            █                            0.05-0.1 us      │
│   Orderbook Insert      ██                           0.002-0.5 us     │
│   Match Engine          ██                           0.002-0.5 us     │
│   Trade Execution       █                            0.1-0.2 us       │
│   Persistence (async)   ████████████████████████     1-5 us (async)   │
│   Network Send          ██████████                   0.5-2 us         │
│                                                                        │
│   TOTAL (hot path):     ~2-5 us (CPU) / ~10 ns (GPU)                   │
│                                                                        │
└───────────────────────────────────────────────────────────────────────┘
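
A breakdown like this is usually produced by timestamping each stage on live traffic. Below is a minimal bookkeeping sketch, not the engine's actual instrumentation: the package name, stage names, and the nanosecond timestamps are illustrative (a clock such as the Timestamp helper in the Latency Measurement section would supply startNs/endNs).

package engine // hypothetical package name

// Stage indices mirror the lifecycle stages charted above (names are illustrative).
const (
    StageDecode = iota
    StageValidate
    StageRisk
    StageInsert
    StageMatch
    StageSend
    stageCount
)

// StageTimer accumulates per-stage nanoseconds so a breakdown like the one
// above can be rebuilt from production measurements.
type StageTimer struct {
    totalNs [stageCount]int64
    count   [stageCount]int64
}

// Observe records one measurement for a stage, given start/end nanosecond timestamps.
func (st *StageTimer) Observe(stage int, startNs, endNs int64) {
    st.totalNs[stage] += endNs - startNs
    st.count[stage]++
}

// MeanNs returns the average latency of a stage in nanoseconds.
func (st *StageTimer) MeanNs(stage int) float64 {
    if st.count[stage] == 0 {
        return 0
    }
    return float64(st.totalNs[stage]) / float64(st.count[stage])
}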

Kernel Bypass Networking

io_uring (Linux 5.1+)

package network

import (
    "github.com/iceber/iouring-go"
)

// UringReceiver receives via io_uring, cutting per-packet syscall overhead
type UringReceiver struct {
    ring     *iouring.IOURing
    buffers  [][]byte
    bufferID uint16
}

func NewUringReceiver(fd int, bufferCount int) (*UringReceiver, error) {
    ring, err := iouring.New(4096,
        iouring.WithSQPoll(100), // Submission queue polling
    )
    if err != nil {
        return nil, err
    }

    // Pre-register buffers for zero-copy
    buffers := make([][]byte, bufferCount)
    for i := range buffers {
        buffers[i] = make([]byte, 65536)
    }

    // Register buffer ring
    ring.RegisterBuffers(buffers)

    return &UringReceiver{
        ring:    ring,
        buffers: buffers,
    }, nil
}

// RecvZeroCopy receives without kernel-to-user copy
func (r *UringReceiver) RecvZeroCopy(fd int) ([]byte, error) {
    sqe := r.ring.GetSQE()
    sqe.PrepareRecv(fd, nil, 0,
        iouring.IOSQE_BUFFER_SELECT,
    )
    sqe.SetBufferGroup(r.bufferID)

    cqe, err := r.ring.SubmitAndWait(1)
    if err != nil {
        return nil, err
    }

    // With IOSQE_BUFFER_SELECT the kernel reports the chosen buffer in the
    // upper 16 bits of the CQE flags (IORING_CQE_BUFFER_SHIFT)
    bufIdx := cqe.Flags >> 16
    length := cqe.Res

    return r.buffers[bufIdx][:length], nil
}
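
A usage sketch for the receiver above; the socket file descriptor fd and the decode callback are hypothetical stand-ins for the real feed handler, and error handling is abbreviated.

package network

// ReceiveLoop drives the receiver on a dedicated, pinned goroutine.
// decode is a hypothetical frame handler; replace with the real protocol codec.
func ReceiveLoop(fd int, decode func([]byte)) error {
    rx, err := NewUringReceiver(fd, 1024)
    if err != nil {
        return err
    }
    for {
        frame, err := rx.RecvZeroCopy(fd)
        if err != nil {
            return err
        }
        // frame aliases a registered buffer: copy it if it must outlive the next receive
        decode(frame)
    }
}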

DPDK Integration (Optional)

//go:build dpdk
// +build dpdk

package network

/*
#cgo CFLAGS: -I/usr/local/include/dpdk
#cgo LDFLAGS: -L/usr/local/lib -ldpdk
#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>

static inline int dpdk_rx_burst(uint16_t port, uint16_t queue,
    struct rte_mbuf **bufs, uint16_t nb_bufs) {
    return rte_eth_rx_burst(port, queue, bufs, nb_bufs);
}
*/
import "C"

// DPDKReceiver bypasses kernel entirely
type DPDKReceiver struct {
    portID  uint16
    queueID uint16
    mbufs   []*C.struct_rte_mbuf
}

func (r *DPDKReceiver) Receive() ([][]byte, int) {
    n := C.dpdk_rx_burst(
        C.uint16_t(r.portID),
        C.uint16_t(r.queueID),
        &r.mbufs[0],
        C.uint16_t(len(r.mbufs)),
    )

    packets := make([][]byte, n)
    for i := 0; i < int(n); i++ {
        mbuf := r.mbufs[i]
        // GoBytes copies the payload out of the mbuf, so the mbuf can be
        // returned to its pool immediately after
        data := C.GoBytes(
            unsafe.Pointer(uintptr(unsafe.Pointer(mbuf))+uintptr(mbuf.data_off)),
            C.int(mbuf.data_len),
        )
        packets[i] = data
        C.rte_pktmbuf_free(mbuf)
    }

    return packets, int(n)
}
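
A usage sketch for the DPDK receiver, assuming the EAL, port, queue, and mbuf array were initialized elsewhere; handle is a hypothetical packet callback.

//go:build dpdk

package network

// PollLoop busy-polls the RX queue. DPDK has no blocking receive, so the
// core running this loop should be isolated and pinned.
func (r *DPDKReceiver) PollLoop(handle func([]byte)) {
    for {
        packets, n := r.Receive()
        if n == 0 {
            continue // spin; a PAUSE hint or short backoff can go here
        }
        for _, p := range packets {
            handle(p)
        }
    }
}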

Lock-Free Order Book

Atomic Price Level Updates

package orderbook

import (
    "sync/atomic"
)

// LockFreePriceLevel uses atomic operations only
type LockFreePriceLevel struct {
    price    int64 // Fixed-point price (8 decimals)
    quantity atomic.Int64
    head     atomic.Pointer[OrderNode]
    tail     atomic.Pointer[OrderNode]
}

type OrderNode struct {
    order Order
    next  atomic.Pointer[OrderNode]
}

// NewLockFreePriceLevel seeds the Michael-Scott queue with a sentinel node;
// head and tail must never be nil for AddOrder/RemoveOrder to work.
func NewLockFreePriceLevel(price int64) *LockFreePriceLevel {
    sentinel := &OrderNode{}
    pl := &LockFreePriceLevel{price: price}
    pl.head.Store(sentinel)
    pl.tail.Store(sentinel)
    return pl
}

// AddOrder lock-free insertion (Michael-Scott queue)
func (pl *LockFreePriceLevel) AddOrder(order *Order) {
    node := &OrderNode{order: *order}

    for {
        tail := pl.tail.Load()
        next := tail.next.Load()

        if tail == pl.tail.Load() { // Consistency check
            if next == nil {
                // Try to link new node
                if tail.next.CompareAndSwap(nil, node) {
                    // Try to swing tail
                    pl.tail.CompareAndSwap(tail, node)
                    pl.quantity.Add(order.Quantity)
                    return
                }
            } else {
                // Tail was not pointing to last node
                pl.tail.CompareAndSwap(tail, next)
            }
        }
    }
}

// RemoveOrder lock-free dequeue of the front order. Only the order at the
// head of the FIFO can be removed lock-free here; if the front order's ID
// does not match, nil is returned and the caller must fall back to a
// lazy-delete (tombstone) scheme.
func (pl *LockFreePriceLevel) RemoveOrder(orderID uint64) *Order {
    for {
        head := pl.head.Load()
        tail := pl.tail.Load()
        next := head.next.Load()

        if head == pl.head.Load() { // Consistency check
            if head == tail {
                if next == nil {
                    return nil // Queue empty
                }
                // Tail is lagging behind; help it along
                pl.tail.CompareAndSwap(tail, next)
            } else {
                if next.order.ID != orderID {
                    return nil // Front order is not the target
                }
                if pl.head.CompareAndSwap(head, next) {
                    pl.quantity.Add(-next.order.Quantity)
                    return &next.order
                }
            }
        }
    }
}
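
A hedged benchmark sketch for the queue above, using the sentinel-seeding constructor and assuming Order carries the ID and Quantity fields referenced in this file; the price literal is an arbitrary example.

package orderbook

import (
    "sync/atomic"
    "testing"
)

// BenchmarkLockFreeAdd hammers AddOrder from many goroutines; run with
// -cpu=1,4,16 to see how the queue behaves under growing contention.
func BenchmarkLockFreeAdd(b *testing.B) {
    pl := NewLockFreePriceLevel(100_00000000) // example fixed-point price
    var nextID uint64
    b.RunParallel(func(pb *testing.PB) {
        for pb.Next() {
            pl.AddOrder(&Order{
                ID:       atomic.AddUint64(&nextID, 1),
                Quantity: 1,
            })
        }
    })
}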

Cache-Optimized Price Tree

package orderbook

import (
    "unsafe"
)

const (
    CacheLineSize = 64
    MaxLevels     = 1024
)

// CacheAlignedTree is a B-tree optimized for L1 cache
type CacheAlignedTree struct {
    // Each node fits in one cache line
    nodes [MaxLevels]CacheLineNode
    root  int32
    count int32
}

// CacheLineNode fits exactly in 64 bytes
type CacheLineNode struct {
    prices [3]int64  // 24 bytes - up to 3 prices per node
    counts [3]int64  // 24 bytes - quantities
    childs [4]int32  // 16 bytes - child indices
    // Total: 64 bytes = 1 cache line
}

func init() {
    // Startup assertion: Go has no compile-time static assert,
    // so verify the layout once at program start
    if unsafe.Sizeof(CacheLineNode{}) != CacheLineSize {
        panic("CacheLineNode must be exactly 64 bytes")
    }
}

// FindPrice walks the tree; each node is scanned linearly, which is cheap
// because the whole node sits in one cache line. Unused price slots hold 0
// and unused child slots hold -1.
func (t *CacheAlignedTree) FindPrice(price int64) (int64, bool) {
    idx := t.root
    for idx >= 0 {
        node := &t.nodes[idx]

        // Linear scan within the node (cache-hot)
        for i := 0; i < 3; i++ {
            if node.prices[i] == price {
                return node.counts[i], true
            }
            if node.prices[i] == 0 || price < node.prices[i] {
                idx = node.childs[i]
                break
            }
            if i == 2 {
                idx = node.childs[3]
            }
        }
    }
    return 0, false
}
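
A small layout test keeps the cache-line assumption honest as fields change; it only uses unsafe.Sizeof and pointer arithmetic over the node array defined above.

package orderbook

import (
    "testing"
    "unsafe"
)

// TestCacheLineNodeLayout verifies the 64-byte node size the tree relies on
// and that consecutive nodes are laid out at a 64-byte stride.
func TestCacheLineNodeLayout(t *testing.T) {
    if got := unsafe.Sizeof(CacheLineNode{}); got != CacheLineSize {
        t.Fatalf("CacheLineNode is %d bytes, want %d", got, CacheLineSize)
    }
    var tree CacheAlignedTree
    stride := uintptr(unsafe.Pointer(&tree.nodes[1])) - uintptr(unsafe.Pointer(&tree.nodes[0]))
    if stride != CacheLineSize {
        t.Fatalf("node stride is %d bytes, want %d", stride, CacheLineSize)
    }
}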

Branch Prediction Optimization

Branch-Free Order Side Handling

package orderbook

// BAD: Unpredictable branches
func (ob *OrderBook) ProcessOrderBranchy(order *Order) {
    if order.Side == Buy {
        ob.bids.Insert(order)
        if order.Price >= ob.bestAsk {
            ob.matchBuy(order)
        }
    } else {
        ob.asks.Insert(order)
        if order.Price <= ob.bestBid {
            ob.matchSell(order)
        }
    }
}

// GOOD: Side-indexed lookup tables remove the unpredictable buy/sell branch
// (the crossing check below remains, but it is strongly biased and cheap to predict)
type OrderBook struct {
    books      [2]*PriceLevels // [Buy, Sell]
    bestPrices [2]int64        // [bestBid, bestAsk]
    compareFns [2]func(int64, int64) bool
}

func NewOrderBook() *OrderBook {
    return &OrderBook{
        books:      [2]*PriceLevels{NewPriceLevels(), NewPriceLevels()},
        compareFns: [2]func(int64, int64) bool{
            func(a, b int64) bool { return a >= b }, // Buy: price >= ask
            func(a, b int64) bool { return a <= b }, // Sell: price <= bid
        },
    }
}

func (ob *OrderBook) ProcessOrderBranchFree(order *Order) {
    side := order.Side // 0 = Buy, 1 = Sell
    oppSide := 1 - side

    ob.books[side].Insert(order)

    if ob.compareFns[side](order.Price, ob.bestPrices[oppSide]) {
        ob.match(order, side)
    }
}
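
The cost being avoided here is branch misprediction, which is easiest to demonstrate in isolation. Below is a self-contained benchmark sketch (independent of the order book) comparing the same data-dependent branch over random versus sorted input.

package orderbook

import (
    "math/rand"
    "sort"
    "testing"
)

// sumPositive does identical work in both benchmarks; only the
// predictability of the branch differs.
func sumPositive(b *testing.B, vals []int64) {
    var sum int64
    b.ResetTimer()
    for i := 0; i < b.N; i++ {
        for _, v := range vals {
            if v >= 0 { // ~50% mispredicted on random input, ~0% on sorted
                sum += v
            }
        }
    }
    _ = sum
}

func randomVals() []int64 {
    vals := make([]int64, 4096)
    for i := range vals {
        vals[i] = rand.Int63n(2000) - 1000
    }
    return vals
}

func BenchmarkBranchRandom(b *testing.B) { sumPositive(b, randomVals()) }

func BenchmarkBranchSorted(b *testing.B) {
    vals := randomVals()
    sort.Slice(vals, func(i, j int) bool { return vals[i] < vals[j] })
    sumPositive(b, vals)
}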

Likely/Unlikely Hints

package orderbook

// Go has no __builtin_expect-style branch hints. These helpers are purely
// documentation: they are trivially inlined and compile down to the bare
// condition, but they make the intended fast path explicit. The effective
// optimization is arranging code so the common case is the straight-line
// (fall-through) path, as in Match and Validate below.
func likely(b bool) bool   { return b }
func unlikely(b bool) bool { return b }

func (ob *OrderBook) Match(order *Order) *Trade {
    // Most orders don't match immediately
    if unlikely(ob.canMatch(order)) {
        return ob.executeMatch(order)
    }

    // Fast path: just insert
    ob.insert(order)
    return nil
}

func (ob *OrderBook) Validate(order *Order) error {
    // Most orders are valid
    if likely(order.Quantity > 0 && order.Price > 0) {
        return nil
    }

    // Slow path: detailed validation
    return ob.validateDetailed(order)
}

Memory Access Patterns

Prefetching

package orderbook

import (
    "unsafe"
)

// prefetchSink prevents the compiler from discarding the priming load below.
var prefetchSink byte

// prefetch touches an address so its cache line is pulled in before it is
// needed. Go exposes no portable prefetch intrinsic, so a plain load is the
// usual stand-in; a PREFETCHT0 instruction in a small assembly stub is the
// stronger alternative.
func prefetch(addr unsafe.Pointer) {
    prefetchSink = *(*byte)(addr)
}

func (ob *OrderBook) ScanPriceLevels(callback func(*PriceLevel)) {
    levels := ob.levels

    for i := 0; i < len(levels); i++ {
        // Prefetch a few elements ahead while processing the current one
        if i+4 < len(levels) {
            prefetch(unsafe.Pointer(&levels[i+4]))
        }

        callback(&levels[i])
    }
}

// Alternative using slice tricks
func (ob *OrderBook) BatchProcess(orders []Order) {
    const batchSize = 8

    for i := 0; i < len(orders); i += batchSize {
        end := i + batchSize
        if end > len(orders) {
            end = len(orders)
        }

        batch := orders[i:end]

        // Touch each order to warm its cache lines before the processing pass
        // (note: a discarded read like this may be elided by the compiler)
        for j := range batch {
            _ = batch[j].Price // Touch memory
        }

        // Process batch (all in L1 cache now)
        for j := range batch {
            ob.processOrder(&batch[j])
        }
    }
}

Latency Measurement

High-Resolution Timing

package latency

import (
    "sort"
    "time"

    _ "unsafe" // required for go:linkname
)

//go:linkname nanotime runtime.nanotime
func nanotime() int64

// Timestamp returns nanoseconds without system call
func Timestamp() int64 {
    return nanotime()
}

// LatencyTracker for sub-microsecond measurement
type LatencyTracker struct {
    samples []int64
    index   int
    size    int
}

func NewLatencyTracker(size int) *LatencyTracker {
    return &LatencyTracker{
        samples: make([]int64, size),
        size:    size,
    }
}

func (lt *LatencyTracker) Record(start, end int64) {
    latency := end - start
    lt.samples[lt.index] = latency
    lt.index = (lt.index + 1) % lt.size
}

func (lt *LatencyTracker) Percentile(p float64) time.Duration {
    // Copy, sort, and index into the sorted samples
    sorted := make([]int64, len(lt.samples))
    copy(sorted, lt.samples)
    sort.Slice(sorted, func(i, j int) bool {
        return sorted[i] < sorted[j]
    })

    idx := int(float64(len(sorted)) * p)
    if idx >= len(sorted) {
        idx = len(sorted) - 1
    }
    return time.Duration(sorted[idx])
}
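
A usage sketch, with op standing in for the real hot-path call (hypothetical); results are reported only after the measurement loop so formatting stays off the hot path.

package latency

import "fmt"

// MeasureLoop times n invocations of op and prints tail latency afterwards.
func MeasureLoop(op func(), n int) {
    lt := NewLatencyTracker(n)
    for i := 0; i < n; i++ {
        start := Timestamp()
        op()
        lt.Record(start, Timestamp())
    }
    fmt.Printf("p50=%v p99=%v p999=%v\n",
        lt.Percentile(0.50), lt.Percentile(0.99), lt.Percentile(0.999))
}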

RDTSC for CPU Cycles

//go:build amd64
// +build amd64

package latency

import (
    "time"
)

// RDTSC reads CPU timestamp counter (cycles, not time)
func RDTSC() uint64

// Implement in assembly (latency_amd64.s)
// TEXT ·RDTSC(SB),NOSPLIT,$0
//     RDTSC
//     SHLQ $32, DX
//     ORQ  DX, AX
//     MOVQ AX, ret+0(FP)
//     RET

type CycleTracker struct {
    cyclesPerNs float64
}

func NewCycleTracker() *CycleTracker {
    // Calibrate
    start := RDTSC()
    time.Sleep(10 * time.Millisecond)
    end := RDTSC()

    cycles := float64(end - start)
    ns := float64(10 * time.Millisecond)

    return &CycleTracker{
        cyclesPerNs: cycles / ns,
    }
}

func (ct *CycleTracker) CyclesToNs(cycles uint64) float64 {
    return float64(cycles) / ct.cyclesPerNs
}
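
A usage sketch for the cycle counter, kept in the same amd64-only file; op is a hypothetical stand-in for the measured call.

//go:build amd64

package latency

// CyclesOf measures a single invocation of op in CPU cycles and nanoseconds.
// RDTSC is not a serializing instruction, so treat single-shot numbers with
// care and prefer measuring short operations in a loop and averaging.
func CyclesOf(ct *CycleTracker, op func()) (cycles uint64, ns float64) {
    start := RDTSC()
    op()
    end := RDTSC()
    cycles = end - start
    return cycles, ct.CyclesToNs(cycles)
}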

Latency Testing

# Run latency benchmarks
go test -bench=BenchmarkLatency -benchtime=10s ./pkg/latency/

# With detailed statistics
go test -bench=BenchmarkLatency -benchtime=10s -count=5 ./pkg/latency/ | tee latency.txt
benchstat latency.txt

# CPU and memory profiles for deeper analysis
go test -bench=BenchmarkLatency -benchtime=10s \
    -cpuprofile=cpu.prof \
    -memprofile=mem.prof \
    ./pkg/latency/

# Linux perf for cache misses
perf stat -e L1-dcache-load-misses,L1-dcache-loads \
    go test -bench=BenchmarkOrderBook ./pkg/lx/

Results

BenchmarkOrderInsert-16         2054836    487.2 ns/op    0 B/op    0 allocs/op
BenchmarkOrderMatch-16          2189234    512.1 ns/op    0 B/op    0 allocs/op
BenchmarkOrderCancel-16         3456789    321.4 ns/op    0 B/op    0 allocs/op

Latency Distribution:
  P50:  487 ns
  P90:  892 ns
  P99:  2.3 us
  P999: 8.7 us
  Max:  45 us (GC pause)