Al Green
Al Green's Brother
Home -- News -- Articles -- Books -- Source Code -- Videos -- Xmas -- LinkedIn -- About

SRTP AES Optimization

Nils Liaaen Corneliusen
2 April 2010

An updated version of this article is available here: SRTP AES Optimization Revisited

The basics - code size reduction

SRTP AES encryption can be very time consuming on CPU challenged systems. My target system has very little cache, ram, and whatnot, so things will be slow. On the other hand, any code size reduction yields good results.

If you're using OpenSSL, you'll probably end up with an encryption/decryption routine looking something like the code below. Caveats are: Aligned blocks, size multiple of 16, big endian.

void srtp_encrypt_decrypt( uint32_t *src, uint32_t srclen, uint32_t *dst,
                           const AES_KEY *aeskey, uint32_t *iv )
{
    uint32_t blocks;
    uint32_t tmp[4];

    blocks = (srclen+0x0f)>>4;

    do {

        AES_encrypt( (uint8_t *)iv, (uint8_t *)tmp, aeskey );

        dst[0] = src[0] ^ tmp[0];
        dst[1] = src[1] ^ tmp[1];
        dst[2] = src[2] ^ tmp[2];
        dst[3] = src[3] ^ tmp[3];

        iv[3]++;

        dst += 4; src += 4; tmp += 4;

    } while( --blocks );

}

A basic first step would be to chop off all the unnecessary stuff from the AES_encrypt() function. As mentioned above, code size reduction is good on my target. Since data are aligned and we only use 128 bit encryption, there's lots to remove, the result being the code below:

void AES128_encrypt_aligned( const uint32_t *in, uint32_t *out, const AES_KEY *key )
{
    const u32 *rk;
    u32 s0, s1, s2, s3, t0, t1, t2, t3;

    rk = key->rd_key;

    /* round 0 */
    s0 = in[0] ^ rk[0];
    s1 = in[1] ^ rk[1];
    s2 = in[2] ^ rk[2];
    s3 = in[3] ^ rk[3];
    /* round 1: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
    /* round 2: */
    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
    /* round 3: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
    /* round 4: */
    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
    /* round 5: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
    /* round 6: */
    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
    /* round 7: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
    /* round 8: */
    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
    /* round 9: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];

    out[0] = (Te2[(t0 >> 24)       ] & 0xff000000) ^
             (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
             (Te0[(t2 >>  8) & 0xff] & 0x0000ff00) ^
             (Te1[(t3      ) & 0xff] & 0x000000ff) ^ rk[40];
    out[1] = (Te2[(t1 >> 24)       ] & 0xff000000) ^
             (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
             (Te0[(t3 >>  8) & 0xff] & 0x0000ff00) ^
             (Te1[(t0      ) & 0xff] & 0x000000ff) ^ rk[41];
    out[2] = (Te2[(t2 >> 24)       ] & 0xff000000) ^
             (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
             (Te0[(t0 >>  8) & 0xff] & 0x0000ff00) ^
             (Te1[(t1      ) & 0xff] & 0x000000ff) ^ rk[42];
    out[3] = (Te2[(t3 >> 24)       ] & 0xff000000) ^
             (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
             (Te0[(t1 >>  8) & 0xff] & 0x0000ff00) ^
             (Te1[(t2      ) & 0xff] & 0x000000ff) ^ rk[43];
}

Just this simple step gives a marked improvement on my target, probably not so on modern systems with more cache.

With the basics covered, we can move on to the interesting stuff.

The interesting stuff - AES XOR reduction

This little nugget is present in the SRTP AES specification:

4.1.1 AES in Counter Mode (...) Note that the initial value, IV, is fixed for each packet and is formed by "reserving" 16 zeros in the least significant bits for the purpose of the counter. (...)

So, back in the real world where data sizes are reasonable, this translates into something like this: Assuming packet size <= 4096 bytes, the IV is constant for each block except for the last byte which goes 0..255.

To take advantage of that, we have to merge the code from OpenSSL into the srtp_encrypt_decypt() function and precalculate based on the constant parts.

void srtp_encrypt_decrypt( uint32_t *src, uint32_t srclen, uint32_t *dst,
                           const AES_KEY *aeskey, const uint32_t *iv )
{
    uint32_t blocks;
    const uint32_t *key;
    uint32_t r1t0;
    uint32_t r2s0,r2s1,r2s2,r2s3;
    uint32_t s0, s1, s2, s3;
    uint32_t t0, t1, t2, t3;
    uint8_t k3, k3ctr;

    blocks = (srclen+0x0f)>>4;
    key = key->rd_key;

    // Precalculate constant parts
      s0 = HTONL(iv[0]) ^ key[0];
      s1 = HTONL(iv[1]) ^ key[1];
      s2 = HTONL(iv[2]) ^ key[2];
      s3 = HTONL(iv[3]) ^ key[3];

    r1t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff]                  ^ key[ 4];
      t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ key[ 5];
      t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ key[ 6];
      t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ key[ 7];

    r2s0 =                 Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ key[ 8];
    r2s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff]                  ^ key[ 9];
    r2s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff]                         ^ Te3[t1 & 0xff] ^ key[10];
    r2s3 = Te0[t3 >> 24]                          ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ key[11];

    k3 = (uint8_t)key[3];
    k3ctr = 0;

    do {

        // round 0/1
        //    s3 = iv[3] ^ key[3];
        //    t0 = Te3[s3&0xff] ^ r1t0;
        // => t0 = Te3[ iv[3] ^ key[3]) & 0xff] ^ r1t0;
        // which reduces to:
        t0 = Te3[k3ctr ^ k3] ^ r1t0;

        // round 2
        s0 = Te0[ t0   >> 24        ] ^ r2s0;
        s1 = Te3[ t0          & 0xff] ^ r2s1;
        s2 = Te2[(t0   >>  8) & 0xff] ^ r2s2;
        s3 = Te1[(t0   >> 16) & 0xff] ^ r2s3;

        //
        // round 3-9 as specified above here
        //
        
        dst[0] = src[0] ^ ((Te2[(t0 >> 24)       ] & 0xff000000) ^
                           (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
                           (Te0[(t2 >>  8) & 0xff] & 0x0000ff00) ^
                           (Te1[(t3      ) & 0xff] & 0x000000ff) ^ key[40]);

        dst[1] = src[1] ^ ((Te2[(t1 >> 24)       ] & 0xff000000) ^
                           (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
                           (Te0[(t3 >>  8) & 0xff] & 0x0000ff00) ^
                           (Te1[(t0      ) & 0xff] & 0x000000ff) ^ key[41]);

        dst[2] = src[2] ^ ((Te2[(t2 >> 24)       ] & 0xff000000) ^
                           (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
                           (Te0[(t0 >>  8) & 0xff] & 0x0000ff00) ^
                           (Te1[(t1      ) & 0xff] & 0x000000ff) ^ key[42]);

        dst[3] = src[3] ^ ((Te2[(t3 >> 24)       ] & 0xff000000) ^
                           (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
                           (Te0[(t1 >>  8) & 0xff] & 0x0000ff00) ^
                           (Te1[(t2      ) & 0xff] & 0x000000ff) ^ key[43]);

        src += 4; dst += 4;
        k3ctr++;

    } while( --blocks );

}

Here we reduce round 0-2 from 36 XORs to just 6. And more code size reduction, so I see around 29% faster execution on my target with 1200 byte packets in 10 packet clusters.

Source code: srtp_aes_2010.c

Comments are always appreciated. Contact information is available here.

Article Licensing Information

This article is published under the following license: Attribution-NoDerivatives 4.0 International (CC BY-ND 4.0)
Short summary: You may copy and redistribute the material in any medium or format for any purpose, even commercially. You must give appropriate credit, provide a link to the license, and indicate if changes were made. If you remix, transform, or build upon the material, you may not distribute the modified material.


Ekte Programmering Norwegian flag
American flag Real Programming
Ignorantus AS