SRTP AES Optimization

Nils L. Corneliusen
2 April 2010

An updated version of this article is available here: SRTP AES Optimization Revisited

The basics - code size reduction

SRTP AES encryption can be very time consuming on CPU challenged systems. My target system has very little cache, ram and whatnot, so things will be slow. On the other hand, any code size reduction yields good results.

If you're using OpenSSL, you'll probably end up with an encryption/decryption routine looking something like the code below. Caveats are: Aligned blocks, size multiple of 16, big endian.

/*
 * AES-128 counter-mode keystream XOR for SRTP payloads (encrypt and decrypt
 * are the same operation in CTR mode).
 *
 * src    - aligned input payload (read 16 bytes per block)
 * srclen - payload length in bytes; rounded UP to whole 16-byte blocks,
 *          so src/dst must have room for the rounded size
 * dst    - aligned output buffer (may alias src)
 * aeskey - expanded AES-128 key schedule
 * iv     - 128-bit IV/counter; iv[3] (the low word, big endian) is
 *          incremented in place per block, as the SRTP spec reserves
 *          the least significant bits as the block counter
 */
void srtp_encrypt_decrypt( uint32_t *src, uint32_t srclen, uint32_t *dst,
                           const AES_KEY *aeskey, uint32_t *iv )
{
    uint32_t blocks;
    uint32_t tmp[4];

    /* round up to whole 16-byte AES blocks; nothing to do for empty input
     * (without this guard the do/while post-decrement would underflow) */
    blocks = (srclen+0x0f)>>4;
    if( blocks == 0 )
        return;

    do {

        /* keystream block = AES(counter) */
        AES_encrypt( (uint8_t *)iv, (uint8_t *)tmp, aeskey );

        dst[0] = src[0] ^ tmp[0];
        dst[1] = src[1] ^ tmp[1];
        dst[2] = src[2] ^ tmp[2];
        dst[3] = src[3] ^ tmp[3];

        /* advance the 16-bit counter in the IV's low word */
        iv[3]++;

        /* NOTE(review): the original also did "tmp += 4", which does not
         * compile (tmp is an array) and as a pointer would run past the
         * 16-byte keystream buffer. tmp is reused each iteration; only
         * src/dst advance. */
        dst += 4; src += 4;

    } while( --blocks );

}

A basic first step would be to chop off all the unnecessary stuff from the AES_encrypt() function. As mentioned above, code size reduction is good on my target. Since data are aligned and we only use 128 bit encryption, there's lots to remove, the result being the code below:

/*
 * AES-128 single-block encryption, stripped down from OpenSSL's
 * AES_encrypt(): fixed at 10 rounds (128-bit keys only), word-aligned
 * big-endian input/output, no byte loads/stores. Te0..Te3 are the
 * standard OpenSSL T-tables (S-box combined with MixColumns).
 *
 * in  - 16-byte plaintext block as four big-endian 32-bit words
 * out - 16-byte ciphertext block, same layout
 * key - expanded key schedule; rd_key holds 44 round-key words
 *
 * NOTE(review): locals use uint32_t rather than OpenSSL's internal u32
 * typedef so this file is self-consistent and does not depend on
 * aes_locl.h.
 */
void AES128_encrypt_aligned( const uint32_t *in, uint32_t *out, const AES_KEY *key )
{
    const uint32_t *rk;
    uint32_t s0, s1, s2, s3, t0, t1, t2, t3;

    rk = key->rd_key;

    /* round 0: whitening — XOR the plaintext with the first round key */
    s0 = in[0] ^ rk[0];
    s1 = in[1] ^ rk[1];
    s2 = in[2] ^ rk[2];
    s3 = in[3] ^ rk[3];
    /* rounds 1..9: four T-table lookups + round key per output word,
     * ping-ponging between s* and t* to avoid copies */
    /* round 1: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[ 4];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[ 5];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[ 6];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[ 7];
    /* round 2: */
    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[ 8];
    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[ 9];
    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[10];
    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[11];
    /* round 3: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[12];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[13];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[14];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[15];
    /* round 4: */
    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[16];
    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[17];
    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[18];
    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[19];
    /* round 5: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[20];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[21];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[22];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[23];
    /* round 6: */
    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[24];
    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[25];
    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[26];
    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[27];
    /* round 7: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[28];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[29];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[30];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[31];
    /* round 8: */
    s0 = Te0[t0 >> 24] ^ Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >>  8) & 0xff] ^ Te3[t3 & 0xff] ^ rk[32];
    s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >>  8) & 0xff] ^ Te3[t0 & 0xff] ^ rk[33];
    s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff] ^ Te2[(t0 >>  8) & 0xff] ^ Te3[t1 & 0xff] ^ rk[34];
    s3 = Te0[t3 >> 24] ^ Te1[(t0 >> 16) & 0xff] ^ Te2[(t1 >>  8) & 0xff] ^ Te3[t2 & 0xff] ^ rk[35];
    /* round 9: */
    t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >>  8) & 0xff] ^ Te3[s3 & 0xff] ^ rk[36];
    t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >>  8) & 0xff] ^ Te3[s0 & 0xff] ^ rk[37];
    t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >>  8) & 0xff] ^ Te3[s1 & 0xff] ^ rk[38];
    t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >>  8) & 0xff] ^ Te3[s2 & 0xff] ^ rk[39];

    /* final round: SubBytes/ShiftRows only (no MixColumns), so each byte
     * is masked out of the T-tables instead of using a separate S-box */
    out[0] = (Te2[(t0 >> 24)       ] & 0xff000000) ^
             (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
             (Te0[(t2 >>  8) & 0xff] & 0x0000ff00) ^
             (Te1[(t3      ) & 0xff] & 0x000000ff) ^ rk[40];
    out[1] = (Te2[(t1 >> 24)       ] & 0xff000000) ^
             (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
             (Te0[(t3 >>  8) & 0xff] & 0x0000ff00) ^
             (Te1[(t0      ) & 0xff] & 0x000000ff) ^ rk[41];
    out[2] = (Te2[(t2 >> 24)       ] & 0xff000000) ^
             (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
             (Te0[(t0 >>  8) & 0xff] & 0x0000ff00) ^
             (Te1[(t1      ) & 0xff] & 0x000000ff) ^ rk[42];
    out[3] = (Te2[(t3 >> 24)       ] & 0xff000000) ^
             (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
             (Te0[(t1 >>  8) & 0xff] & 0x0000ff00) ^
             (Te1[(t2      ) & 0xff] & 0x000000ff) ^ rk[43];
}

Just this simple step gives a marked improvement on my target, probably not so on modern systems with more cache.

With the basics covered, we can move on to the interesting stuff.

The interesting stuff - AES XOR reduction

This little nugget is present in the SRTP AES specification:

4.1.1 AES in Counter Mode (...) Note that the initial value, IV, is fixed for each packet and is formed by "reserving" 16 zeros in the least significant bits for the purpose of the counter. (...)

So, back in the real world where data sizes are reasonable, this translates into something like this: Assuming packet size <= 4096 bytes, the IV is constant for each block except for the last byte which goes 0..255.

To take advantage of that, we have to merge the code from OpenSSL into the srtp_encrypt_decrypt() function and precalculate based on the constant parts.

/*
 * SRTP AES-128 counter-mode keystream XOR with rounds 0-2 precalculated.
 *
 * Per the SRTP spec the IV is fixed for each packet except for the
 * counter in the least significant bits, so for packets <= 4096 bytes
 * only the last IV byte varies (0..255). Everything in rounds 0-2 that
 * does not depend on that byte is hoisted out of the block loop,
 * reducing those rounds from 36 XORs to 6 per block.
 *
 * src    - aligned input payload
 * srclen - payload length in bytes, rounded up to 16-byte blocks;
 *          must be <= 4096 so the 8-bit counter never wraps
 * dst    - aligned output buffer (may alias src)
 * aeskey - expanded AES-128 key schedule
 * iv     - per-packet IV; not modified (counter is kept in k3ctr)
 */
void srtp_encrypt_decrypt( uint32_t *src, uint32_t srclen, uint32_t *dst,
                           const AES_KEY *aeskey, const uint32_t *iv )
{
    uint32_t blocks;
    const uint32_t *key;
    uint32_t r1t0;
    uint32_t r2s0,r2s1,r2s2,r2s3;
    uint32_t s0, s1, s2, s3;
    uint32_t t0, t1, t2, t3;
    uint8_t k3, k3ctr;

    blocks = (srclen+0x0f)>>4;
    if( blocks == 0 )
        return;

    /* BUGFIX: the original read "key = key->rd_key", dereferencing the
     * uninitialized local; the round keys come from aeskey. */
    key = aeskey->rd_key;

    /* Precalculate constant parts.
     * Round 0 is fully constant except for the counter byte in s3. */
      s0 = HTONL(iv[0]) ^ key[0];
      s1 = HTONL(iv[1]) ^ key[1];
      s2 = HTONL(iv[2]) ^ key[2];
      s3 = HTONL(iv[3]) ^ key[3];

    /* Round 1: only t0 depends on the counter byte (through Te3[s3&0xff]),
     * so r1t0 holds its constant part; t1..t3 are fully constant. */
    r1t0 = Te0[s0 >> 24] ^ Te1[(s1 >> 16) & 0xff] ^ Te2[(s2 >> 8) & 0xff]                  ^ key[ 4];
      t1 = Te0[s1 >> 24] ^ Te1[(s2 >> 16) & 0xff] ^ Te2[(s3 >> 8) & 0xff] ^ Te3[s0 & 0xff] ^ key[ 5];
      t2 = Te0[s2 >> 24] ^ Te1[(s3 >> 16) & 0xff] ^ Te2[(s0 >> 8) & 0xff] ^ Te3[s1 & 0xff] ^ key[ 6];
      t3 = Te0[s3 >> 24] ^ Te1[(s0 >> 16) & 0xff] ^ Te2[(s1 >> 8) & 0xff] ^ Te3[s2 & 0xff] ^ key[ 7];

    /* Round 2 constant parts: each r2s* omits exactly the one lookup that
     * uses a byte of t0 (the only variable word), added back in the loop. */
    r2s0 =                 Te1[(t1 >> 16) & 0xff] ^ Te2[(t2 >> 8) & 0xff] ^ Te3[t3 & 0xff] ^ key[ 8];
    r2s1 = Te0[t1 >> 24] ^ Te1[(t2 >> 16) & 0xff] ^ Te2[(t3 >> 8) & 0xff]                  ^ key[ 9];
    r2s2 = Te0[t2 >> 24] ^ Te1[(t3 >> 16) & 0xff]                         ^ Te3[t1 & 0xff] ^ key[10];
    r2s3 = Te0[t3 >> 24]                          ^ Te2[(t1 >> 8) & 0xff] ^ Te3[t2 & 0xff] ^ key[11];

    /* Counter lives in the low byte only (packet <= 4096 bytes == 256
     * blocks), so an 8-bit counter XORed with the key byte suffices. */
    k3 = (uint8_t)key[3];
    k3ctr = 0;

    do {

        // round 0/1
        //    s3 = iv[3] ^ key[3];
        //    t0 = Te3[s3&0xff] ^ r1t0;
        // => t0 = Te3[ iv[3] ^ key[3]) & 0xff] ^ r1t0;
        // which reduces to:
        t0 = Te3[k3ctr ^ k3] ^ r1t0;

        // round 2: add the t0-dependent lookup back onto each precalc
        s0 = Te0[ t0   >> 24        ] ^ r2s0;
        s1 = Te3[ t0          & 0xff] ^ r2s1;
        s2 = Te2[(t0   >>  8) & 0xff] ^ r2s2;
        s3 = Te1[(t0   >> 16) & 0xff] ^ r2s3;

        //
        // round 3-9 as specified above here
        //

        // final round folded directly into the keystream XOR
        dst[0] = src[0] ^ ((Te2[(t0 >> 24)       ] & 0xff000000) ^
                           (Te3[(t1 >> 16) & 0xff] & 0x00ff0000) ^
                           (Te0[(t2 >>  8) & 0xff] & 0x0000ff00) ^
                           (Te1[(t3      ) & 0xff] & 0x000000ff) ^ key[40]);

        dst[1] = src[1] ^ ((Te2[(t1 >> 24)       ] & 0xff000000) ^
                           (Te3[(t2 >> 16) & 0xff] & 0x00ff0000) ^
                           (Te0[(t3 >>  8) & 0xff] & 0x0000ff00) ^
                           (Te1[(t0      ) & 0xff] & 0x000000ff) ^ key[41]);

        dst[2] = src[2] ^ ((Te2[(t2 >> 24)       ] & 0xff000000) ^
                           (Te3[(t3 >> 16) & 0xff] & 0x00ff0000) ^
                           (Te0[(t0 >>  8) & 0xff] & 0x0000ff00) ^
                           (Te1[(t1      ) & 0xff] & 0x000000ff) ^ key[42]);

        dst[3] = src[3] ^ ((Te2[(t3 >> 24)       ] & 0xff000000) ^
                           (Te3[(t0 >> 16) & 0xff] & 0x00ff0000) ^
                           (Te0[(t1 >>  8) & 0xff] & 0x0000ff00) ^
                           (Te1[(t2      ) & 0xff] & 0x000000ff) ^ key[43]);

        src += 4; dst += 4;
        k3ctr++;

    } while( --blocks );

}

Here we reduce round 0-2 from 36 XORs to just 6. And more code size reduction, so I see around 29% faster execution on my target with 1200 byte packets in 10 packet clusters.

Send me a mail if you have any input on this, I'm sure you can figure out my mail address from the front page.


www.ignorantus.com