Tuesday, September 22, 2009

Unfolding code compiled with GCC with full optimization flags turned on

Original code:

#include <stdio.h>
#include <math.h>


/* File-scope inputs to the computation. main() assigns both, and the generated
   listing stores to each of them (fstl _a / fstl _b). */
double a, b;

/* Square of an expression. The argument is expanded twice, so SQR(sin(a))
   textually contains two sin() calls; the listing below nevertheless contains
   a single fsin and a single fcos. Do not pass arguments with side effects. */
#define SQR(a)  ((a)*(a))

/* Sets a = b = 0.5, computes sqrt(sin(a)^2 + cos(b)^2) and prints it.
   Always returns 0. */
int main()
{
    double sum;

    /* Both inputs get the same constant. */
    a = 0.5;
    b = 0.5;
    /* sin^2 + cos^2 of the same angle: mathematically 1, computed here in
       floating point rather than folded by hand. */
    sum = sqrt(SQR(sin(a)) + SQR(cos(b)));

    printf("sum = %f\n", sum);
    return 0;
}

CFLAGS is set to "-mtune=nocona -mfpmath=sse -msse3 -O3 -ffast-math"

The source code above, after being compiled with GCC (e.g., gcc -S $CFLAGS ssetest.c), gives:

    # GCC output for main(): 32-bit x86, AT&T syntax (op src, dst).
    # The .def/.scl/.type/.endef directives and the leading-underscore symbol
    # names indicate a COFF (Windows-style) target.
    .file   "ssetest.c"
    .def    ___main;    .scl    2;  .type   32; .endef
    .section .rdata,"dr"
LC1:
    # printf format string; \12 is octal for newline, \0 terminates it.
    .ascii "sum = %f\12\0"
    .align 8
LC2:
    # IEEE-754 double 0.5, low word first: 1071644672 = 0x3FE00000,
    # so the 64-bit pattern is 0x3FE0000000000000.
    .long   0
    .long   1071644672
    .text
.globl _main
    .def    _main;  .scl    2;  .type   32; .endef
_main:
    pushl   %ebp
    movl    $16, %eax               # presumably the size argument for __alloca below - TODO confirm
    movl    %esp, %ebp              # establish frame pointer
    subl    $24, %esp               # room for the temp at -8(%ebp) and the printf arguments
    andl    $-16, %esp              # round the stack pointer down to a 16-byte boundary
    call    __alloca                # runtime helper, not defined in this listing
    call    ___main                 # runtime startup hook, not defined in this listing
    fldl    LC2                     # x87: st0 = 0.5
    movl    $LC1, (%esp)            # printf arg 1: address of the format string
    fld     %st(0)                  # duplicate: st0 = 0.5, st1 = 0.5
    fstl    _a                      # a = 0.5 (store without popping)
    fstl    _b                      # b = 0.5 (store without popping)
    fxch    %st(1)                  # swap st0/st1 (both still 0.5)
    fsin                            # st0 = sin(0.5), st1 = 0.5
    fxch    %st(1)                  # st0 = 0.5, st1 = sin
    fcos                            # st0 = cos(0.5), st1 = sin
    fxch    %st(1)                  # st0 = sin, st1 = cos
    # x87 results are moved to SSE registers through the stack slot at
    # -8(%ebp): store-and-pop from the x87 stack, then reload with movsd.
    fstpl   -8(%ebp)                # temp = sin; pop, leaving st0 = cos
    movsd   -8(%ebp), %xmm2         # xmm2 = sin
    fstpl   -8(%ebp)                # temp = cos; pop the second of the two pushed values
    movsd   -8(%ebp), %xmm0         # xmm0 = cos
    mulsd   %xmm2, %xmm2            # xmm2 = sin^2
    mulsd   %xmm0, %xmm0            # xmm0 = cos^2
    addsd   %xmm0, %xmm2            # xmm2 = sin^2 + cos^2
    sqrtsd  %xmm2, %xmm1            # xmm1 = sqrt(sin^2 + cos^2) = sum
    movsd   %xmm1, 4(%esp)          # printf arg 2: the 8-byte double right after the format pointer
    call    _printf
    xorl    %eax, %eax              # return value 0
    leave
    ret
    # Common (uninitialized) storage for the globals a and b. 16 bytes are
    # reserved for each; the trailing "# 8" is from the original compiler
    # output and presumably records the 8-byte size of a double.
    .comm   _a, 16   # 8
    .comm   _b, 16   # 8
    .def    _printf;    .scl    3;  .type   32; .endef

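The code is very efficient.  fsin/fcos compute the sine and cosine in CPU hardware (no emulation).  It also utilizes the SSE registers (xmm0, xmm1, xmm2) — not MMX registers, which are mm0–mm7 — for the multiplications, the addition and the square root, so memory movement is kept to a minimum: the only round trip through memory is the store/reload via -8(%ebp) that moves each x87 result into an SSE register.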

No comments:

Post a Comment