OS X requires position-independent assembly code (this should be the same as compiling with the -fPIC flag). This means the register ebx must be saved before being used and restored afterwards. After looking it over, I can change most of the Homeworld assembly code to meet this requirement. Rather than `#ifdef`-ing the modified code so that it is only used on OS X, I would rather just change it, as the changes should not affect Linux. I will post diffs here before committing so that people can point out problems. In the end the assembly should optimize better, as position-independent code supposedly does. Comments are appreciated. Here's the first diff.
Code:
Index: /Users/axcess/Documents/Xcode Projects/trunk/src/Game/Matrix.c
===================================================================
--- /Users/axcess/Documents/Xcode Projects/trunk/src/Game/Matrix.c (revision 695)
+++ /Users/axcess/Documents/Xcode Projects/trunk/src/Game/Matrix.c (working copy)
@@ -445,7 +445,7 @@
pop esi
pop edi
}
-#elif defined (__GNUC__) && defined (__i386__) && !defined (_MACOSX_FIX_86)
+#elif defined (__GNUC__) && defined (__i386__) && !defined (_MACOSX_86)
/* This block of code is the modified version of the code above.
* It was safe to use upto gcc 4.1, but seems to generate a
* problem once we use -O2 with gcc 4.3 */
@@ -457,7 +457,11 @@
matrix matResult[]={{0,0,0, 0,0,0, 0,0,0}};
__asm__ __volatile__ (
+ " pushl %%edi\n"
+ " pushl %%esi\n"
" pushl %%ebp\n"
+ " pushl %%ebx\n"
+ " movl %0, %%ebx\n"
" movl $-3, %%eax\n"
" jmp mat_x_mat_l1%=\n"
" .align 4\n"
@@ -501,14 +505,17 @@
" incl %%eax\n"
" jne mat_x_mat_l2%=\n"
" fstps 32(%%ebx, %%eax, "FSIZE_STR")\n"
+ " popl %%ebx\n"
" popl %%ebp\n"
+ " popl %%esi\n"
+ " popl %%edi\n"
:
- : "b" (matResult), "c" (first), "d" (second)
- : "eax", "edi", "esi" );
+ : "r" (matResult), "c" (first), "d" (second)
+ : "eax");
memcpy(result, matResult, sizeof(struct matrix));
-#elif defined (__GNUC__) && defined (_X86_64)
+#elif defined (__GNUC__) && defined (_X86_64) || defined (_MACOSX_86)
/* This is the AMD64 version of the above code but using the
* xmm 128-bit SSE registers. It looks longer but should be a
* lot quicker as most of the operations are in parallel.
@@ -724,8 +731,10 @@
pop edi
pop esi
}
-#elif defined (__GNUC__) && defined (__i386__) && !defined (_MACOSX_FIX_86)
+#elif defined (__GNUC__) && defined (__i386__)
__asm__ __volatile__ (
+ " pushl "DEST"\n"
+ " movl %1, "DEST"\n"
" flds 0*"FSIZE_STR"("SOURCE")\n" /*s0*/
" fmuls (0+0*3)*"FSIZE_STR"("MATRIX")\n" /*a0*/
" flds 1*"FSIZE_STR"("SOURCE")\n" /*s1 a0*/
@@ -771,8 +780,9 @@
" fxch %%st(1)\n" /*d0 d2*/
" fstps 0*"FSIZE_STR"("DEST")\n" /*d2*/
" fstps 2*"FSIZE_STR"("DEST")\n"
+ " popl "DEST"\n"
:
- : "S" (vector), "b" (result), "D" (matrix) );
+ : "S" (vector), "r" (result), "D" (matrix) );
#else
result->x = matrixdot(matrix->m11,matrix->m12,matrix->m13,vector->x,vector->y,vector->z);
result->y = matrixdot(matrix->m21,matrix->m22,matrix->m23,vector->x,vector->y,vector->z);
@@ -855,8 +865,10 @@
pop edi
pop esi
}
-#elif defined (__GNUC__) && defined (__i386__) && !defined (_MACOSX_FIX_86)
+#elif defined (__GNUC__) && defined (__i386__)
__asm__ __volatile__ (
+ " pushl "DEST"\n"
+ " movl %1,"DEST"\n"
" flds 0*"FSIZE_STR"("SOURCE")\n" /*s0*/
" fmuls (0+0*3)*"FSIZE_STR"("MATRIX")\n" /*a0*/
" flds 1*"FSIZE_STR"("SOURCE")\n" /*s1 a0*/
@@ -902,8 +914,9 @@
" fxch %%st(1)\n" /*d0 d2*/
" fstps 0*"FSIZE_STR"("DEST")\n" /*d2*/
" fstps 2*"FSIZE_STR"("DEST")\n"
+ " popl "DEST"\n"
:
- : "S" (vector), "b" (result), "D" (matrix) );
+ : "S" (vector), "r" (result), "D" (matrix) );
#else
result->x = matrixdot(vector->x,vector->y,vector->z,matrix->m11,matrix->m21,matrix->m31);
result->y = matrixdot(vector->x,vector->y,vector->z,matrix->m12,matrix->m22,matrix->m32);