Paradox Game Engine  v1.0.0 beta06
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Events Macros Pages
matrix.cpp
Go to the documentation of this file.
1 /*
2  NEON math library for the iPhone / iPod touch
3 
4  Copyright (c) 2009 Justin Saunders
5 
6  This software is provided 'as-is', without any express or implied warranty.
7  In no event will the authors be held liable for any damages arising
8  from the use of this software.
9  Permission is granted to anyone to use this software for any purpose,
10  including commercial applications, and to alter it and redistribute it freely,
11  subject to the following restrictions:
12 
13  1. The origin of this software must not be misrepresented; you must
14  not claim that you wrote the original software. If you use this
15  software in a product, an acknowledgment in the product documentation
16  would be appreciated but is not required.
17 
18  2. Altered source versions must be plainly marked as such, and must
19  not be misrepresented as being the original software.
20 
21  3. This notice may not be removed or altered from any source distribution.
22 */
23 
24 #include "coreconfig.h"
25 
26 // possibly needed header for iOS
27 //#ifdef __arm__
28 //#include "arm/arch.h"
29 //#endif
30 
31 #ifdef PLATFORM_ANDROID
32 
33 #ifdef __thumb__
34 #error "This file should be compiled in ARM mode only."
35 // Note in Xcode, right click file, Get Info->Build, Other compiler flags = "-marm"
36 #endif
37 
38 extern "C" {
39 
40 // Note asm is taken from reader "Jeff" post Wolfgang's blog:
41 // https://www.blogger.com/comment.g?blogID=398682525365778708&postID=7527893703750196380&page=1
42 CORE_EXPORT( void ) NEON_Matrix4Mul(const float* a, const float* b, float* output )
43 {
44 #ifdef _ARM_ARCH_7
45  __asm__ __volatile
46  (
47  // Store A & B leaving room at top of registers for result (q0-q3)
48  "vldmia %2, { q4-q7 } \n\t"
49  "vldmia %1, { q8-q11 } \n\t"
50 
51  // result = first column of B x first row of A
52  "vmul.f32 q0, q8, d8[0]\n\t"
53  "vmul.f32 q1, q8, d10[0]\n\t"
54  "vmul.f32 q2, q8, d12[0]\n\t"
55  "vmul.f32 q3, q8, d14[0]\n\t"
56 
57  // result += second column of B x second row of A
58  "vmla.f32 q0, q9, d8[1]\n\t"
59  "vmla.f32 q1, q9, d10[1]\n\t"
60  "vmla.f32 q2, q9, d12[1]\n\t"
61  "vmla.f32 q3, q9, d14[1]\n\t"
62 
63  // result += third column of B x third row of A
64  "vmla.f32 q0, q10, d9[0]\n\t"
65  "vmla.f32 q1, q10, d11[0]\n\t"
66  "vmla.f32 q2, q10, d13[0]\n\t"
67  "vmla.f32 q3, q10, d15[0]\n\t"
68 
69  // result += last column of B x last row of A
70  "vmla.f32 q0, q11, d9[1]\n\t"
71  "vmla.f32 q1, q11, d11[1]\n\t"
72  "vmla.f32 q2, q11, d13[1]\n\t"
73  "vmla.f32 q3, q11, d15[1]\n\t"
74 
75  // output = result registers
76  "vstmia %0, { q0-q3 }"
77  : // no output
78  : "r" (output), "r" (a), "r" (b) // input - note *value* of pointer doesn't change
79  : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q11" //clobber
80  );
81 #else
82  output[ 0] = a[0] * b[ 0] + a[4] * b[ 1] + a[ 8] * b[ 2] + a[12] * b[ 3];
83  output[ 1] = a[1] * b[ 0] + a[5] * b[ 1] + a[ 9] * b[ 2] + a[13] * b[ 3];
84  output[ 2] = a[2] * b[ 0] + a[6] * b[ 1] + a[10] * b[ 2] + a[14] * b[ 3];
85  output[ 3] = a[3] * b[ 0] + a[7] * b[ 1] + a[11] * b[ 2] + a[15] * b[ 3];
86  output[ 4] = a[0] * b[ 4] + a[4] * b[ 5] + a[ 8] * b[ 6] + a[12] * b[ 7];
87  output[ 5] = a[1] * b[ 4] + a[5] * b[ 5] + a[ 9] * b[ 6] + a[13] * b[ 7];
88  output[ 6] = a[2] * b[ 4] + a[6] * b[ 5] + a[10] * b[ 6] + a[14] * b[ 7];
89  output[ 7] = a[3] * b[ 4] + a[7] * b[ 5] + a[11] * b[ 6] + a[15] * b[ 7];
90  output[ 8] = a[0] * b[ 8] + a[4] * b[ 9] + a[ 8] * b[10] + a[12] * b[11];
91  output[ 9] = a[1] * b[ 8] + a[5] * b[ 9] + a[ 9] * b[10] + a[13] * b[11];
92  output[10] = a[2] * b[ 8] + a[6] * b[ 9] + a[10] * b[10] + a[14] * b[11];
93  output[11] = a[3] * b[ 8] + a[7] * b[ 9] + a[11] * b[10] + a[15] * b[11];
94  output[12] = a[0] * b[12] + a[4] * b[13] + a[ 8] * b[14] + a[12] * b[15];
95  output[13] = a[1] * b[12] + a[5] * b[13] + a[ 9] * b[14] + a[13] * b[15];
96  output[14] = a[2] * b[12] + a[6] * b[13] + a[10] * b[14] + a[14] * b[15];
97  output[15] = a[3] * b[12] + a[7] * b[13] + a[11] * b[14] + a[15] * b[15];
98 #endif
99 }
100 
101 CORE_EXPORT( void ) NEON_Matrix4Vector4Mul(const float* m, const float* v, float* output)
102 {
103 #ifdef _ARM_ARCH_7
104  __asm__ __volatile
105  (
106  // Store m & v leaving room at top of registers for result (q0)
107  "vldmia %1, {q1-q4 } \n\t" // q2-q5 = m
108  "vldmia %2, {q5} \n\t" // q1 = v
109 
110  // result = first column of A x V.x
111  "vmul.f32 q0, q1, d10[0]\n\t"
112 
113  // result += second column of A x V.y
114  "vmla.f32 q0, q2, d10[1]\n\t"
115 
116  // result += third column of A x V.z
117  "vmla.f32 q0, q3, d11[0]\n\t"
118 
119  // result += last column of A x V.w
120  "vmla.f32 q0, q4, d11[1]\n\t"
121 
122  // output = result registers
123  "vstmia %0, {q0}"
124 
125  : // no output
126  : "r" (output), "r" (m), "r" (v) // input - note *value* of pointer doesn't change
127  : "memory", "q0", "q1", "q2", "q3", "q4", "q5" //clobber
128  );
129 #else
130  output[0] = m[0] * v[0] + m[4] * v[1] + m[ 8] * v[2] + m[12] * v[3];
131  output[1] = m[1] * v[0] + m[5] * v[1] + m[ 9] * v[2] + m[13] * v[3];
132  output[2] = m[2] * v[0] + m[6] * v[1] + m[10] * v[2] + m[14] * v[3];
133  output[3] = m[3] * v[0] + m[7] * v[1] + m[11] * v[2] + m[15] * v[3];
134 #endif
135 }
136 }
137 
138 #endif
function b
function a
#define CORE_EXPORT(x)
Definition: coreconfig.h:10