NEON を使って4x4行列の乗算を行うプログラムの説明。

| ch15_06/main.cpp |
#include <iostream>
#include <iomanip>
#include "MatrixF32.h"
using namespace std;
extern void Mat4x4MulF32_(float *m_des, const float* m_src1, const float *m_src2);
// Multiplies m_src1 by m_src2 twice -- once with MatrixF32::Mul4x4 and once
// with the assembly routine Mat4x4MulF32_ -- then prints both operands and
// both products and reports when the two products differ.
//
// m_src1, m_src2: operand matrices. They are taken by non-const reference
//                 because their stream formatting is adjusted via SetOstream.
void Mat4x4MulF32(MatrixF32& m_src1, MatrixF32& m_src2) {
    const size_t num_rows = m_src1.GetNumRows();
    const size_t num_cols = m_src2.GetNumCols();

    // Product computed by the MatrixF32 class itself.
    MatrixF32 m_des1(num_rows, num_cols);
    MatrixF32::Mul4x4(m_des1, m_src1, m_src2);

    // Product computed by the assembly routine, written into m_des2's buffer.
    MatrixF32 m_des2(num_rows, num_cols);
    Mat4x4MulF32_(m_des2.Data(), m_src1.Data(), m_src2.Data());

    std::cout << std::fixed << std::setprecision(1);

    // Apply the same stream settings to every matrix that gets printed.
    MatrixF32* const printed[] = { &m_src1, &m_src2, &m_des1, &m_des2 };
    for (MatrixF32* mat : printed) {
        mat->SetOstream(12, " ");
    }

    // Writes a title line, the matrix, and a trailing newline.
    auto show = [](const char* title, MatrixF32& mat) {
        std::cout << title << mat << '\n';
    };

    std::cout << "\nResults for Mat4x4MulF32\n";
    show("Matrix m_src1\n", m_src1);
    show("Matrix m_src2\n", m_src2);
    show("Matrix m_des1\n", m_des1);
    show("Matrix m_des2\n", m_des2);

    const bool products_differ = (m_des1 != m_des2);
    if (products_differ) {
        std::cout << "\nMatrix compare failed - Mat4x4MulF32\n";
    }
}
// Builds two fixed 4x4 test matrices and hands them to Mat4x4MulF32, which
// multiplies them and prints the results.
void Mat4x4MulF32Test(void) {
    const size_t dim = 4;

    // Row values for the left-hand operand.
    const float src1_rows[dim][dim] = {
        { 10, 11, 12, 13 },
        { 20, 21, 22, 23 },
        { 30, 31, 32, 33 },
        { 40, 41, 42, 43 },
    };

    // Row values for the right-hand operand.
    const float src2_rows[dim][dim] = {
        { 100, 101, 102, 103 },
        { 200, 201, 202, 203 },
        { 300, 301, 302, 303 },
        { 400, 401, 402, 403 },
    };

    MatrixF32 m_src1(dim, dim);
    MatrixF32 m_src2(dim, dim);

    for (size_t r = 0; r < dim; ++r) {
        m_src1.SetRow(r, src1_rows[r]);
    }
    for (size_t r = 0; r < dim; ++r) {
        m_src2.SetRow(r, src2_rows[r]);
    }

    Mat4x4MulF32(m_src1, m_src2);
}
// Program entry point: runs the 4x4 matrix multiplication demo once and
// exits with status 0.
int main(void)
{
    Mat4x4MulF32Test();

    return 0;
}
|
| ch15_06/neon.cpp |
#include "Vec128.h"
/**
 * Computes the 4x4 single-precision matrix product m_des = m_src1 * m_src2.
 *
 * Each pointer refers to 16 contiguous floats holding a 4x4 matrix stored
 * row by row. Row i of the result is
 *     src1[i][0]*src2.row0 + src1[i][1]*src2.row1
 *   + src1[i][2]*src2.row2 + src1[i][3]*src2.row3.
 *
 * Both source matrices are read completely before anything is stored, so
 * m_des may refer to the same storage as m_src1 or m_src2.
 *
 * @param m_des  Destination, 16 floats.
 * @param m_src1 Left-hand operand, 16 floats.
 * @param m_src2 Right-hand operand, 16 floats.
 */
void Mat4x4MulF32_(float *m_des, const float* m_src1, const float *m_src2) {
#if defined(__aarch64__)
    // The pointers are bound through asm operands instead of naming x0-x2
    // directly: the compiler gives no guarantee that the arguments are still
    // in their ABI registers when the asm statement starts (for example after
    // inlining). The "memory" clobber tells the compiler that the statement
    // reads and writes memory it cannot see through the operands.
    __asm volatile(
        "ld1 {v0.4s-v3.4s}, [%[src1]]        \n\t"  // v0..v3 = rows 0..3 of m_src1
        "ld1 {v4.4s-v7.4s}, [%[src2]]        \n\t"  // v4..v7 = rows 0..3 of m_src2

        // Row 0
        "fmul v16.4s, v4.4s, v0.s[0]         \n\t"  // v16  = v4 * v0.lane0
        "fmla v16.4s, v5.4s, v0.s[1]         \n\t"  // v16 += v5 * v0.lane1
        "fmla v16.4s, v6.4s, v0.s[2]         \n\t"  // v16 += v6 * v0.lane2
        "fmla v16.4s, v7.4s, v0.s[3]         \n\t"  // v16 += v7 * v0.lane3
        "st1 {v16.4s}, [%[des]], 16          \n\t"  // store row 0, advance 16 bytes

        // Row 1
        "fmul v17.4s, v4.4s, v1.s[0]         \n\t"  // v17  = v4 * v1.lane0
        "fmla v17.4s, v5.4s, v1.s[1]         \n\t"  // v17 += v5 * v1.lane1
        "fmla v17.4s, v6.4s, v1.s[2]         \n\t"  // v17 += v6 * v1.lane2
        "fmla v17.4s, v7.4s, v1.s[3]         \n\t"  // v17 += v7 * v1.lane3
        "st1 {v17.4s}, [%[des]], 16          \n\t"  // store row 1, advance 16 bytes

        // Row 2
        "fmul v18.4s, v4.4s, v2.s[0]         \n\t"  // v18  = v4 * v2.lane0
        "fmla v18.4s, v5.4s, v2.s[1]         \n\t"  // v18 += v5 * v2.lane1
        "fmla v18.4s, v6.4s, v2.s[2]         \n\t"  // v18 += v6 * v2.lane2
        "fmla v18.4s, v7.4s, v2.s[3]         \n\t"  // v18 += v7 * v2.lane3
        "st1 {v18.4s}, [%[des]], 16          \n\t"  // store row 2, advance 16 bytes

        // Row 3
        "fmul v19.4s, v4.4s, v3.s[0]         \n\t"  // v19  = v4 * v3.lane0
        "fmla v19.4s, v5.4s, v3.s[1]         \n\t"  // v19 += v5 * v3.lane1
        "fmla v19.4s, v6.4s, v3.s[2]         \n\t"  // v19 += v6 * v3.lane2
        "fmla v19.4s, v7.4s, v3.s[3]         \n\t"  // v19 += v7 * v3.lane3
        "st1 {v19.4s}, [%[des]], 16          \n\t"  // store row 3, advance 16 bytes
        // m_des is read-write because the post-indexed stores advance it.
        // Both loads finish before the first store modifies it, so it needs
        // no early-clobber marker.
        : [des] "+r" (m_des)
        : [src1] "r" (m_src1), [src2] "r" (m_src2)
        : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
          "v16", "v17", "v18", "v19", "memory"
    );
#else
    // Portable fallback for targets without AArch64 NEON. The product is
    // accumulated in a temporary first so that, as in the assembly path,
    // m_des may alias either source matrix.
    float product[16];
    for (int row = 0; row < 4; ++row) {
        for (int col = 0; col < 4; ++col) {
            float sum = m_src1[row * 4] * m_src2[col];
            for (int k = 1; k < 4; ++k) {
                sum += m_src1[row * 4 + k] * m_src2[k * 4 + col];
            }
            product[row * 4 + col] = sum;
        }
    }
    for (int i = 0; i < 16; ++i) {
        m_des[i] = product[i];
    }
#endif
}
|
| ch15_06/main.cpp の実行例 |
arm64@manet Ch15_06 % g++ -I.. -std=c++11 -O -S neon.cpp
arm64@manet Ch15_06 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet Ch15_06 % ./a.out
Results for Mat4x4MulF32
Matrix m_src1
10.0 11.0 12.0 13.0
20.0 21.0 22.0 23.0
30.0 31.0 32.0 33.0
40.0 41.0 42.0 43.0
Matrix m_src2
100.0 101.0 102.0 103.0
200.0 201.0 202.0 203.0
300.0 301.0 302.0 303.0
400.0 401.0 402.0 403.0
Matrix m_des1
12000.0 12046.0 12092.0 12138.0
22000.0 22086.0 22172.0 22258.0
32000.0 32126.0 32252.0 32378.0
42000.0 42166.0 42332.0 42498.0
Matrix m_des2
12000.0 12046.0 12092.0 12138.0
22000.0 22086.0 22172.0 22258.0
32000.0 32126.0 32252.0 32378.0
42000.0 42166.0 42332.0 42498.0
arm64@manet Ch15_06 %
|