NEON を使って 4x4 行列の転置行列を求めるプログラムの説明。

| ch15_07/main.cpp |
#include <iostream>
#include <iomanip>
#include "MatrixF32.h"
using namespace std;
extern void Mat4x4TransposeF32_(float* m_des, const float* m_src1);
// Transposes m_src1 twice -- once with MatrixF32::Transpose and once with the
// assembly routine Mat4x4TransposeF32_ -- prints the source and both results,
// and reports a failure message if the two results differ.
void Mat4x4TestF32(MatrixF32& m_src1) {
    // The destination dimensions are the source dimensions swapped.
    const size_t des_rows = m_src1.GetNumCols();
    const size_t des_cols = m_src1.GetNumRows();

    MatrixF32 m_ref(des_rows, des_cols);    // filled by MatrixF32::Transpose
    MatrixF32 m_asm(des_rows, des_cols);    // filled by the assembly routine

    MatrixF32::Transpose(m_ref, m_src1);
    Mat4x4TransposeF32_(m_asm.Data(), m_src1.Data());

    // Fixed-point notation with one digit after the decimal point.
    cout.setf(ios::fixed, ios::floatfield);
    cout.precision(1);

    m_src1.SetOstream(12, " ");
    m_ref.SetOstream(12, " ");
    m_asm.SetOstream(12, " ");

    cout << "\nResults for Mat4x4TestF32\n"
         << "Matrix m_src1\n" << m_src1 << '\n';
    cout << "Matrix m_des1 (transpose of m_src1)\n" << m_ref << '\n';
    cout << "Matrix m_des2 (transpose of m_src1)\n" << m_asm << '\n';

    if (m_ref != m_asm) {
        cout << "\nMatrix transpose compare failed\n";
    }
}
// Builds a 4x4 test matrix whose element values encode their position
// (tens digit = row + 1, ones digit = column) and runs the transpose test on it.
void Mat4x4TestF32(void) {
    const size_t nr = 4;
    const size_t nc = 4;
    MatrixF32 m_src1(nr, nc);

    // One inner array per matrix row, in row order.
    const float src1_rows[4][4] = {
        { 10, 11, 12, 13 },
        { 20, 21, 22, 23 },
        { 30, 31, 32, 33 },
        { 40, 41, 42, 43 },
    };

    int row_index = 0;
    for (const auto& row_values : src1_rows) {
        m_src1.SetRow(row_index, row_values);
        ++row_index;
    }

    Mat4x4TestF32(m_src1);
}
// Program entry point: runs the 4x4 transpose demonstration once.
int main()
{
    Mat4x4TestF32();
    // Falling off the end of main is equivalent to returning 0.
}
|
| ch15_07/neon.cpp |
#include "Vec128.h"
void Mat4x4TransposeF32_(float* m_des, const float* m_src1) {
__asm volatile ("\n\
ld1 {v0.4s-v3.4s}, [x1] \n\
trn1 v4.4s, v0.4s, v1.4s // a0 b0 a2 b2 \n\
trn2 v5.4s, v0.4s, v1.4s // a1 b1 a3 b3 \n\
trn1 v6.4s, v2.4s, v3.4s // c0 d0 c2 d2 \n\
trn2 v7.4s, v2.4s, v3.4s // c1 d1 c3 d3 \n\
trn1 v0.2d, v4.2d, v6.2d // a0 b0 c0 d0 \n\
trn1 v1.2d, v5.2d, v7.2d // a1 b1 c1 d1 \n\
trn2 v2.2d, v4.2d, v6.2d // a2 b2 c2 d2 \n\
trn2 v3.2d, v5.2d, v7.2d // a3 b3 c3 d3 \n\
st1 {v0.4s-v3.4s}, [x0] \n\
"
:
:
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
|
| ch15_07/main.cpp の実行例 |
arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O -S neon.cpp
arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet Ch15_07 % ./a.out
Results for Mat4x4TestF32
Matrix m_src1
10.0 11.0 12.0 13.0
20.0 21.0 22.0 23.0
30.0 31.0 32.0 33.0
40.0 41.0 42.0 43.0
Matrix m_des1 (transpose of m_src1)
10.0 20.0 30.0 40.0
11.0 21.0 31.0 41.0
12.0 22.0 32.0 42.0
13.0 23.0 33.0 43.0
Matrix m_des2 (transpose of m_src1)
10.0 20.0 30.0 40.0
11.0 21.0 31.0 41.0
12.0 22.0 32.0 42.0
13.0 23.0 33.0 43.0
arm64@manet Ch15_07 %
|