ch15_01/main.cpp |
#include <iostream> #include <cmath> #include "Vec128.h" using namespace std; extern void PackedMathF32_(Vec128 x[9], const Vec128& a, const Vec128& b); extern void PackedMathF64_(Vec128 x[9], const Vec128& a, const Vec128& b); void PackedMathF32(void) { const char nl = '\n'; Vec128 x[9], a, b; a.m_F32[0] = 36.0f; b.m_F32[0] = -1.0f / 9.0f; a.m_F32[1] = 1.0f / 32.0f; b.m_F32[1] = 64.0f; a.m_F32[2] = 2.0f; b.m_F32[2] = -0.0625f; a.m_F32[3] = 42.0f; b.m_F32[3] = 8.666667f; PackedMathF32_(x, a, b); cout << ("\nResults for PackedMathF32_\n"); cout << "a: " << a.ToStringF32() << nl; cout << "b: " << b.ToStringF32() << nl; cout << nl; cout << "fadd: " << x[0].ToStringF32() << nl; cout << "fsub: " << x[1].ToStringF32() << nl; cout << "fmul: " << x[2].ToStringF32() << nl; cout << "fdiv: " << x[3].ToStringF32() << nl; cout << "fabs(a): " << x[4].ToStringF32() << nl; cout << "fneg(b): " << x[5].ToStringF32() << nl; cout << "fminnm: " << x[6].ToStringF32() << nl; cout << "fmaxnm: " << x[7].ToStringF32() << nl; cout << "fsqrt(a): " << x[8].ToStringF32() << nl; } void PackedMathF64(void) { const char nl = '\n'; Vec128 x[9], a, b; a.m_F64[0] = 36.0; b.m_F64[0] = -M_SQRT2; a.m_F64[1] = M_PI; b.m_F64[1] = 2.0; PackedMathF64_(x, a, b); cout << ("\nResults for PackedMathF64_\n"); cout << "a: " << a.ToStringF64() << nl; cout << "b: " << b.ToStringF64() << nl; cout << nl; cout << "fadd: " << x[0].ToStringF64() << nl; cout << "fsub: " << x[1].ToStringF64() << nl; cout << "fmul: " << x[2].ToStringF64() << nl; cout << "fdiv: " << x[3].ToStringF64() << nl; cout << "fabs(a): " << x[4].ToStringF64() << nl; cout << "fneg(b): " << x[5].ToStringF64() << nl; cout << "fminnm: " << x[6].ToStringF64() << nl; cout << "fmaxnm: " << x[7].ToStringF64() << nl; cout << "fsqrt(a): " << x[8].ToStringF64() << nl; } int main() { PackedMathF32(); PackedMathF64(); return 0; } |
ch15_01/neon.cpp |
//#include <iostream> //#include <cmath> #include "Vec128.h" void PackedMathF32_(Vec128 x[9], const Vec128& a, const Vec128& b) { __asm volatile("\n\ ld1 {v0.4s},[x1] // v0 = a \n\ ld1 {v1.4s},[x2] // v1 = b \n\ fadd v2.4s,v0.4s,v1.4s // v2 = a + b \n\ st1 {v2.4s},[x0],16 // save result to x[0] \n\ fsub v2.4s,v0.4s,v1.4s // v2 = a - b \n\ st1 {v2.4s},[x0],16 // save result to x[1] \n\ fmul v2.4s,v0.4s,v1.4s // v2 = a * b \n\ st1 {v2.4s},[x0],16 // save result to x[2] \n\ fdiv v2.4s,v0.4s,v1.4s // v2 = a / b \n\ st1 {v2.4s},[x0],16 // save result to x[3] \n\ fabs v2.4s,v0.4s // v2 = abs(a) \n\ st1 {v2.4s},[x0],16 // save result to x[4] \n\ fneg v2.4s,v1.4s // v2 = -b \n\ st1 {v2.4s},[x0],16 // save result to x[5] \n\ fminnm v2.4s,v0.4s,v1.4s // v2 = min(a, b) \n\ st1 {v2.4s},[x0],16 // save result to x[6] \n\ fmaxnm v2.4s,v0.4s,v1.4s // v2 = max(a, b) \n\ st1 {v2.4s},[x0],16 // save result to x[7] \n\ fsqrt v2.4s,v0.4s // v2 = sqrt(a) \n\ st1 {v2.4s},[x0],16 // save result to x[8] \n\ //ret \n\ \n" : : : "v0", "v1", "v2" ); } void PackedMathF64_(Vec128 x[9], const Vec128& a, const Vec128& b){ __asm volatile("\n\ ld1 {v0.2d},[x1] // v0 = a \n\ ld1 {v1.2d},[x2] // v1 = b \n\ fadd v2.2d,v0.2d,v1.2d // v2 = a + b \n\ st1 {v2.2d},[x0],16 // save result to x[0] \n\ fsub v2.2d,v0.2d,v1.2d // v2 = a - b \n\ st1 {v2.2d},[x0],16 // save result to x[1] \n\ fmul v2.2d,v0.2d,v1.2d // v2 = a * b \n\ st1 {v2.2d},[x0],16 // save result to x[2] \n\ fdiv v2.2d,v0.2d,v1.2d // v2 = a / b \n\ st1 {v2.2d},[x0],16 // save result to x[3] \n\ fabs v2.2d,v0.2d // v2 = abs(a) \n\ st1 {v2.2d},[x0],16 // save result to x[4] \n\ fneg v2.2d,v1.2d // v2 = -b \n\ st1 {v2.2d},[x0],16 // save result to x[5] \n\ fminnm v2.2d,v0.2d,v1.2d // v2 = min(a, b) \n\ st1 {v2.2d},[x0],16 // save result to x[6] \n\ fmaxnm v2.2d,v0.2d,v1.2d // v2 = max(a, b) \n\ st1 {v2.2d},[x0],16 // save result to x[7] \n\ fsqrt v2.2d,v0.2d // v2 = sqrt(a) \n\ st1 {v2.2d},[x0],16 // save result to x[8] \n\ //ret \n\ \n" : : : "v0", "v1", "v2" ); } |
ch15_01/main.cpp の実行例 |
arm64@manet src % cd ch15_01 arm64@manet ch15_01 % g++ -I.. -std=c++11 -O -S neon.cpp arm64@manet ch15_01 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out arm64@manet ch15_01 % ./a.out Results for PackedMathF32_ a: 36.000000 0.031250 | 2.000000 42.000000 b: -0.111111 64.000000 | -0.062500 8.666667 fadd: 35.888889 64.031250 | 1.937500 50.666668 fsub: 36.111111 -63.968750 | 2.062500 33.333332 fmul: -4.000000 2.000000 | -0.125000 364.000000 fdiv: -324.000000 0.000488 | -32.000000 4.846154 fabs(a): 36.000000 0.031250 | 2.000000 42.000000 fneg(b): 0.111111 -64.000000 | 0.062500 -8.666667 fminnm: -0.111111 0.031250 | -0.062500 8.666667 fmaxnm: 36.000000 64.000000 | 2.000000 42.000000 fsqrt(a): 6.000000 0.176777 | 1.414214 6.480741 Results for PackedMathF64_ a: 36.000000000000 | 3.141592653590 b: -1.414213562373 | 2.000000000000 fadd: 34.585786437627 | 5.141592653590 fsub: 37.414213562373 | 1.141592653590 fmul: -50.911688245431 | 6.283185307180 fdiv: -25.455844122716 | 1.570796326795 fabs(a): 36.000000000000 | 3.141592653590 fneg(b): 1.414213562373 | -2.000000000000 fminnm: -1.414213562373 | 2.000000000000 fmaxnm: 36.000000000000 | 3.141592653590 fsqrt(a): 6.000000000000 | 1.772453850906 |
ch15_02/main.cpp |
#include <iostream> #include <iomanip> #include <cmath> #include "Vec128.h" using namespace std; extern void PackedCompareF32_(Vec128 x[8], const Vec128& a, const Vec128& b); extern void PackedCompareF64_(Vec128 x[8], const Vec128& a, const Vec128& b); const char* c_CmpStr[8] = { "EQ", "NE", "GT", "GE", "LT", "LE", "a LT0", "b GT0" }; void PackedCompareF32(void) { const char nl = '\n'; Vec128 x[8], a, b; a.m_F32[0] = 2.0; b.m_F32[0] = -4.0; a.m_F32[1] = 17.0; b.m_F32[1] = 12.0; a.m_F32[2] = -6.0; b.m_F32[2] = -6.0; a.m_F32[3] = 3.0; b.m_F32[3] = 8.0; PackedCompareF32_(x, a, b); cout << "\nResults for PackedCompareF32_\n"; cout << setw(11) << 'a' << ':' << a.ToStringF32() << nl; cout << setw(11) << 'b' << ':' << b.ToStringF32() << nl; cout << nl; for (int j = 0; j < 8; j++) cout << setw(11) << c_CmpStr[j] << ':' << x[j].ToStringX32() << nl; } void PackedCompareF64(void) { const char nl = '\n'; Vec128 x[8], a, b; a.m_F64[0] = -2.0; b.m_F64[0] = -4.0; a.m_F64[1] = M_SQRT2; b.m_F64[1] = M_PI; PackedCompareF64_(x, a, b); cout << "\nResults for PackedCompareF64_\n"; cout << setw(11) << 'a' << ':' << a.ToStringF64() << nl; cout << setw(11) << 'b' << ':' << b.ToStringF64() << nl; cout << nl; for (int j = 0; j < 8; j++) cout << setw(11) << c_CmpStr[j] << ':' << x[j].ToStringX64() << nl; } int main() { PackedCompareF32(); PackedCompareF64(); return 0; } |
ch15_02/neon.cpp |
#include "Vec128.h" void PackedCompareF32_(Vec128 x[8], const Vec128& a, const Vec128& b) { __asm volatile("\n\ ld1 {v0.4s}, [x1] // v0 = a \n\ ld1 {v1.4s}, [x2] // v1 = b \n\ fcmeq v2.4s, v0.4s, v1.4s // packed a == b \n\ st1 {v2.4s}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ not v2.16b, v2.16b // packed a !=b \n\ st1 {v2.4s}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmgt v2.4s, v0.4s, v1.4s // packed a > b \n\ st1 {v2.4s}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmge v2.4s, v0.4s, v1.4s // packed a >= b \n\ st1 {v2.4s}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmlt v2.4s, v0.4s, v1.4s // packed a < b \n\ st1 {v2.4s}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmle v2.4s, v0.4s, v1.4s // packed a <= b \n\ st1 {v2.4s}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmlt v2.4s, v0.4s, 0.0 // packed a < 0 \n\ st1 {v2.4s}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmgt v2.4s, v1.4s, 0.0 // packed b > 0 \n\ st1 {v2.4s}, [x0], 16 // [x0]=v2; x0+=16 \n\ " : : : "v0", "v1", "v2" ); } void PackedCompareF64_(Vec128 x[8], const Vec128& a, const Vec128& b) { __asm volatile("\n\ ld1 {v0.2d}, [x1] // v0 = a \n\ ld1 {v1.2d}, [x2] // v1 = b \n\ fcmeq v2.2d, v0.2d, v1.2d // packed a == b \n\ st1 {v2.2d}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ not v2.16b, v2.16b // packed a !=b \n\ st1 {v2.2d}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmgt v2.2d, v0.2d, v1.2d // packed a > b \n\ st1 {v2.2d}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmge v2.2d, v0.2d, v1.2d // packed a >= b \n\ st1 {v2.2d}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmlt v2.2d, v0.2d, v1.2d // packed a < b \n\ st1 {v2.2d}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmle v2.2d, v0.2d, v1.2d // packed a <= b \n\ st1 {v2.2d}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmlt v2.2d, v0.2d, 0.0 // packed a < 0 \n\ st1 {v2.2d}, [x0], 16 // [x0]=v2; x0+=16 \n\ \n\ fcmgt v2.2d, v1.2d, 0.0 // packed b > 0 \n\ st1 {v2.2d}, [x0], 16 // [x0]=v2; x0+=16 \n\ " : : : "v0", "v1", "v2" ); } |
ch15_02/main.cpp の実行例 |
arm64@manet Ch15_02 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out arm64@manet Ch15_02 % ./a.out Results for PackedCompareF32_ a: 2.000000 17.000000 | -6.000000 3.000000 b: -4.000000 12.000000 | -6.000000 8.000000 EQ: 00000000 00000000 | FFFFFFFF 00000000 NE: FFFFFFFF FFFFFFFF | 00000000 FFFFFFFF GT: FFFFFFFF FFFFFFFF | 00000000 00000000 GE: FFFFFFFF FFFFFFFF | FFFFFFFF 00000000 LT: 00000000 00000000 | 00000000 FFFFFFFF LE: 00000000 00000000 | FFFFFFFF FFFFFFFF a LT0: 00000000 00000000 | FFFFFFFF 00000000 b GT0: 00000000 FFFFFFFF | 00000000 FFFFFFFF Results for PackedCompareF64_ a: -2.000000000000 | 1.414213562373 b: -4.000000000000 | 3.141592653590 EQ: 0000000000000000 | 0000000000000000 NE: FFFFFFFFFFFFFFFF | FFFFFFFFFFFFFFFF GT: FFFFFFFFFFFFFFFF | 0000000000000000 GE: FFFFFFFFFFFFFFFF | 0000000000000000 LT: 0000000000000000 | FFFFFFFFFFFFFFFF LE: 0000000000000000 | FFFFFFFFFFFFFFFF a LT0: FFFFFFFFFFFFFFFF | 0000000000000000 b GT0: 0000000000000000 | FFFFFFFFFFFFFFFF |
ch15_03/main.cpp |
#include <iostream> #include <iomanip> #include <cmath> #include "Vec128.h" using namespace std; extern void F32fromI32(Vec128 x[2], const Vec128& a); extern void I32fromF32(Vec128 x[2], const Vec128& a); extern void F64fromI64(Vec128 x[2], const Vec128& a); extern void I64fromF64(Vec128 x[2], const Vec128& a); extern void F32fromU32(Vec128 x[2], const Vec128& a); extern void U32fromF32(Vec128 x[2], const Vec128& a); extern void F64fromU64(Vec128 x[2], const Vec128& a); extern void U64fromF64(Vec128 x[2], const Vec128& a); extern void F32fromF64(Vec128 x[2], const Vec128& a, const Vec128& b); extern void F64fromF32(Vec128 x[2], const Vec128& a); void PackedConvertA(void) { const char nl = '\n'; Vec128 x[2], a; // F32_I32 a.m_I32[0] = 10; a.m_I32[1] = -500; a.m_I32[2] = 600; a.m_I32[3] = -1024; F32fromI32(x, a); cout << "\nResults for CvtOp::F32_I32\n"; cout << "a: " << a.ToStringI32() << nl; cout << "x[0]: " << x[0].ToStringF32() << nl; // I32_F32 a.m_F32[0] = -1.25f; a.m_F32[1] = 100.875f; a.m_F32[2] = -200.0f; a.m_F32[3] = (float)M_PI; I32fromF32(x, a); cout << "\nResults for CvtOp::I32_F32\n"; cout << "a: " << a.ToStringF32() << nl; cout << "x[0]: " << x[0].ToStringI32() << nl; // F64_I64 a.m_I64[0] = 1000; a.m_I64[1] = -500000000000; F64fromI64(x, a); cout << "\nResults for CvtOp::F64_I64\n"; cout << "a: " << a.ToStringI64() << nl; cout << "x[0]: " << x[0].ToStringF64() << nl; // I64_F64 a.m_F64[0] = -122.66666667; a.m_F64[1] = 1234567890123.75; I64fromF64(x, a); cout << "\nResults for CvtOp::I64_F64\n"; cout << "a: " << a.ToStringF64() << nl; cout << "x[0]: " << x[0].ToStringI64() << nl; } void PackedConvertB(void) { const char nl = '\n'; Vec128 x[2], a; // F32_U32 a.m_U32[0] = 10; a.m_U32[1] = 500; a.m_U32[2] = 600; a.m_U32[3] = 1024; F32fromU32(x, a); cout << "\nResults for CvtOp::F32_U32\n"; cout << "a: " << a.ToStringU32() << nl; cout << "x[0]: " << x[0].ToStringF32() << nl; // U32_F32 a.m_F32[0] = 1.25f; a.m_F32[1] = 100.875f; a.m_F32[2] = 200.0f; a.m_F32[3] = (float)M_PI; U32fromF32(x, a); cout << "\nResults for CvtOp::U32_F32\n"; cout << "a: " << a.ToStringF32() << nl; cout << "x[0]: " << x[0].ToStringU32() << nl; // F64_U64 a.m_I64[0] = 1000; a.m_I64[1] = 420000000000; F64fromU64(x, a); cout << "\nResults for CvtOp::F64_U64\n"; cout << "a: " << a.ToStringU64() << nl; cout << "x[0]: " << x[0].ToStringF64() << nl; // U64_F64 a.m_F64[0] = 698.40; a.m_F64[1] = 1234567890123.75; U64fromF64(x, a); cout << "\nResults for CvtOp::U64_F64\n"; cout << "a: " << a.ToStringF64() << nl; cout << "x[0]: " << x[0].ToStringU64() << nl; } void PackedConvertC(void) { const char nl = '\n'; Vec128 x[2], a, b; // F32_F64 a.m_F64[0] = M_PI; a.m_F64[1] = M_LOG10E; b.m_F64[0] = -M_E; b.m_F64[1] = M_LN2; F32fromF64(x, a, b); cout << "\nResults for CvtOp::F32_F64\n"; cout << "a: " << a.ToStringF64() << nl; cout << "b: " << b.ToStringF64() << nl; cout << "x[0]: " << x[0].ToStringF32() << nl; // F64_F32 a.m_F32[0] = 1.0f / 9.0f; a.m_F32[1] = 100.875f; a.m_F32[2] = 200.0f; a.m_F32[3] = (float)M_SQRT2; F64fromF32(x, a); cout << "\nResults for CvtOp::F64_F32\n"; cout << "a: " << a.ToStringF32() << nl; cout << "x[0]: " << x[0].ToStringF64() << nl; cout << "x[1]: " << x[1].ToStringF64() << nl; } int main() { PackedConvertA(); PackedConvertB(); PackedConvertC(); return 0; } |
ch15_03/neon.cpp |
#include "Vec128.h" void F32fromI32(Vec128 x[2], const Vec128& a) { __asm volatile ("\n\ ld1 {v0.4s}, [x1] \n\ scvtf v1.4s, v0.4s // float32 <- int32 \n\ st1 {v1.4s}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "x0", "x1" ); } void I32fromF32(Vec128 x[2], const Vec128& a) { __asm volatile ("\n\ ld1 {v0.4s}, [x1] \n\ fcvtns v1.4s, v0.4s // int32 <- float32 \n\ st1 {v1.4s}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "x0", "x1" ); } void F64fromI64(Vec128 x[2], const Vec128& a) { __asm volatile ("\n\ ld1 {v0.2d}, [x1] \n\ scvtf v1.2d, v0.2d // float64 <- int64 \n\ st1 {v1.2d}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "x0", "x1" ); } void I64fromF64(Vec128 x[2], const Vec128& a) { __asm volatile ("\n\ ld1 {v0.2d}, [x1] \n\ fcvtns v1.2d, v0.2d // int32 <- float32 \n\ st1 {v1.2d}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "x0", "x1" ); } void F32fromU32(Vec128 x[2], const Vec128& a) { __asm volatile ("\n\ ld1 {v0.4s}, [x1] \n\ ucvtf v1.4s, v0.4s // float32 <- int32 \n\ st1 {v1.4s}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "x0", "x1" ); } void U32fromF32(Vec128 x[2], const Vec128& a) { __asm volatile ("\n\ ld1 {v0.4s}, [x1] \n\ fcvtnu v1.4s, v0.4s // uint32 <- float32 \n\ st1 {v1.4s}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "x0", "x1" ); } void F64fromU64(Vec128 x[2], const Vec128& a) { __asm volatile ("\n\ ld1 {v0.2d}, [x1] \n\ ucvtf v1.2d, v0.2d // float64 <- int64 \n\ st1 {v1.2d}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "x0", "x1" ); } void U64fromF64(Vec128 x[2], const Vec128& a) { __asm volatile ("\n\ ld1 {v0.2d}, [x1] \n\ fcvtnu v1.2d, v0.2d // uint64 <- float64 \n\ st1 {v1.2d}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "x0", "x1" ); } void F32fromF64(Vec128 x[2], const Vec128& a, const Vec128& b) { __asm volatile ("\n\ ld1 {v0.2d}, [x1] \n\ ld1 {v2.2d}, [x2] \n\ fcvtn v1.2s, v0.2d // lower-order F32 \n\ fcvtn2 v1.4s, v2.2d // higher-order F32 \n\ st1 {v1.4s}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "v2", "x0", "x1" ); } void F64fromF32(Vec128 x[2], const Vec128& a) { __asm volatile ("\n\ ld1 {v0.4s}, [x1] \n\ fcvtl v1.2d, v0.2s // lower-order F32 \n\ fcvtl2 v2.2d, v0.4s // higher-order F32 \n\ st1 {v1.2d, v2.2d}, [x0] // [x0] = v1 \n\ " : : : "v0", "v1", "v2", "x0", "x1" ); } |
ch15_03/main.cpp の実行例 |
arm64@manet ch15_03 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out arm64@manet ch15_03 % ./a.out Results for CvtOp::F32_I32 a: 10 -500 | 600 -1024 x[0]: 10.000000 -500.000000 | 600.000000 -1024.000000 Results for CvtOp::I32_F32 a: -1.250000 100.875000 | -200.000000 3.141593 x[0]: -1 101 | -200 3 Results for CvtOp::F64_I64 a: 1000 | -500000000000 x[0]: 1000.000000000000 | -500000000000.000000000000 Results for CvtOp::I64_F64 a: -122.666666670000 | 1234567890123.750000000000 x[0]: -123 | 1234567890124 Results for CvtOp::F32_U32 a: 10 500 | 600 1024 x[0]: 10.000000 500.000000 | 600.000000 1024.000000 Results for CvtOp::U32_F32 a: 1.250000 100.875000 | 200.000000 3.141593 x[0]: 1 101 | 200 3 Results for CvtOp::F64_U64 a: 1000 | 420000000000 x[0]: 1000.000000000000 | 420000000000.000000000000 Results for CvtOp::U64_F64 a: 698.400000000000 | 1234567890123.750000000000 x[0]: 698 | 1234567890124 Results for CvtOp::F32_F64 a: 3.141592653590 | 0.434294481903 b: -2.718281828459 | 0.693147180560 x[0]: 3.141593 0.434294 | -2.718282 0.693147 Results for CvtOp::F64_F32 a: 0.111111 100.875000 | 200.000000 1.414214 x[0]: 0.111111111939 | 100.875000000000 x[1]: 200.000000000000 | 1.414213538170 |
ch15_04/main.cpp |
#include <iostream> #include <iomanip> #include <string> #include <random> #include "AlignedMem.h" using namespace std; extern bool CalcCorrCoef_(float* rho, float sums[5], const float* x, const float* y, size_t n, float epsilon); const size_t c_Alignment = 16; void Init(float* x, float* y, size_t n, unsigned int seed) { uniform_real_distribution<float> dist1 {0.0, 50.0}; normal_distribution<float> dist2 {25.0, 7.0}; mt19937 rng {seed}; for (size_t i = 0; i < n; i++) { x[i] = round(dist1(rng)); y[i] = x[i] + round(dist2(rng)); // cout << setw(10) << x[i] << ", " << setw(10) << y[i] << endl; } } bool CalcCorrCoef(float* rho, float sums[5], const float* x, const float* y, size_t n, float epsilon) { // Make sure n is valid if (n == 0) return false; // Make sure x and y are properly aligned if (!AlignedMem::IsAligned(x, c_Alignment)) return false; if (!AlignedMem::IsAligned(y, c_Alignment)) return false; // Calculate and save sum variables float sum_x = 0, sum_y = 0, sum_xx = 0, sum_yy = 0, sum_xy = 0; for (size_t i = 0; i < n; i++) { sum_x += x[i]; sum_y += y[i]; sum_xx += x[i] * x[i]; sum_yy += y[i] * y[i]; sum_xy += x[i] * y[i]; } sums[0] = sum_x; sums[1] = sum_y; sums[2] = sum_xx; sums[3] = sum_yy; sums[4] = sum_xy; // Calculate rho float rho_num = n * sum_xy - sum_x * sum_y; float rho_den = sqrt(n * sum_xx - sum_x * sum_x) * sqrt(n * sum_yy - sum_y * sum_y); if (rho_den >= epsilon) { *rho = rho_num / rho_den; return true; } else { *rho = 0; return false; } } int main() { const char nl = '\n'; const size_t n = 103; AlignedArray<float> x_aa(n, c_Alignment); AlignedArray<float> y_aa(n, c_Alignment); float sums1[5], sums2[5]; float rho1, rho2; float epsilon = 1.0e-9; float* x = x_aa.Data(); float* y = y_aa.Data(); Init(x, y, n, 71); bool rc1 = CalcCorrCoef(&rho1, sums1, x, y, n, epsilon); bool rc2 = CalcCorrCoef_(&rho2, sums2, x, y, n, epsilon); cout << "Results for CalcCorrCoef\n\n"; if (!rc1 || !rc2) { cout << "Invalid return code "; cout << "rc1 = " << boolalpha << rc1 << ", "; cout << "rc2 = " << boolalpha << rc2 << nl; return 1; } int w = 14; string sep(w * 3, '-'); cout << fixed << setprecision(6); cout << "Value " << setw(w) << "C++" << " " << setw(w) << "A64 SIMD" << nl; cout << sep << nl; cout << setprecision(2); cout << "sum_x: " << setw(w) << sums1[0] << " " << setw(w) << sums2[0] << nl; cout << "sum_y: " << setw(w) << sums1[1] << " " << setw(w) << sums2[1] << nl; cout << "sum_xx: " << setw(w) << sums1[2] << " " << setw(w) << sums2[2] << nl; cout << "sum_yy: " << setw(w) << sums1[3] << " " << setw(w) << sums2[3] << nl; cout << "sum_xy: " << setw(w) << sums1[4] << " " << setw(w) << sums2[4] << nl; cout << "rho: " << setw(w) << rho1 << " " << setw(w) << rho2 << nl; return 0; } |
ch15_04/neon.cpp |
#include "Vec128.h" //#define UpdateSums(VregX, VregY) \ // fadd v16.4s, v16.4s, \VregX\().4s // update sum_x \n\ // fadd v17.4s, v17.4s, \VregX\().4s // update sum_y \n\ void CalcCorrCoef_(float* rho, float sums[5], const float* x, const float* y, size_t n, float epsilon) { __asm volatile ("\n\ \n\ .macro UpdateSums VregX, VregY \n\ fadd.4s v16, v16, \\VregX\\() // sum_x: v16 += VregX \n\ fadd.4s v17, v17, \\VregY\\() // sum_y: v17 += VregX \n\ fmla.4s v18, \\VregX\\(), \\VregX\\() // sum_xx: v18 += VregX^2 \n\ fmla.4s v19, \\VregY\\(), \\VregY\\() // sum_yy: v19 += VregY^2 \n\ fmla.4s v20, \\VregX\\(), \\VregY\\() // sum_xy: v20 += VregX * VregY \n\ .endm \n\ \n\ cbz x4, LInvalidArg // if n == 0 goto END \n\ tst x2, 0x0f // if (x2 != 15) \n\ b.ne LInvalidArg // goto END \n\ tst x3, 0x0f // if (x3 != 15) \n\ b.ne LInvalidArg // goto END \n\ mov x5, x4 // save n to x5 \n\ \n\ eor v16.16b, v16.16b, v16.16b // sum_x = 0 \n\ eor v17.16b, v17.16b, v17.16b // sum_y = 0 \n\ eor v18.16b, v18.16b, v18.16b // sum_xx = 0 \n\ eor v19.16b, v19.16b, v19.16b // sum_yy = 0 \n\ eor v20.16b, v20.16b, v20.16b // sum_xy = 0 \n\ \n\ cmp x4, 16 // if n<=16 \n\ b.lo LSkipLoop1 // goto SkipLoop1 \n\ \n\ LLoop1: \n\ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 64 // load x[0:16] \n\ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x3], 64 // load y[0:16] \n\ \n\ UpdateSums v0, v4 \n\ UpdateSums v1, v5 \n\ UpdateSums v2, v6 \n\ UpdateSums v3, v7 \n\ sub x4, x4, 16 // n -= 16 \n\ cmp x4, 16 // if x4 >= 16 \n\ b.hs LLoop1 // goto Loop \n\ \n\ LSkipLoop1: \n\ faddp v16.4s, v16.4s, v16.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\ faddp v16.4s, v16.4s, v16.4s // s16 = lane0+lane1 \n\ faddp v17.4s, v17.4s, v17.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\ faddp v17.4s, v17.4s, v17.4s // s17 = lane0+lane1 \n\ faddp v18.4s, v18.4s, v18.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\ faddp v18.4s, v18.4s, v18.4s // s18 = lane0+lane1 \n\ faddp v19.4s, v19.4s, v19.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\ faddp v19.4s, v19.4s, v19.4s // s19 = lane0+lane1 \n\ faddp v20.4s, v20.4s, v20.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\ faddp v20.4s, v20.4s, v20.4s // s20 = lane0+lane1 \n\ cbz x4, LSkipLoop2 // if x4==0 goto SkipLoop2 \n\ \n\ LLoop2: \n\ ldr s1, [x2], 4 // s1 = x; x+=4 \n\ ldr s2, [x3], 4 // s2 = y; y+=4 \n\ fadd s16, s16, s1 // s16 += s1 \n\ fadd s17, s17, s2 // s17 += s2 \n\ fmla.4s v18, v1, v1[0] // v18 += s1 * s1 \n\ fmla.4s v19, v2, v2[0] // f19 += s2 * s2 \n\ fmla.4s v20, v1, v2[0] // f20 += s1 * s2 \n\ subs x4, x4, 1 // if (--n !=0) \n\ b.ne LLoop2 // goto Loop2 \n\ \n\ LSkipLoop2: \n\ stp s16, s17, [x1], 8 // [x1]=s16,s17; x1+=8 \n\ stp s18, s19, [x1], 8 // [x1]=s18,s19; x1+=8 \n\ str s20, [x1] // [x1]=s20; x1+=8 \n\ \n\ // rho numerator \n\ scvtf s21, x5 // s21 = n \n\ fmul s1, s21, s20 // s1 = n * sum_xy \n\ fmls.4s v1, v16, v17[0] // s1 -= sum_x * sum_y \n\ \n\ // rho denominator \n\ fmul s2, s21, s18 // s2 = n * sum_xx \n\ fmsub s2, s16, s16, s2 // s2 = s2 - sum_x * sum_x \n \ fsqrt s2, s2 // s2 = sqrt(s2) \n\ \n\ fmul s3, s21, s19 // s2 = n * sum_yy \n\ fmsub s3, s17, s17, s3 // s2 = s3 - sum_y * sum_y \n\ fsqrt s3, s3 // s3 = sqrt(s3) \n\ \n\ fmul s4, s2, s3 // s4 = s2 * s3 \n\ fcmp s4, s0 // if rho_den < epsilon \n\ b.lo LBadRhoDen // goto BadRhoDen \n\ \n\ fdiv s5, s1, s4 // s5 = rho \n\ str s5, [x0] // [x0] = s5 \n\ mov w0, 1 // return code: success \n\ ret \n\ \n\ LBadRhoDen: \n\ eor v5.16b, v5.16b, v5.16b // rho = 0 \n\ str s5, [x0] // [x0] = rho \n\ mov w0, 0 // return code: fail \n\ ret \n\ \n\ LInvalidArg: \n\ mov w0, 0 // return code: fail \n\ ret \n\ LReturn: \n\ " : : : "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "x0", "x1" ); } |
ch15_04/main.cpp の実行例 |
arm64@manet ch15_04 % g++ -I.. -std=c++11 -S neon.cpp arm64@manet ch15_04 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out arm64@manet ch15_04 % ./a.out Results for CalcCorrCoef Value C++ A64 SIMD ------------------------------------------ sum_x: 2567.00 2567.00 sum_y: 5160.00 5160.00 sum_xx: 88805.00 88805.00 sum_yy: 287412.00 287412.00 sum_xy: 153065.00 153065.00 rho: 0.91 0.91 |
ch15_06/main.cpp |
#include <iostream> #include <iomanip> #include "MatrixF32.h" using namespace std; extern void Mat4x4MulF32_(float *m_des, const float* m_src1, const float *m_src2); void Mat4x4MulF32(MatrixF32& m_src1, MatrixF32& m_src2) { const size_t nr = m_src1.GetNumRows(); const size_t nc = m_src2.GetNumCols(); MatrixF32 m_des1(nr ,nc); MatrixF32 m_des2(nr ,nc); MatrixF32::Mul4x4(m_des1, m_src1, m_src2); Mat4x4MulF32_(m_des2.Data(), m_src1.Data(), m_src2.Data()); cout << fixed << setprecision(1); m_src1.SetOstream(12, " "); m_src2.SetOstream(12, " "); m_des1.SetOstream(12, " "); m_des2.SetOstream(12, " "); cout << "\nResults for Mat4x4MulF32\n"; cout << "Matrix m_src1\n" << m_src1 << '\n'; cout << "Matrix m_src2\n" << m_src2 << '\n'; cout << "Matrix m_des1\n" << m_des1 << '\n'; cout << "Matrix m_des2\n" << m_des2 << '\n'; if (m_des1 != m_des2) cout << "\nMatrix compare failed - Mat4x4MulF32\n"; } void Mat4x4MulF32Test(void) { const size_t nr = 4; const size_t nc = 4; MatrixF32 m_src1(nr ,nc); MatrixF32 m_src2(nr ,nc); const float src1_row0[] = { 10, 11, 12, 13 }; const float src1_row1[] = { 20, 21, 22, 23 }; const float src1_row2[] = { 30, 31, 32, 33 }; const float src1_row3[] = { 40, 41, 42, 43 }; const float src2_row0[] = { 100, 101, 102, 103 }; const float src2_row1[] = { 200, 201, 202, 203 }; const float src2_row2[] = { 300, 301, 302, 303 }; const float src2_row3[] = { 400, 401, 402, 403 }; m_src1.SetRow(0, src1_row0); m_src1.SetRow(1, src1_row1); m_src1.SetRow(2, src1_row2); m_src1.SetRow(3, src1_row3); m_src2.SetRow(0, src2_row0); m_src2.SetRow(1, src2_row1); m_src2.SetRow(2, src2_row2); m_src2.SetRow(3, src2_row3); Mat4x4MulF32(m_src1, m_src2); } int main() { Mat4x4MulF32Test(); return 0; } |
ch15_06/neon.cpp |
#include "Vec128.h" void Mat4x4MulF32_(float *m_des, const float* m_src1, const float *m_src2) { __asm volatile("\n\ ld1 {v0.4s-v3.4s}, [x1] // m_src1 \n\ ld1 {v4.4s-v7.4s}, [x2] // m_src2 \n\ \n\ // Row 0 \n\ fmul v16.4s, v4.4s, v0.s[0] // v16 = v4 * v0.lane0 \n\ fmla v16.4s, v5.4s, v0.s[1] // v16 += v5 * v0.lane1 \n\ fmla v16.4s, v6.4s, v0.s[2] // v16 += v6 * v0.lane2 \n\ fmla v16.4s, v7.4s, v0.s[3] // v16 += v6 * v0.lane3 \n\ st1 {v16.4s}, [x0], 16 \n\ \n\ // Row 1 \n\ fmul v17.4s, v4.4s, v1.s[0] // v17 = v4 * v1.lane0 \n\ fmla v17.4s, v5.4s, v1.s[1] // v17 += v5 * v1.lane1 \n\ fmla v17.4s, v6.4s, v1.s[2] // v17 += v6 * v1.lane2 \n\ fmla v17.4s, v7.4s, v1.s[3] // v17 += v6 * v1.lane3 \n\ st1 {v17.4s}, [x0], 16 \n\ \n\ // Row 2 \n\ fmul v18.4s, v4.4s, v2.s[0] // v18 = v4 * v2.lane0 \n\ fmla v18.4s, v5.4s, v2.s[1] // v18 += v5 * v2.lane1 \n\ fmla v18.4s, v6.4s, v2.s[2] // v18 += v6 * v2.lane2 \n\ fmla v18.4s, v7.4s, v2.s[3] // v18 += v6 * v2.lane3 \n\ st1 {v18.4s}, [x0], 16 \n\ \n\ // Row 3 \n\ fmul v19.4s, v4.4s, v3.s[0] // v19 = v4 * v3.lane0 \n\ fmla v19.4s, v5.4s, v3.s[1] // v19 += v5 * v3.lane1 \n\ fmla v19.4s, v6.4s, v3.s[2] // v19 += v6 * v3.lane2 \n\ fmla v19.4s, v7.4s, v3.s[3] // v19 += v6 * v3.lane3 \n\ st1 {v19.4s}, [x0], 16 \n\ " : : : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "x0" ); } |
ch15_06/main.cpp の実行例 |
arm64@manet Ch15_06 % g++ -I.. -std=c++11 -O -S neon.cpp arm64@manet Ch15_06 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out arm64@manet Ch15_06 % ./a.out Results for Mat4x4MulF32 Matrix m_src1 10.0 11.0 12.0 13.0 20.0 21.0 22.0 23.0 30.0 31.0 32.0 33.0 40.0 41.0 42.0 43.0 Matrix m_src2 100.0 101.0 102.0 103.0 200.0 201.0 202.0 203.0 300.0 301.0 302.0 303.0 400.0 401.0 402.0 403.0 Matrix m_des1 12000.0 12046.0 12092.0 12138.0 22000.0 22086.0 22172.0 22258.0 32000.0 32126.0 32252.0 32378.0 42000.0 42166.0 42332.0 42498.0 Matrix m_des2 12000.0 12046.0 12092.0 12138.0 22000.0 22086.0 22172.0 22258.0 32000.0 32126.0 32252.0 32378.0 42000.0 42166.0 42332.0 42498.0 arm64@manet Ch15_06 % |
ch15_07/main.cpp |
#include <iostream> #include <iomanip> #include "MatrixF32.h" using namespace std; extern void Mat4x4TransposeF32_(float* m_des, const float* m_src1); void Mat4x4TestF32(MatrixF32& m_src1) { const char nl = '\n'; const size_t nr = m_src1.GetNumCols(); const size_t nc = m_src1.GetNumRows(); MatrixF32 m_des1(nr, nc); MatrixF32 m_des2(nr, nc); MatrixF32::Transpose(m_des1, m_src1); Mat4x4TransposeF32_(m_des2.Data(), m_src1.Data()); cout << fixed << setprecision(1); m_src1.SetOstream(12, " "); m_des1.SetOstream(12, " "); m_des2.SetOstream(12, " "); cout << "\nResults for Mat4x4TestF32\n"; cout << "Matrix m_src1\n" << m_src1 << nl; cout << "Matrix m_des1 (transpose of m_src1)\n" << m_des1 << nl; cout << "Matrix m_des2 (transpose of m_src1)\n" << m_des2 << nl; if (m_des1 != m_des2) cout << "\nMatrix transpose compare failed\n"; } void Mat4x4TestF32(void) { const size_t nr = 4; const size_t nc = 4; MatrixF32 m_src1(nr ,nc); const float src1_row0[] = { 10, 11, 12, 13 }; const float src1_row1[] = { 20, 21, 22, 23 }; const float src1_row2[] = { 30, 31, 32, 33 }; const float src1_row3[] = { 40, 41, 42, 43 }; m_src1.SetRow(0, src1_row0); m_src1.SetRow(1, src1_row1); m_src1.SetRow(2, src1_row2); m_src1.SetRow(3, src1_row3); Mat4x4TestF32(m_src1); } int main() { Mat4x4TestF32(); return 0; } |
ch15_07/neon.cpp |
#include "Vec128.h" void Mat4x4TransposeF32_(float* m_des, const float* m_src1) { __asm volatile ("\n\ ld1 {v0.4s-v3.4s}, [x1] \n\ trn1 v4.4s, v0.4s, v1.4s // a0 b0 a2 b2 \n\ trn2 v5.4s, v0.4s, v1.4s // a1 b1 a3 b3 \n\ trn1 v6.4s, v2.4s, v3.4s // c0 d0 c2 d2 \n\ trn2 v7.4s, v2.4s, v3.4s // c1 d1 c3 d3 \n\ trn1 v0.2d, v4.2d, v6.2d // a0 b0 c0 d0 \n\ trn1 v1.2d, v5.2d, v7.2d // a1 b1 c1 d1 \n\ trn2 v2.2d, v4.2d, v6.2d // a2 b2 c2 d2 \n\ trn2 v3.2d, v5.2d, v7.2d // a3 b3 c3 d3 \n\ st1 {v0.4s-v3.4s}, [x0] \n\ " : : : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } |
ch15_07/main.cpp の実行例 |
arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O -S neon.cpp arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out arm64@manet Ch15_07 % ./a.out Results for Mat4x4TestF32 Matrix m_src1 10.0 11.0 12.0 13.0 20.0 21.0 22.0 23.0 30.0 31.0 32.0 33.0 40.0 41.0 42.0 43.0 Matrix m_des1 (transpose of m_src1) 10.0 20.0 30.0 40.0 11.0 21.0 31.0 41.0 12.0 22.0 32.0 42.0 13.0 23.0 33.0 43.0 Matrix m_des2 (transpose of m_src1) 10.0 20.0 30.0 40.0 11.0 21.0 31.0 41.0 12.0 22.0 32.0 42.0 13.0 23.0 33.0 43.0 arm64@manet Ch15_07 % |