Arm64 Assembly Language

Modern Arm Assembly Language Programming: Covers Armv8-A 32-bit, 64-bit, and SIMD

by Daniel Kusswurm

2021.07.28: updated by

Chapter 15: Armv8-64 SIMD Floating-Point Programming

Packed Floating-Point Arithmetic

Basic Arithmetic

NEON を使って基本演算を行うプログラムの説明。

ch15_01/main.cpp

#include <iostream>
#include <cmath>
#include "Vec128.h"
using namespace std;

extern void PackedMathF32_(Vec128 x[9], const Vec128& a, const Vec128& b);
extern void PackedMathF64_(Vec128 x[9], const Vec128& a, const Vec128& b);

void PackedMathF32(void)
{
    const char nl = '\n';
    Vec128 x[9], a, b;
    a.m_F32[0] = 36.0f;                b.m_F32[0] = -1.0f / 9.0f;
    a.m_F32[1] = 1.0f / 32.0f;         b.m_F32[1] = 64.0f;
    a.m_F32[2] = 2.0f;                 b.m_F32[2] = -0.0625f;
    a.m_F32[3] = 42.0f;                b.m_F32[3] = 8.666667f;
    PackedMathF32_(x, a, b);
    cout << ("\nResults for PackedMathF32_\n");
    cout << "a:        " << a.ToStringF32() << nl;
    cout << "b:        " << b.ToStringF32() << nl;
    cout << nl;
    cout << "fadd:     " << x[0].ToStringF32() << nl;
    cout << "fsub:     " << x[1].ToStringF32() << nl;
    cout << "fmul:     " << x[2].ToStringF32() << nl;
    cout << "fdiv:     " << x[3].ToStringF32() << nl;
    cout << "fabs(a):  " << x[4].ToStringF32() << nl;
    cout << "fneg(b):  " << x[5].ToStringF32() << nl;
    cout << "fminnm:   " << x[6].ToStringF32() << nl;
    cout << "fmaxnm:   " << x[7].ToStringF32() << nl;
    cout << "fsqrt(a): " << x[8].ToStringF32() << nl;
}
void PackedMathF64(void)
{
    const char nl = '\n';
    Vec128 x[9], a, b;
    a.m_F64[0] = 36.0;              b.m_F64[0] = -M_SQRT2;
    a.m_F64[1] = M_PI;              b.m_F64[1] = 2.0;
    PackedMathF64_(x, a, b);
    cout << ("\nResults for PackedMathF64_\n");
    cout << "a:        " << a.ToStringF64() << nl;
    cout << "b:        " << b.ToStringF64() << nl;
    cout << nl;
    cout << "fadd:     " << x[0].ToStringF64() << nl;
    cout << "fsub:     " << x[1].ToStringF64() << nl;
    cout << "fmul:     " << x[2].ToStringF64() << nl;
    cout << "fdiv:     " << x[3].ToStringF64() << nl;
    cout << "fabs(a):  " << x[4].ToStringF64() << nl;
    cout << "fneg(b):  " << x[5].ToStringF64() << nl;
    cout << "fminnm:   " << x[6].ToStringF64() << nl;
    cout << "fmaxnm:   " << x[7].ToStringF64() << nl;
    cout << "fsqrt(a): " << x[8].ToStringF64() << nl;
}
// Entry point: runs the single-precision demo, then the double-precision one.
int main() {
    PackedMathF32();   // 4 x float lanes
    PackedMathF64();   // 2 x double lanes

    return 0;
}

ch15_01/neon.cpp

//#include <iostream>
//#include <cmath>
#include "Vec128.h"

// Packed single-precision arithmetic on four float lanes (AArch64 NEON).
//
// Loads a and b as 4 x F32 vectors and stores nine result vectors to x[0..8]:
//   x[0] = a + b      x[1] = a - b      x[2] = a * b
//   x[3] = a / b      x[4] = |a|        x[5] = -b
//   x[6] = fminnm(a, b)   x[7] = fmaxnm(a, b)   x[8] = sqrt(a)
//
// The three pointers are handed to the asm as named operands instead of being
// assumed to still live in x0/x1/x2, the destination pointer is declared
// read-write because the post-indexed stores advance it, and "memory" tells the
// compiler that the statement reads a/b and writes x[].
// NOTE(review): the 16-byte store stride assumes sizeof(Vec128) == 16 - confirm
// against Vec128.h.
void PackedMathF32_(Vec128 x[9], const Vec128& a, const Vec128& b) {
  Vec128* dst = x;  // advanced by 16 bytes after every st1
  __asm volatile(
      "ld1    {v0.4s}, [%[a]]           \n\t"  // v0 = a
      "ld1    {v1.4s}, [%[b]]           \n\t"  // v1 = b
      "fadd   v2.4s, v0.4s, v1.4s       \n\t"  // v2 = a + b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[0]
      "fsub   v2.4s, v0.4s, v1.4s       \n\t"  // v2 = a - b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[1]
      "fmul   v2.4s, v0.4s, v1.4s       \n\t"  // v2 = a * b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[2]
      "fdiv   v2.4s, v0.4s, v1.4s       \n\t"  // v2 = a / b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[3]
      "fabs   v2.4s, v0.4s              \n\t"  // v2 = abs(a)
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[4]
      "fneg   v2.4s, v1.4s              \n\t"  // v2 = -b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[5]
      "fminnm v2.4s, v0.4s, v1.4s       \n\t"  // v2 = min(a, b)
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[6]
      "fmaxnm v2.4s, v0.4s, v1.4s       \n\t"  // v2 = max(a, b)
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[7]
      "fsqrt  v2.4s, v0.4s              \n\t"  // v2 = sqrt(a)
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[8]
      : [dst] "+r"(dst)
      : [a] "r"(&a), [b] "r"(&b)
      : "v0", "v1", "v2", "memory");
}
// Packed double-precision arithmetic on two double lanes (AArch64 NEON).
//
// Loads a and b as 2 x F64 vectors and stores nine result vectors to x[0..8]:
//   x[0] = a + b      x[1] = a - b      x[2] = a * b
//   x[3] = a / b      x[4] = |a|        x[5] = -b
//   x[6] = fminnm(a, b)   x[7] = fmaxnm(a, b)   x[8] = sqrt(a)
//
// The three pointers are handed to the asm as named operands instead of being
// assumed to still live in x0/x1/x2, the destination pointer is declared
// read-write because the post-indexed stores advance it, and "memory" tells the
// compiler that the statement reads a/b and writes x[].
// NOTE(review): the 16-byte store stride assumes sizeof(Vec128) == 16 - confirm
// against Vec128.h.
void PackedMathF64_(Vec128 x[9], const Vec128& a, const Vec128& b) {
  Vec128* dst = x;  // advanced by 16 bytes after every st1
  __asm volatile(
      "ld1    {v0.2d}, [%[a]]           \n\t"  // v0 = a
      "ld1    {v1.2d}, [%[b]]           \n\t"  // v1 = b
      "fadd   v2.2d, v0.2d, v1.2d       \n\t"  // v2 = a + b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[0]
      "fsub   v2.2d, v0.2d, v1.2d       \n\t"  // v2 = a - b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[1]
      "fmul   v2.2d, v0.2d, v1.2d       \n\t"  // v2 = a * b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[2]
      "fdiv   v2.2d, v0.2d, v1.2d       \n\t"  // v2 = a / b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[3]
      "fabs   v2.2d, v0.2d              \n\t"  // v2 = abs(a)
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[4]
      "fneg   v2.2d, v1.2d              \n\t"  // v2 = -b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[5]
      "fminnm v2.2d, v0.2d, v1.2d       \n\t"  // v2 = min(a, b)
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[6]
      "fmaxnm v2.2d, v0.2d, v1.2d       \n\t"  // v2 = max(a, b)
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[7]
      "fsqrt  v2.2d, v0.2d              \n\t"  // v2 = sqrt(a)
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[8]
      : [dst] "+r"(dst)
      : [a] "r"(&a), [b] "r"(&b)
      : "v0", "v1", "v2", "memory");
}

ch15_01/main.cpp の実行例

arm64@manet src % cd ch15_01
arm64@manet ch15_01 % g++ -I.. -std=c++11 -O -S neon.cpp
arm64@manet ch15_01 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet ch15_01 % ./a.out

Results for PackedMathF32_
a:               36.000000        0.031250  |        2.000000       42.000000
b:               -0.111111       64.000000  |       -0.062500        8.666667

fadd:            35.888889       64.031250  |        1.937500       50.666668
fsub:            36.111111      -63.968750  |        2.062500       33.333332
fmul:            -4.000000        2.000000  |       -0.125000      364.000000
fdiv:          -324.000000        0.000488  |      -32.000000        4.846154
fabs(a):         36.000000        0.031250  |        2.000000       42.000000
fneg(b):          0.111111      -64.000000  |        0.062500       -8.666667
fminnm:          -0.111111        0.031250  |       -0.062500        8.666667
fmaxnm:          36.000000       64.000000  |        2.000000       42.000000
fsqrt(a):         6.000000        0.176777  |        1.414214        6.480741

Results for PackedMathF64_
a:                         36.000000000000  |                  3.141592653590
b:                         -1.414213562373  |                  2.000000000000

fadd:                      34.585786437627  |                  5.141592653590
fsub:                      37.414213562373  |                  1.141592653590
fmul:                     -50.911688245431  |                  6.283185307180
fdiv:                     -25.455844122716  |                  1.570796326795
fabs(a):                   36.000000000000  |                  3.141592653590
fneg(b):                    1.414213562373  |                 -2.000000000000
fminnm:                    -1.414213562373  |                  2.000000000000
fmaxnm:                    36.000000000000  |                  3.141592653590
fsqrt(a):                   6.000000000000  |                  1.772453850906

ch15_02

ch15_02/main.cpp

#include <iostream>
#include <iomanip>
#include <cmath>
#include "Vec128.h"
using namespace std;

extern void PackedCompareF32_(Vec128 x[8], const Vec128& a, const Vec128& b);
extern void PackedCompareF64_(Vec128 x[8], const Vec128& a, const Vec128& b);

// Row labels for the eight comparison masks, indexed like the result array
// that PackedCompareF32_/PackedCompareF64_ fill.
const char* c_CmpStr[8] = {
    "EQ",       // a == b
    "NE",       // a != b
    "GT",       // a >  b
    "GE",       // a >= b
    "LT",       // a <  b
    "LE",       // a <= b
    "a LT0",    // a <  0
    "b GT0"     // b >  0
};

void PackedCompareF32(void) {
    const char nl = '\n';
    Vec128 x[8], a, b;

    a.m_F32[0] = 2.0;         b.m_F32[0] = -4.0;
    a.m_F32[1] = 17.0;        b.m_F32[1] = 12.0;
    a.m_F32[2] = -6.0;        b.m_F32[2] = -6.0;
    a.m_F32[3] = 3.0;         b.m_F32[3] = 8.0;

    PackedCompareF32_(x, a, b);

    cout << "\nResults for PackedCompareF32_\n";
    cout << setw(11) << 'a' << ':' << a.ToStringF32() << nl;
    cout << setw(11) << 'b' << ':' << b.ToStringF32() << nl;
    cout << nl;

    for (int j = 0; j < 8; j++)
        cout << setw(11) << c_CmpStr[j] << ':' << x[j].ToStringX32() << nl;
}

void PackedCompareF64(void)
{
    const char nl = '\n';
    Vec128 x[8], a, b;

    a.m_F64[0] = -2.0;        b.m_F64[0] = -4.0;
    a.m_F64[1] = M_SQRT2;     b.m_F64[1] = M_PI;

    PackedCompareF64_(x, a, b);

    cout << "\nResults for PackedCompareF64_\n";
    cout << setw(11) << 'a' << ':' << a.ToStringF64() << nl;
    cout << setw(11) << 'b' << ':' << b.ToStringF64() << nl;
    cout << nl;

    for (int j = 0; j < 8; j++)
        cout << setw(11) << c_CmpStr[j] << ':' << x[j].ToStringX64() << nl;
}

// Entry point: runs the single-precision compare demo, then the
// double-precision one.
int main() {
    PackedCompareF32();   // 4 x float lanes
    PackedCompareF64();   // 2 x double lanes

    return 0;
}

ch15_02/neon.cpp

#include "Vec128.h"

// Packed single-precision comparisons on four float lanes (AArch64 NEON).
//
// Each comparison produces a per-lane mask (all ones when true, all zeros when
// false). Eight masks are stored to x[0..7]:
//   x[0] a == b   x[1] a != b (bitwise NOT of x[0])   x[2] a > b   x[3] a >= b
//   x[4] a <  b   x[5] a <= b                          x[6] a < 0   x[7] b > 0
//
// The three pointers are handed to the asm as named operands instead of being
// assumed to still live in x0/x1/x2, the destination pointer is declared
// read-write because the post-indexed stores advance it, and "memory" tells the
// compiler that the statement reads a/b and writes x[].
// NOTE(review): the 16-byte store stride assumes sizeof(Vec128) == 16 - confirm
// against Vec128.h.
void PackedCompareF32_(Vec128 x[8], const Vec128& a, const Vec128& b) {
  Vec128* dst = x;  // advanced by 16 bytes after every st1
  __asm volatile(
      "ld1    {v0.4s}, [%[a]]           \n\t"  // v0 = a
      "ld1    {v1.4s}, [%[b]]           \n\t"  // v1 = b
      "fcmeq  v2.4s, v0.4s, v1.4s       \n\t"  // packed a == b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[0]
      "not    v2.16b, v2.16b            \n\t"  // packed a != b (inverts the EQ mask)
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[1]
      "fcmgt  v2.4s, v0.4s, v1.4s       \n\t"  // packed a > b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[2]
      "fcmge  v2.4s, v0.4s, v1.4s       \n\t"  // packed a >= b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[3]
      "fcmlt  v2.4s, v0.4s, v1.4s       \n\t"  // packed a < b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[4]
      "fcmle  v2.4s, v0.4s, v1.4s       \n\t"  // packed a <= b
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[5]
      "fcmlt  v2.4s, v0.4s, 0.0         \n\t"  // packed a < 0
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[6]
      "fcmgt  v2.4s, v1.4s, 0.0         \n\t"  // packed b > 0
      "st1    {v2.4s}, [%[dst]], 16     \n\t"  // x[7]
      : [dst] "+r"(dst)
      : [a] "r"(&a), [b] "r"(&b)
      : "v0", "v1", "v2", "memory");
}

// Packed double-precision comparisons on two double lanes (AArch64 NEON).
//
// Each comparison produces a per-lane mask (all ones when true, all zeros when
// false). Eight masks are stored to x[0..7]:
//   x[0] a == b   x[1] a != b (bitwise NOT of x[0])   x[2] a > b   x[3] a >= b
//   x[4] a <  b   x[5] a <= b                          x[6] a < 0   x[7] b > 0
//
// The three pointers are handed to the asm as named operands instead of being
// assumed to still live in x0/x1/x2, the destination pointer is declared
// read-write because the post-indexed stores advance it, and "memory" tells the
// compiler that the statement reads a/b and writes x[].
// NOTE(review): the 16-byte store stride assumes sizeof(Vec128) == 16 - confirm
// against Vec128.h.
void PackedCompareF64_(Vec128 x[8], const Vec128& a, const Vec128& b) {
  Vec128* dst = x;  // advanced by 16 bytes after every st1
  __asm volatile(
      "ld1    {v0.2d}, [%[a]]           \n\t"  // v0 = a
      "ld1    {v1.2d}, [%[b]]           \n\t"  // v1 = b
      "fcmeq  v2.2d, v0.2d, v1.2d       \n\t"  // packed a == b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[0]
      "not    v2.16b, v2.16b            \n\t"  // packed a != b (inverts the EQ mask)
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[1]
      "fcmgt  v2.2d, v0.2d, v1.2d       \n\t"  // packed a > b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[2]
      "fcmge  v2.2d, v0.2d, v1.2d       \n\t"  // packed a >= b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[3]
      "fcmlt  v2.2d, v0.2d, v1.2d       \n\t"  // packed a < b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[4]
      "fcmle  v2.2d, v0.2d, v1.2d       \n\t"  // packed a <= b
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[5]
      "fcmlt  v2.2d, v0.2d, 0.0         \n\t"  // packed a < 0
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[6]
      "fcmgt  v2.2d, v1.2d, 0.0         \n\t"  // packed b > 0
      "st1    {v2.2d}, [%[dst]], 16     \n\t"  // x[7]
      : [dst] "+r"(dst)
      : [a] "r"(&a), [b] "r"(&b)
      : "v0", "v1", "v2", "memory");
}

ch15_02/main.cpp の実行例

arm64@manet Ch15_02 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet Ch15_02 % ./a.out

Results for PackedCompareF32_
          a:        2.000000       17.000000  |       -6.000000        3.000000
          b:       -4.000000       12.000000  |       -6.000000        8.000000

         EQ:        00000000        00000000  |        FFFFFFFF        00000000
         NE:        FFFFFFFF        FFFFFFFF  |        00000000        FFFFFFFF
         GT:        FFFFFFFF        FFFFFFFF  |        00000000        00000000
         GE:        FFFFFFFF        FFFFFFFF  |        FFFFFFFF        00000000
         LT:        00000000        00000000  |        00000000        FFFFFFFF
         LE:        00000000        00000000  |        FFFFFFFF        FFFFFFFF
      a LT0:        00000000        00000000  |        FFFFFFFF        00000000
      b GT0:        00000000        FFFFFFFF  |        00000000        FFFFFFFF

Results for PackedCompareF64_
          a:                 -2.000000000000  |                  1.414213562373
          b:                 -4.000000000000  |                  3.141592653590

         EQ:                0000000000000000  |                0000000000000000
         NE:                FFFFFFFFFFFFFFFF  |                FFFFFFFFFFFFFFFF
         GT:                FFFFFFFFFFFFFFFF  |                0000000000000000
         GE:                FFFFFFFFFFFFFFFF  |                0000000000000000
         LT:                0000000000000000  |                FFFFFFFFFFFFFFFF
         LE:                0000000000000000  |                FFFFFFFFFFFFFFFF
      a LT0:                FFFFFFFFFFFFFFFF  |                0000000000000000
      b GT0:                0000000000000000  |                FFFFFFFFFFFFFFFF

ch15_03

ch15_03/main.cpp

#include <iostream>
#include <iomanip>
#include <cmath>
#include "Vec128.h"
using namespace std;

extern void F32fromI32(Vec128 x[2], const Vec128& a);
extern void I32fromF32(Vec128 x[2], const Vec128& a);
extern void F64fromI64(Vec128 x[2], const Vec128& a);
extern void I64fromF64(Vec128 x[2], const Vec128& a);
extern void F32fromU32(Vec128 x[2], const Vec128& a);
extern void U32fromF32(Vec128 x[2], const Vec128& a);
extern void F64fromU64(Vec128 x[2], const Vec128& a);
extern void U64fromF64(Vec128 x[2], const Vec128& a);
extern void F32fromF64(Vec128 x[2], const Vec128& a, const Vec128& b);
extern void F64fromF32(Vec128 x[2], const Vec128& a);

void PackedConvertA(void) {
    const char nl = '\n';
    Vec128 x[2], a;

    // F32_I32
    a.m_I32[0] = 10;
    a.m_I32[1] = -500;
    a.m_I32[2] = 600;
    a.m_I32[3] = -1024;
    F32fromI32(x, a);
    cout << "\nResults for CvtOp::F32_I32\n";
    cout << "a:    " << a.ToStringI32() << nl;
    cout << "x[0]: " << x[0].ToStringF32() << nl;

    // I32_F32
    a.m_F32[0] = -1.25f;
    a.m_F32[1] = 100.875f;
    a.m_F32[2] = -200.0f;
    a.m_F32[3] = (float)M_PI;
    I32fromF32(x, a);
    cout << "\nResults for CvtOp::I32_F32\n";
    cout << "a:    " << a.ToStringF32() << nl;
    cout << "x[0]: " << x[0].ToStringI32() << nl;

    // F64_I64
    a.m_I64[0] = 1000;
    a.m_I64[1] = -500000000000;
    F64fromI64(x, a);
    cout << "\nResults for CvtOp::F64_I64\n";
    cout << "a:    " << a.ToStringI64() << nl;
    cout << "x[0]: " << x[0].ToStringF64() << nl;

    // I64_F64
    a.m_F64[0] = -122.66666667;
    a.m_F64[1] = 1234567890123.75;
    I64fromF64(x, a);
    cout << "\nResults for CvtOp::I64_F64\n";
    cout << "a:    " << a.ToStringF64() << nl;
    cout << "x[0]: " << x[0].ToStringI64() << nl;
}

void PackedConvertB(void)
{
    const char nl = '\n';
    Vec128 x[2], a;

    // F32_U32
    a.m_U32[0] = 10;
    a.m_U32[1] = 500;
    a.m_U32[2] = 600;
    a.m_U32[3] = 1024;
    F32fromU32(x, a);
    cout << "\nResults for CvtOp::F32_U32\n";
    cout << "a:    " << a.ToStringU32() << nl;
    cout << "x[0]: " << x[0].ToStringF32() << nl;

    // U32_F32
    a.m_F32[0] = 1.25f;
    a.m_F32[1] = 100.875f;
    a.m_F32[2] = 200.0f;
    a.m_F32[3] = (float)M_PI;
    U32fromF32(x, a);
    cout << "\nResults for CvtOp::U32_F32\n";
    cout << "a:    " << a.ToStringF32() << nl;
    cout << "x[0]: " << x[0].ToStringU32() << nl;

    // F64_U64
    a.m_I64[0] = 1000;
    a.m_I64[1] = 420000000000;
    F64fromU64(x, a);
    cout << "\nResults for CvtOp::F64_U64\n";
    cout << "a:    " << a.ToStringU64() << nl;
    cout << "x[0]: " << x[0].ToStringF64() << nl;

    // U64_F64
    a.m_F64[0] = 698.40;
    a.m_F64[1] = 1234567890123.75;
    U64fromF64(x, a);
    cout << "\nResults for CvtOp::U64_F64\n";
    cout << "a:    " << a.ToStringF64() << nl;
    cout << "x[0]: " << x[0].ToStringU64() << nl;
}

void PackedConvertC(void)
{
    const char nl = '\n';
    Vec128 x[2], a, b;

    // F32_F64
    a.m_F64[0] = M_PI;
    a.m_F64[1] = M_LOG10E;
    b.m_F64[0] = -M_E;
    b.m_F64[1] = M_LN2;
    F32fromF64(x, a, b);
    cout << "\nResults for CvtOp::F32_F64\n";
    cout << "a:    " << a.ToStringF64() << nl;
    cout << "b:    " << b.ToStringF64() << nl;
    cout << "x[0]: " << x[0].ToStringF32() << nl;

    // F64_F32
    a.m_F32[0] = 1.0f / 9.0f;
    a.m_F32[1] = 100.875f;
    a.m_F32[2] = 200.0f;
    a.m_F32[3] = (float)M_SQRT2;
    F64fromF32(x, a);
    cout << "\nResults for CvtOp::F64_F32\n";
    cout << "a:    " << a.ToStringF32() << nl;
    cout << "x[0]: " << x[0].ToStringF64() << nl;
    cout << "x[1]: " << x[1].ToStringF64() << nl;
}

// Entry point: runs the signed, unsigned and precision-changing conversion
// demos in that order.
int main() {
    PackedConvertA();   // signed integer <-> floating point
    PackedConvertB();   // unsigned integer <-> floating point
    PackedConvertC();   // float <-> double

    return 0;
}

ch15_03/neon.cpp

#include "Vec128.h"

// Converts the four signed 32-bit integers in a to single precision (SCVTF)
// and stores the result vector to x[0]; x[1] is not written.
//
// The pointers are bound as asm operands rather than assumed to be in x0/x1
// (the old clobber list even named x0/x1 while reading them), and "memory"
// tells the compiler that the statement reads a and writes x[0].
void F32fromI32(Vec128 x[2], const Vec128& a) {
  __asm volatile(
      "ld1    {v0.4s}, [%[src]]   \n\t"  // v0 = a (4 x int32)
      "scvtf  v1.4s, v0.4s        \n\t"  // float32 <- int32, per lane
      "st1    {v1.4s}, [%[dst]]   \n\t"  // x[0] = v1
      :
      : [dst] "r"(x), [src] "r"(&a)
      : "v0", "v1", "memory");
}

// Converts the four floats in a to signed 32-bit integers with FCVTNS
// (round to nearest, ties to even) and stores the result vector to x[0];
// x[1] is not written.
//
// The pointers are bound as asm operands rather than assumed to be in x0/x1
// (the old clobber list even named x0/x1 while reading them), and "memory"
// tells the compiler that the statement reads a and writes x[0].
void I32fromF32(Vec128 x[2], const Vec128& a) {
  __asm volatile(
      "ld1    {v0.4s}, [%[src]]   \n\t"  // v0 = a (4 x float32)
      "fcvtns v1.4s, v0.4s        \n\t"  // int32 <- float32, per lane
      "st1    {v1.4s}, [%[dst]]   \n\t"  // x[0] = v1
      :
      : [dst] "r"(x), [src] "r"(&a)
      : "v0", "v1", "memory");
}

// Converts the two signed 64-bit integers in a to double precision (SCVTF)
// and stores the result vector to x[0]; x[1] is not written.
//
// The pointers are bound as asm operands rather than assumed to be in x0/x1
// (the old clobber list even named x0/x1 while reading them), and "memory"
// tells the compiler that the statement reads a and writes x[0].
void F64fromI64(Vec128 x[2], const Vec128& a) {
  __asm volatile(
      "ld1    {v0.2d}, [%[src]]   \n\t"  // v0 = a (2 x int64)
      "scvtf  v1.2d, v0.2d        \n\t"  // float64 <- int64, per lane
      "st1    {v1.2d}, [%[dst]]   \n\t"  // x[0] = v1
      :
      : [dst] "r"(x), [src] "r"(&a)
      : "v0", "v1", "memory");
}

// Converts the two doubles in a to signed 64-bit integers with FCVTNS
// (round to nearest, ties to even) and stores the result vector to x[0];
// x[1] is not written.
//
// The pointers are bound as asm operands rather than assumed to be in x0/x1
// (the old clobber list even named x0/x1 while reading them), and "memory"
// tells the compiler that the statement reads a and writes x[0].
void I64fromF64(Vec128 x[2], const Vec128& a) {
  __asm volatile(
      "ld1    {v0.2d}, [%[src]]   \n\t"  // v0 = a (2 x float64)
      "fcvtns v1.2d, v0.2d        \n\t"  // int64 <- float64, per lane
      "st1    {v1.2d}, [%[dst]]   \n\t"  // x[0] = v1
      :
      : [dst] "r"(x), [src] "r"(&a)
      : "v0", "v1", "memory");
}


// Converts the four unsigned 32-bit integers in a to single precision (UCVTF)
// and stores the result vector to x[0]; x[1] is not written.
//
// The pointers are bound as asm operands rather than assumed to be in x0/x1
// (the old clobber list even named x0/x1 while reading them), and "memory"
// tells the compiler that the statement reads a and writes x[0].
void F32fromU32(Vec128 x[2], const Vec128& a) {
  __asm volatile(
      "ld1    {v0.4s}, [%[src]]   \n\t"  // v0 = a (4 x uint32)
      "ucvtf  v1.4s, v0.4s        \n\t"  // float32 <- uint32, per lane
      "st1    {v1.4s}, [%[dst]]   \n\t"  // x[0] = v1
      :
      : [dst] "r"(x), [src] "r"(&a)
      : "v0", "v1", "memory");
}

// Converts the four floats in a to unsigned 32-bit integers with FCVTNU
// (round to nearest, ties to even) and stores the result vector to x[0];
// x[1] is not written.
//
// The pointers are bound as asm operands rather than assumed to be in x0/x1
// (the old clobber list even named x0/x1 while reading them), and "memory"
// tells the compiler that the statement reads a and writes x[0].
void U32fromF32(Vec128 x[2], const Vec128& a) {
  __asm volatile(
      "ld1    {v0.4s}, [%[src]]   \n\t"  // v0 = a (4 x float32)
      "fcvtnu v1.4s, v0.4s        \n\t"  // uint32 <- float32, per lane
      "st1    {v1.4s}, [%[dst]]   \n\t"  // x[0] = v1
      :
      : [dst] "r"(x), [src] "r"(&a)
      : "v0", "v1", "memory");
}

// Converts the two unsigned 64-bit integers in a to double precision (UCVTF)
// and stores the result vector to x[0]; x[1] is not written.
//
// The pointers are bound as asm operands rather than assumed to be in x0/x1
// (the old clobber list even named x0/x1 while reading them), and "memory"
// tells the compiler that the statement reads a and writes x[0].
void F64fromU64(Vec128 x[2], const Vec128& a) {
  __asm volatile(
      "ld1    {v0.2d}, [%[src]]   \n\t"  // v0 = a (2 x uint64)
      "ucvtf  v1.2d, v0.2d        \n\t"  // float64 <- uint64, per lane
      "st1    {v1.2d}, [%[dst]]   \n\t"  // x[0] = v1
      :
      : [dst] "r"(x), [src] "r"(&a)
      : "v0", "v1", "memory");
}

// Converts the two doubles in a to unsigned 64-bit integers with FCVTNU
// (round to nearest, ties to even) and stores the result vector to x[0];
// x[1] is not written.
//
// The pointers are bound as asm operands rather than assumed to be in x0/x1
// (the old clobber list even named x0/x1 while reading them), and "memory"
// tells the compiler that the statement reads a and writes x[0].
void U64fromF64(Vec128 x[2], const Vec128& a) {
  __asm volatile(
      "ld1    {v0.2d}, [%[src]]   \n\t"  // v0 = a (2 x float64)
      "fcvtnu v1.2d, v0.2d        \n\t"  // uint64 <- float64, per lane
      "st1    {v1.2d}, [%[dst]]   \n\t"  // x[0] = v1
      :
      : [dst] "r"(x), [src] "r"(&a)
      : "v0", "v1", "memory");
}


// Narrows four doubles to four floats: the two doubles of a become lanes 0-1
// of x[0] (FCVTN) and the two doubles of b become lanes 2-3 (FCVTN2).
// x[1] is not written.
//
// The pointers are bound as asm operands rather than assumed to be in
// x0/x1/x2 (the old clobber list even named x0/x1 while reading them), and
// "memory" tells the compiler that the statement reads a/b and writes x[0].
void F32fromF64(Vec128 x[2], const Vec128& a, const Vec128& b) {
  __asm volatile(
      "ld1    {v0.2d}, [%[lo]]    \n\t"  // v0 = a (2 x float64)
      "ld1    {v2.2d}, [%[hi]]    \n\t"  // v2 = b (2 x float64)
      "fcvtn  v1.2s, v0.2d        \n\t"  // v1 lanes 0-1 <- a
      "fcvtn2 v1.4s, v2.2d        \n\t"  // v1 lanes 2-3 <- b
      "st1    {v1.4s}, [%[dst]]   \n\t"  // x[0] = v1
      :
      : [dst] "r"(x), [lo] "r"(&a), [hi] "r"(&b)
      : "v0", "v1", "v2", "memory");
}

// Widens four floats to four doubles: lanes 0-1 of a go to x[0] (FCVTL) and
// lanes 2-3 go to x[1] (FCVTL2). Both result vectors are written by a single
// two-register st1, i.e. 32 contiguous bytes starting at x.
//
// The pointers are bound as asm operands rather than assumed to be in x0/x1
// (the old clobber list even named x0/x1 while reading them), and "memory"
// tells the compiler that the statement reads a and writes x[0..1].
void F64fromF32(Vec128 x[2], const Vec128& a) {
  __asm volatile(
      "ld1    {v0.4s}, [%[src]]          \n\t"  // v0 = a (4 x float32)
      "fcvtl  v1.2d, v0.2s               \n\t"  // v1 <- lanes 0-1 as float64
      "fcvtl2 v2.2d, v0.4s               \n\t"  // v2 <- lanes 2-3 as float64
      "st1    {v1.2d, v2.2d}, [%[dst]]   \n\t"  // x[0] = v1, x[1] = v2
      :
      : [dst] "r"(x), [src] "r"(&a)
      : "v0", "v1", "v2", "memory");
}

ch15_03/main.cpp の実行例

arm64@manet ch15_03 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet ch15_03 % ./a.out

Results for CvtOp::F32_I32
a:                  10            -500  |             600           -1024
x[0]:        10.000000     -500.000000  |      600.000000    -1024.000000

Results for CvtOp::I32_F32
a:           -1.250000      100.875000  |     -200.000000        3.141593
x[0]:               -1             101  |            -200               3

Results for CvtOp::F64_I64
a:                                1000  |                   -500000000000
x[0]:                1000.000000000000  |      -500000000000.000000000000

Results for CvtOp::I64_F64
a:                   -122.666666670000  |      1234567890123.750000000000
x[0]:                             -123  |                   1234567890124

Results for CvtOp::F32_U32
a:                  10             500  |             600            1024
x[0]:        10.000000      500.000000  |      600.000000     1024.000000

Results for CvtOp::U32_F32
a:            1.250000      100.875000  |      200.000000        3.141593
x[0]:                1             101  |             200               3

Results for CvtOp::F64_U64
a:                                1000  |                    420000000000
x[0]:                1000.000000000000  |       420000000000.000000000000

Results for CvtOp::U64_F64
a:                    698.400000000000  |      1234567890123.750000000000
x[0]:                              698  |                   1234567890124

Results for CvtOp::F32_F64
a:                      3.141592653590  |                  0.434294481903
b:                     -2.718281828459  |                  0.693147180560
x[0]:         3.141593        0.434294  |       -2.718282        0.693147

Results for CvtOp::F64_F32
a:            0.111111      100.875000  |      200.000000        1.414214
x[0]:                   0.111111111939  |                100.875000000000
x[1]:                 200.000000000000  |                  1.414213538170

ch15_04

ch15_04/main.cpp

#include <iostream>
#include <iomanip>
#include <string>
#include <random>
#include "AlignedMem.h"

using namespace std;

extern bool CalcCorrCoef_(float* rho, float sums[5], const float* x, const float* y, size_t n, float epsilon);

// Alignment requested for the x/y sample arrays: passed to the AlignedArray
// allocations in main() and to the AlignedMem::IsAligned checks in
// CalcCorrCoef (presumably a byte count - confirm against AlignedMem.h).
const size_t c_Alignment = 16;

// Fills x[0..n) and y[0..n) with deterministic pseudo-random test data.
// Each x value is a rounded draw from a uniform distribution over [0, 50);
// the matching y value is x plus a rounded draw from a normal distribution
// (mean 25, standard deviation 7). The same seed always yields the same data
// on a given standard library.
void Init(float* x, float* y, size_t n, unsigned int seed)
{
    mt19937 engine {seed};
    uniform_real_distribution<float> x_dist {0.0, 50.0};
    normal_distribution<float> offset_dist {25.0, 7.0};

    float* const x_end = x + n;
    while (x != x_end) {
        // Draw order matters for reproducibility: x first, then the offset.
        const float x_val = round(x_dist(engine));
        const float offset = round(offset_dist(engine));
        *x++ = x_val;
        *y++ = x_val + offset;
    }
}

// Reference C++ computation of the correlation coefficient of x[0..n) and
// y[0..n).
//
// On return sums[] holds { sum_x, sum_y, sum_xx, sum_yy, sum_xy }.
// Returns true and stores the coefficient in *rho when the denominator is at
// least epsilon; otherwise stores 0 in *rho and returns false.
// Returns false without touching rho or sums when n is 0 or when x or y fails
// the c_Alignment check.
bool CalcCorrCoef(float* rho, float sums[5], const float* x, const float* y, size_t n, float epsilon)
{
    // Argument validation: non-empty input and aligned arrays.
    if (n == 0 || !AlignedMem::IsAligned(x, c_Alignment) || !AlignedMem::IsAligned(y, c_Alignment))
        return false;

    // Accumulators, in the same order as the sums[] output.
    float acc_x = 0, acc_y = 0, acc_xx = 0, acc_yy = 0, acc_xy = 0;

    for (size_t k = 0; k != n; ++k) {
        const float xv = x[k];
        const float yv = y[k];
        acc_x += xv;
        acc_y += yv;
        acc_xx += xv * xv;
        acc_yy += yv * yv;
        acc_xy += xv * yv;
    }

    sums[0] = acc_x;
    sums[1] = acc_y;
    sums[2] = acc_xx;
    sums[3] = acc_yy;
    sums[4] = acc_xy;

    // rho = (n*Sxy - Sx*Sy) / (sqrt(n*Sxx - Sx^2) * sqrt(n*Syy - Sy^2))
    const float numerator = n * acc_xy - acc_x * acc_y;
    const float denominator = sqrt(n * acc_xx - acc_x * acc_x) * sqrt(n * acc_yy - acc_y * acc_y);

    // Written as a negated >= so that a NaN denominator takes the failure path.
    if (!(denominator >= epsilon)) {
        *rho = 0;
        return false;
    }

    *rho = numerator / denominator;
    return true;
}

// Entry point: builds a 103-element test data set, computes the correlation
// coefficient with both the C++ and the assembly implementation, and prints
// the intermediate sums and rho side by side. Returns 1 if either
// implementation reports failure.
int main() {
    const size_t n = 103;
    const float epsilon = 1.0e-9;

    AlignedArray<float> x_aa(n, c_Alignment);
    AlignedArray<float> y_aa(n, c_Alignment);
    float* x = x_aa.Data();
    float* y = y_aa.Data();

    Init(x, y, n, 71);

    float cpp_sums[5], asm_sums[5];
    float cpp_rho, asm_rho;
    const bool cpp_ok = CalcCorrCoef(&cpp_rho, cpp_sums, x, y, n, epsilon);
    const bool asm_ok = CalcCorrCoef_(&asm_rho, asm_sums, x, y, n, epsilon);

    cout << "Results for CalcCorrCoef\n\n";

    if (!(cpp_ok && asm_ok)) {
        cout << "Invalid return code ";
        cout << "rc1 = " << boolalpha << cpp_ok << ", ";
        cout << "rc2 = " << boolalpha << asm_ok << '\n';
        return 1;
    }

    const int w = 14;
    const string sep(w * 3, '-');

    // Table header
    cout << fixed << setprecision(6);
    cout << "Value    " << setw(w) << "C++" << " " << setw(w) << "A64 SIMD" << '\n';
    cout << sep << '\n';

    // One row per accumulated sum, then rho.
    static const char* const row_names[5] = {
        "sum_x:   ", "sum_y:   ", "sum_xx:  ", "sum_yy:  ", "sum_xy:  "
    };

    cout << setprecision(2);
    for (int k = 0; k < 5; ++k) {
        cout << row_names[k] << setw(w) << cpp_sums[k] << " " << setw(w) << asm_sums[k] << '\n';
    }
    cout << "rho:     " << setw(w) << cpp_rho << " " << setw(w) << asm_rho << '\n';

    return 0;
}

ch15_04/neon.cpp

#include "Vec128.h"

//#define UpdateSums(VregX, VregY)					\
//	fadd	v16.4s, v16.4s, \VregX\().4s     // update sum_x \n\
//	fadd	v17.4s, v17.4s, \VregX\().4s     // update sum_y \n\



// AArch64 SIMD counterpart of the C++ CalcCorrCoef in main.cpp: accumulates
// sum_x, sum_y, sum_xx, sum_yy and sum_xy over x[0..n) / y[0..n), stores them
// to sums[0..4], then computes rho.
//
// Structure of the asm:
//   * argument checks: n == 0, or x / y not 16-byte aligned -> LInvalidArg
//   * LLoop1 handles 16 floats per iteration with four UpdateSums expansions
//   * the packed accumulators are reduced with pairwise adds (faddp)
//   * LLoop2 handles the remaining elements one float at a time
//   * rho = (n*sum_xy - sum_x*sum_y) /
//           (sqrt(n*sum_xx - sum_x^2) * sqrt(n*sum_yy - sum_y^2))
//     if the denominator compares below epsilon, 0 is stored to *rho instead.
//
// NOTE(review) - things this block relies on that the compiler is not told:
//   * The asm has no operands. It reads x0..x4 and s0 directly, i.e. it assumes
//     rho, sums, x, y, n and epsilon are still in their incoming argument
//     registers when the statement executes - confirm for the build flags used.
//   * It returns by itself: every path sets w0 (1 = success, 0 = failure) and
//     executes `ret` inside the asm, so any compiler-generated epilogue is
//     skipped. The function is declared void here while main.cpp declares it
//     as returning bool.
//   * The clobber list does not mention v6, v7, v21, x2..x5 or "memory", all of
//     which the asm writes.
//   * The labels and the UpdateSums macro are not made unique per expansion,
//     so the statement presumably cannot be emitted twice in one translation
//     unit. LReturn is defined but never branched to.
//   * The vector-suffix mnemonics (fadd.4s, fmla.4s, fmls.4s) are not the
//     standard A64 spelling used in the other examples; presumably an
//     assembler-specific syntax - verify against the toolchain in use.
void CalcCorrCoef_(float* rho, float sums[5], const float* x, const float* y, size_t n, float epsilon) {
  __asm volatile ("\n\
	                                              \n\
	.macro UpdateSums VregX, VregY                \n\
	fadd.4s	v16, v16, \\VregX\\() // sum_x: v16 += VregX  \n\
	fadd.4s	v17, v17, \\VregY\\() // sum_y: v17 += VregX  \n\
	fmla.4s	v18, \\VregX\\(), \\VregX\\() // sum_xx: v18 += VregX^2 \n\
	fmla.4s	v19, \\VregY\\(), \\VregY\\() // sum_yy: v19 += VregY^2 \n\
	fmla.4s	v20, \\VregX\\(), \\VregY\\() // sum_xy: v20 += VregX * VregY \n\
	.endm                                         \n\
	                                              \n\
	cbz	x4, LInvalidArg // if n == 0 goto END \n\
	tst	x2, 0x0f            // if (x2 != 15)  \n\
	b.ne	LInvalidArg     //   goto END         \n\
	tst	x3, 0x0f            // if (x3 != 15)  \n\
	b.ne	LInvalidArg     //   goto END         \n\
	mov	x5, x4	        // save n to x5       \n\
	                                              \n\
	eor	v16.16b, v16.16b, v16.16b // sum_x = 0    \n\
	eor	v17.16b, v17.16b, v17.16b // sum_y = 0    \n\
	eor	v18.16b, v18.16b, v18.16b // sum_xx = 0   \n\
	eor	v19.16b, v19.16b, v19.16b // sum_yy = 0   \n\
	eor	v20.16b, v20.16b, v20.16b // sum_xy = 0   \n\
	                                              \n\
	cmp	x4, 16          // if n<=16           \n\
	b.lo	LSkipLoop1      //   goto SkipLoop1   \n\
	                                              \n\
LLoop1:	                                              \n\
	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [x2], 64 // load x[0:16] \n\
	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [x3], 64 // load y[0:16] \n\
	                                              \n\
	UpdateSums v0, v4                             \n\
	UpdateSums v1, v5                             \n\
	UpdateSums v2, v6                             \n\
	UpdateSums v3, v7                             \n\
	sub	x4, x4, 16               // n -= 16   \n\
	cmp	x4, 16              // if x4 >= 16    \n\
	b.hs	LLoop1             //   goto Loop     \n\
	                                              \n\
LSkipLoop1:                                           \n\
	faddp	v16.4s, v16.4s, v16.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
	faddp	v16.4s, v16.4s, v16.4s // s16 = lane0+lane1      \n\
	faddp	v17.4s, v17.4s, v17.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
	faddp	v17.4s, v17.4s, v17.4s // s17 = lane0+lane1      \n\
	faddp	v18.4s, v18.4s, v18.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
	faddp	v18.4s, v18.4s, v18.4s // s18 = lane0+lane1      \n\
	faddp	v19.4s, v19.4s, v19.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
	faddp	v19.4s, v19.4s, v19.4s // s19 = lane0+lane1      \n\
	faddp	v20.4s, v20.4s, v20.4s // lane0=lane0+lane1,lane1=lane2+lane3 \n\
	faddp	v20.4s, v20.4s, v20.4s // s20 = lane0+lane1      \n\
	cbz	x4, LSkipLoop2   // if x4==0 goto SkipLoop2      \n\
	                                              \n\
LLoop2:	                                              \n\
	ldr	s1, [x2], 4       // s1 = x; x+=4     \n\
	ldr	s2, [x3], 4       // s2 = y; y+=4     \n\
	fadd	s16, s16, s1      // s16 += s1        \n\
	fadd	s17, s17, s2      // s17 += s2        \n\
	fmla.4s v18, v1, v1[0] // v18 += s1 * s1   \n\
	fmla.4s v19, v2, v2[0] // f19 += s2 * s2   \n\
	fmla.4s v20, v1, v2[0] // f20 += s1 * s2   \n\
	subs	x4, x4, 1         // if (--n !=0)     \n\
	b.ne LLoop2               //   goto Loop2     \n\
	                                              \n\
LSkipLoop2:                                           \n\
	stp	s16, s17, [x1], 8 // [x1]=s16,s17; x1+=8 \n\
	stp	s18, s19, [x1], 8 // [x1]=s18,s19; x1+=8 \n\
	str	s20, [x1]         // [x1]=s20; x1+=8  \n\
	                                              \n\
	// rho numerator                              \n\
	scvtf	s21, x5	          // s21 = n          \n\
	fmul	s1, s21, s20      // s1 = n * sum_xy  \n\
	fmls.4s v1, v16, v17[0]   // s1 -= sum_x * sum_y \n\
	                                              \n\
	// rho denominator                            \n\
	fmul	s2, s21, s18      // s2 = n * sum_xx  \n\
	fmsub	s2, s16, s16, s2  // s2 = s2 - sum_x * sum_x \n	\
	fsqrt	s2, s2            // s2 = sqrt(s2)    \n\
	                                              \n\
	fmul	s3, s21, s19      // s2 = n * sum_yy \n\
	fmsub	s3, s17, s17, s3  // s2 = s3 - sum_y * sum_y \n\
	fsqrt	s3, s3           // s3 = sqrt(s3)     \n\
	                                              \n\
	fmul	s4, s2, s3       // s4 = s2 * s3      \n\
	fcmp	s4, s0           // if rho_den < epsilon \n\
	b.lo	LBadRhoDen       //   goto BadRhoDen  \n\
	                                              \n\
	fdiv	s5, s1, s4       // s5 = rho          \n\
	str	s5, [x0]         // [x0] = s5         \n\
	mov	w0, 1            // return code: success \n\
	ret                                           \n\
	                                              \n\
LBadRhoDen:                                           \n\
	eor	v5.16b, v5.16b, v5.16b   // rho = 0   \n\
	str	s5, [x0]         // [x0] = rho        \n\
	mov	w0, 0		// return code: fail  \n\
	ret                                           \n\
	                                              \n\
LInvalidArg:                                          \n\
	mov	w0, 0		// return code: fail  \n\
	ret                                           \n\
LReturn:                                              \n\
	"
	:
	:
	: "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "x0", "x1"
		  );
}

ch15_04/main.cpp の実行例

arm64@manet ch15_04 % g++ -I.. -std=c++11 -S neon.cpp
arm64@manet ch15_04 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet ch15_04 % ./a.out
Results for CalcCorrCoef

Value               C++       A64 SIMD
------------------------------------------
sum_x:          2567.00        2567.00
sum_y:          5160.00        5160.00
sum_xx:        88805.00       88805.00
sum_yy:       287412.00      287412.00
sum_xy:       153065.00      153065.00
rho:               0.91           0.91

ch15_06

ch15_06/main.cpp

#include <iostream>
#include <iomanip>
#include "MatrixF32.h"

using namespace std;

// 4x4 single-precision matrix multiply (m_des = m_src1 * m_src2).
// The definition lives in neon.cpp and is written with AArch64 SIMD inline assembly.
extern void Mat4x4MulF32_(float *m_des, const float* m_src1, const float *m_src2);

// Multiplies m_src1 by m_src2 two ways -- with MatrixF32::Mul4x4 and with the
// Mat4x4MulF32_ routine -- then prints both operands and both products.
// A warning line is printed when the two products are not equal.
void Mat4x4MulF32(MatrixF32& m_src1, MatrixF32& m_src2) {
    const char nl = '\n';
    const size_t num_rows = m_src1.GetNumRows();
    const size_t num_cols = m_src2.GetNumCols();

    // m_des1 receives the reference result, m_des2 the SIMD result.
    MatrixF32 m_des1(num_rows, num_cols);
    MatrixF32 m_des2(num_rows, num_cols);
    MatrixF32::Mul4x4(m_des1, m_src1, m_src2);
    Mat4x4MulF32_(m_des2.Data(), m_src1.Data(), m_src2.Data());

    // Use the same stream formatting for every matrix that gets printed.
    cout << fixed << setprecision(1);
    MatrixF32* const printed[] = { &m_src1, &m_src2, &m_des1, &m_des2 };
    for (MatrixF32* m : printed)
        m->SetOstream(12, "  ");

    cout << "\nResults for Mat4x4MulF32\n";
    cout << "Matrix m_src1\n" << m_src1 << nl;
    cout << "Matrix m_src2\n" << m_src2 << nl;
    cout << "Matrix m_des1\n" << m_des1 << nl;
    cout << "Matrix m_des2\n" << m_des2 << nl;

    if (m_des1 != m_des2) {
        cout << "\nMatrix compare failed - Mat4x4MulF32\n";
    }
}

// Fills two 4x4 matrices with fixed test values and passes them to
// Mat4x4MulF32 for multiplication and display.
void Mat4x4MulF32Test(void) {
    const size_t dim = 4;
    MatrixF32 m_src1(dim, dim);
    MatrixF32 m_src2(dim, dim);

    // Row data for the left-hand operand.
    const float src1_rows[][4] = {
        { 10, 11, 12, 13 },
        { 20, 21, 22, 23 },
        { 30, 31, 32, 33 },
        { 40, 41, 42, 43 },
    };
    // Row data for the right-hand operand.
    const float src2_rows[][4] = {
        { 100, 101, 102, 103 },
        { 200, 201, 202, 203 },
        { 300, 301, 302, 303 },
        { 400, 401, 402, 403 },
    };

    for (int row = 0; row < 4; ++row)
        m_src1.SetRow(row, src1_rows[row]);
    for (int row = 0; row < 4; ++row)
        m_src2.SetRow(row, src2_rows[row]);

    Mat4x4MulF32(m_src1, m_src2);
}

// Entry point: runs the 4x4 matrix multiplication demo.
int main()
{
    Mat4x4MulF32Test();
    // Falling off the end of main is equivalent to "return 0".
}

ch15_06/neon.cpp

#include "Vec128.h"

void Mat4x4MulF32_(float *m_des, const float* m_src1, const float *m_src2) {
  __asm volatile("\n\
	ld1	{v0.4s-v3.4s}, [x1]           // m_src1                                 \n\
	ld1	{v4.4s-v7.4s}, [x2]           // m_src2                                 \n\
	                                                                                \n\
	// Row 0                                                                        \n\
	fmul	v16.4s, v4.4s, v0.s[0]       // v16 = v4 * v0.lane0                     \n\
	fmla	v16.4s, v5.4s, v0.s[1]       // v16 += v5 * v0.lane1                    \n\
	fmla	v16.4s, v6.4s, v0.s[2]       // v16 += v6 * v0.lane2                    \n\
	fmla	v16.4s, v7.4s, v0.s[3]       // v16 += v6 * v0.lane3                    \n\
	st1	{v16.4s}, [x0], 16                                                      \n\
	                                                                                \n\
	// Row 1                                                                        \n\
	fmul	v17.4s, v4.4s, v1.s[0]       // v17 = v4 * v1.lane0                     \n\
	fmla	v17.4s, v5.4s, v1.s[1]       // v17 += v5 * v1.lane1                    \n\
	fmla	v17.4s, v6.4s, v1.s[2]       // v17 += v6 * v1.lane2                    \n\
	fmla	v17.4s, v7.4s, v1.s[3]       // v17 += v6 * v1.lane3                    \n\
	st1	{v17.4s}, [x0], 16                                                      \n\
	                                                                                \n\
	// Row 2                                                                        \n\
	fmul	v18.4s, v4.4s, v2.s[0]       // v18 = v4 * v2.lane0                     \n\
	fmla	v18.4s, v5.4s, v2.s[1]       // v18 += v5 * v2.lane1                    \n\
	fmla	v18.4s, v6.4s, v2.s[2]       // v18 += v6 * v2.lane2                    \n\
	fmla	v18.4s, v7.4s, v2.s[3]       // v18 += v6 * v2.lane3                    \n\
	st1	{v18.4s}, [x0], 16                                                      \n\
	                                                                                \n\
	// Row 3                                                                        \n\
	fmul	v19.4s, v4.4s, v3.s[0]       // v19 = v4 * v3.lane0                     \n\
	fmla	v19.4s, v5.4s, v3.s[1]       // v19 += v5 * v3.lane1                    \n\
	fmla	v19.4s, v6.4s, v3.s[2]       // v19 += v6 * v3.lane2                    \n\
	fmla	v19.4s, v7.4s, v3.s[3]       // v19 += v6 * v3.lane3                    \n\
	st1	{v19.4s}, [x0], 16                                                      \n\
"
		 :
		 :
		 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "x0"
		 );
}

ch15_06/main.cpp の実行例

arm64@manet Ch15_06 % g++ -I.. -std=c++11 -O -S neon.cpp
arm64@manet Ch15_06 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet Ch15_06 % ./a.out

Results for Mat4x4MulF32
Matrix m_src1
        10.0          11.0          12.0          13.0
        20.0          21.0          22.0          23.0
        30.0          31.0          32.0          33.0
        40.0          41.0          42.0          43.0

Matrix m_src2
       100.0         101.0         102.0         103.0
       200.0         201.0         202.0         203.0
       300.0         301.0         302.0         303.0
       400.0         401.0         402.0         403.0

Matrix m_des1
     12000.0       12046.0       12092.0       12138.0
     22000.0       22086.0       22172.0       22258.0
     32000.0       32126.0       32252.0       32378.0
     42000.0       42166.0       42332.0       42498.0

Matrix m_des2
     12000.0       12046.0       12092.0       12138.0
     22000.0       22086.0       22172.0       22258.0
     32000.0       32126.0       32252.0       32378.0
     42000.0       42166.0       42332.0       42498.0

arm64@manet Ch15_06 %

ch15_07

ch15_07/main.cpp

#include <iostream>
#include <iomanip>
#include "MatrixF32.h"

using namespace std;

// 4x4 single-precision matrix transpose (m_des = transpose of m_src1).
// The definition lives in neon.cpp and is written with AArch64 SIMD inline assembly.
extern void Mat4x4TransposeF32_(float* m_des, const float* m_src1);

// Transposes m_src1 two ways -- with MatrixF32::Transpose and with the
// Mat4x4TransposeF32_ routine -- then prints the source and both results.
// A warning line is printed when the two results are not equal.
void Mat4x4TestF32(MatrixF32& m_src1) {
    // The destination dimensions are the source dimensions swapped.
    const size_t des_rows = m_src1.GetNumCols();
    const size_t des_cols = m_src1.GetNumRows();

    MatrixF32 m_des1(des_rows, des_cols);
    MatrixF32 m_des2(des_rows, des_cols);
    MatrixF32::Transpose(m_des1, m_src1);
    Mat4x4TransposeF32_(m_des2.Data(), m_src1.Data());

    // Use the same stream formatting for every matrix that gets printed.
    cout << fixed << setprecision(1);
    MatrixF32* const printed[] = { &m_src1, &m_des1, &m_des2 };
    for (MatrixF32* m : printed)
        m->SetOstream(12, "  ");

    cout << "\nResults for Mat4x4TestF32\n";
    cout << "Matrix m_src1\n" << m_src1 << '\n';
    cout << "Matrix m_des1 (transpose of m_src1)\n" << m_des1 << '\n';
    cout << "Matrix m_des2 (transpose of m_src1)\n" << m_des2 << '\n';

    if (m_des1 != m_des2) {
        cout << "\nMatrix transpose compare failed\n";
    }
}

// Fills a 4x4 matrix with fixed test values and passes it to
// Mat4x4TestF32(MatrixF32&) for transposition and display.
void Mat4x4TestF32(void) {
    const size_t dim = 4;
    MatrixF32 m_src1(dim, dim);

    // Row data for the source matrix.
    const float src1_rows[][4] = {
        { 10, 11, 12, 13 },
        { 20, 21, 22, 23 },
        { 30, 31, 32, 33 },
        { 40, 41, 42, 43 },
    };

    for (int row = 0; row < 4; ++row)
        m_src1.SetRow(row, src1_rows[row]);

    Mat4x4TestF32(m_src1);
}

// Entry point: runs the 4x4 matrix transpose demo.
int main()
{
    Mat4x4TestF32();
    // Falling off the end of main is equivalent to "return 0".
}

ch15_07/neon.cpp

#include "Vec128.h"

void Mat4x4TransposeF32_(float* m_des, const float* m_src1) {
  __asm volatile ("\n\
	ld1	{v0.4s-v3.4s}, [x1]                                 \n\
	trn1	v4.4s, v0.4s, v1.4s            // a0 b0 a2 b2       \n\
	trn2	v5.4s, v0.4s, v1.4s            // a1 b1 a3 b3       \n\
	trn1	v6.4s, v2.4s, v3.4s            // c0 d0 c2 d2       \n\
	trn2	v7.4s, v2.4s, v3.4s            // c1 d1 c3 d3       \n\
	trn1	v0.2d, v4.2d, v6.2d            // a0 b0 c0 d0       \n\
	trn1	v1.2d, v5.2d, v7.2d            // a1 b1 c1 d1       \n\
	trn2	v2.2d, v4.2d, v6.2d            // a2 b2 c2 d2       \n\
	trn2	v3.2d, v5.2d, v7.2d            // a3 b3 c3 d3       \n\
	st1	{v0.4s-v3.4s}, [x0]                                 \n\
	"
	:
	:
	: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
	);
}

ch15_07/main.cpp の実行例

arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O -S neon.cpp
arm64@manet Ch15_07 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet Ch15_07 % ./a.out

Results for Mat4x4TestF32
Matrix m_src1
        10.0          11.0          12.0          13.0
        20.0          21.0          22.0          23.0
        30.0          31.0          32.0          33.0
        40.0          41.0          42.0          43.0

Matrix m_des1 (transpose of m_src1)
        10.0          20.0          30.0          40.0
        11.0          21.0          31.0          41.0
        12.0          22.0          32.0          42.0
        13.0          23.0          33.0          43.0

Matrix m_des2 (transpose of m_src1)
        10.0          20.0          30.0          40.0
        11.0          21.0          31.0          41.0
        12.0          22.0          32.0          42.0
        13.0          23.0          33.0          43.0

arm64@manet Ch15_07 %

http://ynitta.com/