| ch15_02/main.cpp |
#include <iostream>
#include <iomanip>
#include <cmath>
#include "Vec128.h"
using namespace std;
// Packed floating-point comparison kernels; only declared here, the
// definitions live outside this file.
//   x : receives eight 128-bit result vectors; the callers below print x[j]
//       next to the label c_CmpStr[j]
//   a, b : the two 128-bit operands that are compared lane by lane
// The F32 variant is used with the m_F32 lanes, the F64 variant with m_F64.
extern void PackedCompareF32_(Vec128 x[8], const Vec128& a, const Vec128& b);
extern void PackedCompareF64_(Vec128 x[8], const Vec128& a, const Vec128& b);
// Row labels for the eight comparison results. Entry j labels result vector
// x[j] when the drivers below print their tables, so the order here has to
// stay in step with the order in which the kernels store their results.
const char* c_CmpStr[8] = {
"EQ", "NE", "GT", "GE", "LT", "LE", "a LT0", "b GT0"
};
void PackedCompareF32(void) {
const char nl = '\n';
Vec128 x[8], a, b;
a.m_F32[0] = 2.0; b.m_F32[0] = -4.0;
a.m_F32[1] = 17.0; b.m_F32[1] = 12.0;
a.m_F32[2] = -6.0; b.m_F32[2] = -6.0;
a.m_F32[3] = 3.0; b.m_F32[3] = 8.0;
PackedCompareF32_(x, a, b);
cout << "\nResults for PackedCompareF32_\n";
cout << setw(11) << 'a' << ':' << a.ToStringF32() << nl;
cout << setw(11) << 'b' << ':' << b.ToStringF32() << nl;
cout << nl;
for (int j = 0; j < 8; j++)
cout << setw(11) << c_CmpStr[j] << ':' << x[j].ToStringX32() << nl;
}
void PackedCompareF64(void)
{
const char nl = '\n';
Vec128 x[8], a, b;
a.m_F64[0] = -2.0; b.m_F64[0] = -4.0;
a.m_F64[1] = M_SQRT2; b.m_F64[1] = M_PI;
PackedCompareF64_(x, a, b);
cout << "\nResults for PackedCompareF64_\n";
cout << setw(11) << 'a' << ':' << a.ToStringF64() << nl;
cout << setw(11) << 'b' << ':' << b.ToStringF64() << nl;
cout << nl;
for (int j = 0; j < 8; j++)
cout << setw(11) << c_CmpStr[j] << ':' << x[j].ToStringX64() << nl;
}
// Entry point: runs the single-precision demo, then the double-precision one.
int main()
{
    // Demos in the order their output should appear.
    void (*const demos[])(void) = { PackedCompareF32, PackedCompareF64 };

    for (auto demo : demos)
        demo();

    return 0;
}
|
| ch15_02/neon.cpp |
#include "Vec128.h"
// Packed single-precision compare kernel (AArch64 Advanced SIMD).
//
// Loads the four 32-bit float lanes of a and b and stores eight 128-bit
// result vectors to x, 16 bytes apart, in this order:
//   x[0] a == b    x[1] a != b    x[2] a >  b    x[3] a >= b
//   x[4] a <  b    x[5] a <= b    x[6] a <  0    x[7] b >  0
// The compare instructions produce an all-ones lane where the predicate
// holds and an all-zeros lane otherwise; x[1] is the bitwise NOT of x[0].
//
// The pointers are handed to the assembler as named operands rather than
// assumed to sit in x0/x1/x2, so the code stays correct however the compiler
// allocates registers or inlines this function. The "memory" clobber tells
// the compiler that the asm reads *a / *b and writes x[0..7].
void PackedCompareF32_(Vec128 x[8], const Vec128& a, const Vec128& b) {
    Vec128* out = x;    // advanced by 16 bytes after every store

    __asm volatile(
        "ld1    {v0.4s}, [%[lhs]]           \n\t"   // v0 = a
        "ld1    {v1.4s}, [%[rhs]]           \n\t"   // v1 = b

        "fcmeq  v2.4s, v0.4s, v1.4s         \n\t"   // packed a == b
        "st1    {v2.4s}, [%[out]], #16      \n\t"   // *out = v2; out += 16

        "not    v2.16b, v2.16b              \n\t"   // packed a != b (invert EQ mask)
        "st1    {v2.4s}, [%[out]], #16      \n\t"

        "fcmgt  v2.4s, v0.4s, v1.4s         \n\t"   // packed a > b
        "st1    {v2.4s}, [%[out]], #16      \n\t"

        "fcmge  v2.4s, v0.4s, v1.4s         \n\t"   // packed a >= b
        "st1    {v2.4s}, [%[out]], #16      \n\t"

        "fcmlt  v2.4s, v0.4s, v1.4s         \n\t"   // packed a < b
        "st1    {v2.4s}, [%[out]], #16      \n\t"

        "fcmle  v2.4s, v0.4s, v1.4s         \n\t"   // packed a <= b
        "st1    {v2.4s}, [%[out]], #16      \n\t"

        "fcmlt  v2.4s, v0.4s, #0.0          \n\t"   // packed a < 0
        "st1    {v2.4s}, [%[out]], #16      \n\t"

        "fcmgt  v2.4s, v1.4s, #0.0          \n\t"   // packed b > 0
        "st1    {v2.4s}, [%[out]], #16      \n\t"
        // Early-clobber (&) keeps `out` in a register distinct from the
        // inputs even when x aliases &a or &b.
        : [out] "+&r"(out)
        : [lhs] "r"(&a), [rhs] "r"(&b)
        : "v0", "v1", "v2", "memory"
    );

    (void)out;  // final pointer value is not needed
}
// Packed double-precision compare kernel (AArch64 Advanced SIMD).
//
// Loads the two 64-bit double lanes of a and b and stores eight 128-bit
// result vectors to x, 16 bytes apart, in this order:
//   x[0] a == b    x[1] a != b    x[2] a >  b    x[3] a >= b
//   x[4] a <  b    x[5] a <= b    x[6] a <  0    x[7] b >  0
// The compare instructions produce an all-ones lane where the predicate
// holds and an all-zeros lane otherwise; x[1] is the bitwise NOT of x[0].
//
// The pointers are handed to the assembler as named operands rather than
// assumed to sit in x0/x1/x2, so the code stays correct however the compiler
// allocates registers or inlines this function. The "memory" clobber tells
// the compiler that the asm reads *a / *b and writes x[0..7].
void PackedCompareF64_(Vec128 x[8], const Vec128& a, const Vec128& b) {
    Vec128* out = x;    // advanced by 16 bytes after every store

    __asm volatile(
        "ld1    {v0.2d}, [%[lhs]]           \n\t"   // v0 = a
        "ld1    {v1.2d}, [%[rhs]]           \n\t"   // v1 = b

        "fcmeq  v2.2d, v0.2d, v1.2d         \n\t"   // packed a == b
        "st1    {v2.2d}, [%[out]], #16      \n\t"   // *out = v2; out += 16

        "not    v2.16b, v2.16b              \n\t"   // packed a != b (invert EQ mask)
        "st1    {v2.2d}, [%[out]], #16      \n\t"

        "fcmgt  v2.2d, v0.2d, v1.2d         \n\t"   // packed a > b
        "st1    {v2.2d}, [%[out]], #16      \n\t"

        "fcmge  v2.2d, v0.2d, v1.2d         \n\t"   // packed a >= b
        "st1    {v2.2d}, [%[out]], #16      \n\t"

        "fcmlt  v2.2d, v0.2d, v1.2d         \n\t"   // packed a < b
        "st1    {v2.2d}, [%[out]], #16      \n\t"

        "fcmle  v2.2d, v0.2d, v1.2d         \n\t"   // packed a <= b
        "st1    {v2.2d}, [%[out]], #16      \n\t"

        "fcmlt  v2.2d, v0.2d, #0.0          \n\t"   // packed a < 0
        "st1    {v2.2d}, [%[out]], #16      \n\t"

        "fcmgt  v2.2d, v1.2d, #0.0          \n\t"   // packed b > 0
        "st1    {v2.2d}, [%[out]], #16      \n\t"
        // Early-clobber (&) keeps `out` in a register distinct from the
        // inputs even when x aliases &a or &b.
        : [out] "+&r"(out)
        : [lhs] "r"(&a), [rhs] "r"(&b)
        : "v0", "v1", "v2", "memory"
    );

    (void)out;  // final pointer value is not needed
}
|
| ch15_02/main.cpp の実行例 |
arm64@manet Ch15_02 % g++ -I.. -std=c++11 -O main.cpp neon.cpp -o a.out
arm64@manet Ch15_02 % ./a.out
Results for PackedCompareF32_
a: 2.000000 17.000000 | -6.000000 3.000000
b: -4.000000 12.000000 | -6.000000 8.000000
EQ: 00000000 00000000 | FFFFFFFF 00000000
NE: FFFFFFFF FFFFFFFF | 00000000 FFFFFFFF
GT: FFFFFFFF FFFFFFFF | 00000000 00000000
GE: FFFFFFFF FFFFFFFF | FFFFFFFF 00000000
LT: 00000000 00000000 | 00000000 FFFFFFFF
LE: 00000000 00000000 | FFFFFFFF FFFFFFFF
a LT0: 00000000 00000000 | FFFFFFFF 00000000
b GT0: 00000000 FFFFFFFF | 00000000 FFFFFFFF
Results for PackedCompareF64_
a: -2.000000000000 | 1.414213562373
b: -4.000000000000 | 3.141592653590
EQ: 0000000000000000 | 0000000000000000
NE: FFFFFFFFFFFFFFFF | FFFFFFFFFFFFFFFF
GT: FFFFFFFFFFFFFFFF | 0000000000000000
GE: FFFFFFFFFFFFFFFF | 0000000000000000
LT: 0000000000000000 | FFFFFFFFFFFFFFFF
LE: 0000000000000000 | FFFFFFFFFFFFFFFF
a LT0: FFFFFFFFFFFFFFFF | 0000000000000000
b GT0: 0000000000000000 | FFFFFFFFFFFFFFFF
|