NEON Co-processor の 128 bit レジスタを double (64 bit) 2個、float (32 bit) 4個、long (64 bit) 2個、int (32 bit) 4個、short (16 bit) 8個、byte (8 bit) 16個 として扱うための共用体 Vec128 を定義する。
| Vec128.h |
#pragma once
#include <string>
#include <cstdint>
#include <sstream>
#include <iomanip>
struct alignas(16) Vec128 {
public:
union {
int8_t m_I8[16];
int16_t m_I16[8];
int32_t m_I32[4];
int64_t m_I64[2];
uint8_t m_U8[16];
uint16_t m_U16[8];
uint32_t m_U32[4];
uint64_t m_U64[2];
float m_F32[4];
double m_F64[2];
};
private:
template <typename T> std::string ToStringInt(const T* x, int n, int w) {
std::ostringstream oss;
for (int i=0; i<n; i++) {
oss << std::setw(w) << (int64_t)x[i];
if (i + 1 == n / 2) oss << " |";
}
return oss.str();
}
template <typename T> std::string ToStringUint(const T* x, int n, int w) {
std::ostringstream oss;
for (int i=0; i<n; i++) {
oss << std::setw(w) << (uint64_t)x[i];
if (i + 1 == n / 2) oss << " |";
}
return oss.str();
}
template <typename T> std::string ToStringHex(const T* x, int n, int w) {
std::ostringstream oss;
for (int i=0; i<n; i++) {
const int w_temp = 16;
std::ostringstream oss_temp;
oss_temp << std::uppercase << std::hex << std::setfill('0');
oss_temp << std::setw(w_temp) << (uint64_t)x[i];
std::string s1 = oss_temp.str();
std::string s2 = s1.substr(w_temp - sizeof(T) * 2);
oss << std::setw(w) << s2;
if (i + 1 == n / 2) oss << " |";
}
return oss.str();
}
template <typename T> std::string ToStringFP(const T* x, int n, int w, int p) {
std::ostringstream oss;
oss << std::fixed << std::setprecision(p);
for (int i=0; i<n; i++) {
oss << std::setw(w) << x[i];
if (i + 1 == n / 2) oss << " |";
}
return oss.str();
}
public:
std::string ToStringI8(void) {
return ToStringInt(m_I8, sizeof(m_I8) / sizeof(int8_t), 4);
}
std::string ToStringI16(void) {
return ToStringInt(m_I16, sizeof(m_I16) / sizeof(int16_t), 8);
}
std::string ToStringI32(void) {
return ToStringInt(m_I32, sizeof(m_I32) / sizeof(int32_t), 16);
}
std::string ToStringI64(void) {
return ToStringInt(m_I64, sizeof(m_I64) / sizeof(int64_t), 32);
}
std::string ToStringU8(void) {
return ToStringUint(m_U8, sizeof(m_U8) / sizeof(uint8_t), 4);
}
std::string ToStringU16(void) {
return ToStringUint(m_U16, sizeof(m_U16) / sizeof(uint16_t), 8);
}
std::string ToStringU32(void) {
return ToStringUint(m_I32, sizeof(m_I32) / sizeof(int32_t), 16);
}
std::string ToStringU64(void) {
return ToStringUint(m_I64, sizeof(m_I64) / sizeof(int64_t), 32);
}
std::string ToStringX8(void) {
return ToStringHex(m_U8, sizeof(m_U8) / sizeof(uint8_t), 4);
}
std::string ToStringX16(void) {
return ToStringHex(m_U16, sizeof(m_U16) / sizeof(uint16_t), 8);
}
std::string ToStringX32(void) {
return ToStringHex(m_I32, sizeof(m_I32) / sizeof(int32_t), 16);
}
std::string ToStringX64(void) {
return ToStringHex(m_I64, sizeof(m_I64) / sizeof(int64_t), 32);
}
std::string ToStringF32(void) {
return ToStringFP(m_F32, sizeof(m_F32) / sizeof(float), 16, 6);
}
std::string ToStringF64(void) {
return ToStringFP(m_F64, sizeof(m_F64) / sizeof(double), 32, 12);
}
};
|
NEON を使って基本演算を行うプログラムの説明。
NEON を使って比較演算を行うプログラムの説明。
NEON を使って型変換を行うプログラムの説明。
NEON を使って相関係数の計算を行うプログラムの説明。
略
| AlignedMem.h |
#pragma once
#include <cstdint>
#include <stdexcept>
#include <stdlib.h>
class AlignedMem {
public:
static void* Allocate(size_t mem_size, size_t mem_alignment) {
void *ptr;
if (posix_memalign(&ptr, mem_alignment, mem_size) != 0)
throw std::runtime_error("Memory allocation error: AllocateAlignedMem()");
return ptr;
}
static void Release(void* p) {
free(p);
}
template <typename T> static bool IsAligned(const T* p, size_t alignment) {
if (p == nullptr)
return false;
if (((uintptr_t)p % alignment) != 0)
return false;
return true;
}
};
template <class T> class AlignedArray {
T* m_Data;
size_t m_Size;
public:
AlignedArray(void) = delete;
AlignedArray(const AlignedArray& aa) = delete;
AlignedArray(AlignedArray&& aa) = delete;
AlignedArray& operator = (const AlignedArray& aa) = delete;
AlignedArray& operator = (AlignedArray&& aa) = delete;
AlignedArray(size_t size, size_t alignment) {
m_Size = size;
m_Data = (T*)AlignedMem::Allocate(size * sizeof(T), alignment);
}
~AlignedArray() {
AlignedMem::Release(m_Data);
}
T* Data(void) { return m_Data; }
size_t Size(void) { return m_Size; }
void Fill(T val) {
for (size_t i = 0; i < m_Size; i++)
m_Data[i] = val;
}
};
|
| MatrixF32.h |
//-------------------------------------------------
// MatrixF32.h
//-------------------------------------------------
#pragma once
#include <iostream>
#include <iomanip>
#include <random>
#include <string>
#include <cstring>
#include <stdexcept>
#include "AlignedMem.h"
class MatrixF32 {
static const size_t c_Alignment = 16;
size_t m_NumRows;
size_t m_NumCols;
size_t m_NumElements;
size_t m_DataSize;
float* m_Data;
int m_OstreamW;
std::string m_OstreamDelim;
bool IsConforming(size_t num_rows, size_t num_cols) const {
return m_NumRows == num_rows && m_NumCols == num_cols;
}
void Allocate(size_t num_rows, size_t num_cols) {
m_NumRows = num_rows;
m_NumCols = num_cols;
m_NumElements = m_NumRows * m_NumCols;
m_DataSize = m_NumElements * sizeof(float);
if (m_NumElements == 0)
m_Data = nullptr;
else
m_Data = (float*)AlignedMem::Allocate(m_DataSize, c_Alignment);
SetOstream(10, " ");
}
void Cleanup(void) {
m_NumRows = m_NumCols = m_NumElements = m_DataSize = 0;
m_Data = nullptr;
}
void Release(void) {
if (m_Data != nullptr)
AlignedMem::Release(m_Data);
Cleanup();
}
public:
MatrixF32(void) {
Allocate(0, 0);
}
MatrixF32(size_t num_rows, size_t num_cols) {
Allocate(num_rows, num_cols);
Fill(0);
}
MatrixF32(size_t num_rows, size_t num_cols, bool set_identity) {
Allocate(num_rows, num_cols);
Fill(0);
if (set_identity && m_NumRows == m_NumCols) {
for (size_t i = 0; i < num_rows; i++)
m_Data[i * m_NumCols + i] = 1.0f;
}
}
MatrixF32(const MatrixF32& mat) {
Allocate(mat.m_NumRows, mat.m_NumCols);
memcpy(m_Data, mat.m_Data, m_DataSize);
SetOstream(mat.m_OstreamW, mat.m_OstreamDelim);
}
MatrixF32(MatrixF32&& mat) {
m_NumRows = mat.m_NumRows;
m_NumCols = mat.m_NumCols;
m_NumElements = mat.m_NumElements;
m_DataSize = mat.m_DataSize;
m_Data = mat.m_Data;
m_OstreamW = mat.m_OstreamW;
m_OstreamDelim = mat.m_OstreamDelim;
mat.Cleanup();
}
~MatrixF32() {
Release();
}
MatrixF32& operator = (const MatrixF32& mat) {
if (this != &mat) {
if (!IsConforming(*this, mat)) {
Release();
Allocate(mat.m_NumRows, mat.m_NumCols);
}
memcpy(m_Data, mat.m_Data, m_DataSize);
SetOstream(mat.m_OstreamW, mat.m_OstreamDelim);
}
return *this;
}
MatrixF32& operator = (MatrixF32&& mat) {
Release();
m_NumRows = mat.m_NumRows;
m_NumCols = mat.m_NumCols;
m_NumElements = mat.m_NumElements;
m_DataSize = mat.m_DataSize;
m_Data = mat.m_Data;
SetOstream(mat.m_OstreamW, mat.m_OstreamDelim);
mat.Cleanup();
return *this;
}
friend bool operator == (const MatrixF32& mat1, const MatrixF32& mat2) {
if (!IsConforming(mat1, mat2))
return false;
return (memcmp(mat1.m_Data, mat2.m_Data, mat1.m_DataSize) == 0) ? true : false;
}
friend bool operator != (const MatrixF32& mat1, const MatrixF32& mat2) {
if (!IsConforming(mat1, mat2))
return false;
return (memcmp(mat1.m_Data, mat2.m_Data, mat1.m_DataSize) != 0) ? true : false;
}
float* Data(void) { return m_Data; }
const float* Data(void) const { return m_Data; }
size_t GetNumRows(void) const { return m_NumRows; }
size_t GetNumCols(void) const { return m_NumCols; }
size_t GetNumElements(void) const { return m_NumElements; }
bool IsSquare(void) const { return m_NumRows == m_NumCols; }
friend MatrixF32 operator + (const MatrixF32& mat1, const MatrixF32& mat2) {
if (!IsConforming(mat1, mat2))
throw std::runtime_error("Non-conforming operands: operator +");
MatrixF32 result(mat1.m_NumRows, mat1.m_NumCols);
for (size_t i = 0; i < result.m_NumElements; i++)
result.m_Data[i] = mat1.m_Data[i] + mat2.m_Data[i];
return result;
}
friend MatrixF32 operator * (const MatrixF32& mat1, const MatrixF32& mat2) {
if (mat1.m_NumCols != mat2.m_NumRows)
throw std::runtime_error("Non-conforming operands: operator *");
size_t m = mat1.m_NumCols;
MatrixF32 result(mat1.m_NumRows, mat2.m_NumCols);
for (size_t i = 0; i < result.m_NumRows; i++) {
for (size_t j = 0; j < result.m_NumCols; j++) {
float sum = 0;
for (size_t k = 0; k < m; k++) {
float val = mat1.m_Data[i * mat1.m_NumCols + k] * mat2.m_Data[k * mat2.m_NumCols + j];
sum += val;
}
result.m_Data[i * result.m_NumCols + j] = sum;
}
}
return result;
}
//
// For some operations, static functions are used instead of overloaded
// operators in order to avoid inaccurate benchmark performance comparisons.
//
static void Add(MatrixF32& result, const MatrixF32& mat1, const MatrixF32& mat2) {
if (!IsConforming(result, mat1) || !IsConforming(mat1, mat2))
throw std::runtime_error("Non-conforming operands: MatrixF32::Add");
for (size_t i = 0; i < result.m_NumElements; i++)
result.m_Data[i] = mat1.m_Data[i] + mat2.m_Data[i];
}
static void Mul(MatrixF32& result, const MatrixF32& mat1, const MatrixF32& mat2) {
if (mat1.m_NumCols != mat2.m_NumRows)
throw std::runtime_error("Non-conforming operands: MatrixF32::Mul");
if (result.m_NumRows != mat1.m_NumRows || result.m_NumCols != mat2.m_NumCols)
throw std::runtime_error("Invalid matrix size: MatrixF32::Mul");
size_t m = mat1.m_NumCols;
for (size_t i = 0; i < result.m_NumRows; i++) {
for (size_t j = 0; j < result.m_NumCols; j++) {
float sum = 0;
for (size_t k = 0; k < m; k++) {
float val = mat1.m_Data[i * mat1.m_NumCols + k] * mat2.m_Data[k * mat2.m_NumCols + j];
sum += val;
}
result.m_Data[i * result.m_NumCols + j] = sum;
}
}
}
static void Mul4x4(MatrixF32& result, const MatrixF32& mat1, const MatrixF32& mat2) {
const size_t nrows = 4;
const size_t ncols = 4;
const size_t m = 4;
for (size_t i = 0; i < nrows; i++) {
for (size_t j = 0; j < ncols; j++) {
float sum = 0;
for (size_t k = 0; k < m; k++) {
float val = mat1.m_Data[i * mat1.m_NumCols + k] * mat2.m_Data[k * mat2.m_NumCols + j];
sum += val;
}
result.m_Data[i * result.m_NumCols + j] = sum;
}
}
}
static void MulScalar(MatrixF32& result, const MatrixF32& mat, float val) {
if (!IsConforming(result, mat))
throw std::runtime_error("Non-conforming operands: MatrixF32::MulScalar");
for (size_t i = 0; i < result.m_NumElements; i++)
result.m_Data[i] = mat.m_Data[i] * val;
}
static void Transpose(MatrixF32& result, const MatrixF32& mat1) {
if (result.m_NumRows != mat1.m_NumCols || result.m_NumCols != mat1.m_NumRows)
throw std::runtime_error("Non-conforming operands: MatrixF32::Transpose");
for (size_t i = 0; i < result.m_NumRows; i++) {
for (size_t j = 0; j < result.m_NumCols; j++)
result.m_Data[i * result.m_NumCols + j] = mat1.m_Data[j * mat1.m_NumCols + i];
}
}
static bool IsConforming(const MatrixF32& mat1, const MatrixF32& mat2) {
return mat1.m_NumRows == mat2.m_NumRows && mat1.m_NumCols == mat2.m_NumCols;
}
void Fill(float val) {
for (size_t i = 0; i < m_NumElements; i++)
m_Data[i] = val;
}
void FillRandomInt(unsigned int seed_val, int min_val, int max_val, bool exclude_zero) {
std::uniform_int_distribution<> dist {min_val, max_val};
std::mt19937 rng {seed_val};
if (exclude_zero) {
for (size_t i = 0; i < m_NumElements; i++) {
int val;
while ((val = dist(rng)) == 0) {
}
m_Data[i] = (float)val;
}
} else {
for (size_t i = 0; i < m_NumElements; i++)
m_Data[i] = (float)dist(rng);
}
}
void RoundToZero(float epsilon) {
for (size_t i = 0; i < m_NumElements; i++) {
if (fabs(m_Data[i]) < epsilon)
m_Data[i] = 0;
}
}
void RoundToI(float epsilon) {
for (size_t i = 0; i < m_NumRows; i++) {
for (size_t j = 0; j < m_NumCols; j++) {
size_t k = i * m_NumCols + j;
if (i != j) {
if (fabs(m_Data[k]) < epsilon)
m_Data[k] = 0;
} else {
if (fabs(m_Data[k] - 1.0f) < epsilon)
m_Data[k] = 1.0f;
}
}
}
}
void SetCol(size_t col, const float* vals) {
if (col >= m_NumCols)
throw std::runtime_error("Invalid column index: MatrixF32::SetCol()");
for (size_t i = 0; i < m_NumRows; i++)
m_Data[i * m_NumCols + col] = vals[i];
}
void SetRow(size_t row, const float* vals) {
if (row >= m_NumRows)
throw std::runtime_error("Invalid row index: MatrixF32::SetRow()");
for (size_t j = 0; j < m_NumCols; j++)
m_Data[row * m_NumCols + j] = vals[j];
}
void SetI(void) {
if (!IsSquare())
throw std::runtime_error("Square matrix required: MatrixF32::SetI()");
for (size_t i = 0; i < m_NumRows; i++) {
for (size_t j = 0; j < m_NumCols; j++)
m_Data[i * m_NumCols + j] = (i == j) ? 1.0f : 0.0f;
}
}
void SetOstream(int w, const std::string& delim){
m_OstreamW = w;
m_OstreamDelim = delim;
}
float Trace(void) const {
if (!IsSquare())
throw std::runtime_error("Square matrix required: MatrixF32::Trace()");
float sum = 0;
for (size_t i = 0; i < m_NumRows; i++)
sum += m_Data[i * m_NumCols + i];
return sum;
}
float Trace4x4(void) const {
if (m_NumRows != 4 || m_NumCols != 4)
throw std::runtime_error("4x4 matrix required: MatrixF32::Trace()");
float sum = m_Data[0] + m_Data[5] + m_Data[10] + m_Data[15];
return sum;
}
friend std::ostream& operator << (std::ostream& os, const MatrixF32& mat) {
for (size_t i = 0; i < mat.m_NumRows; i++) {
for (size_t j = 0; j < mat.m_NumCols; j++) {
os << std::setw(mat.m_OstreamW) << mat.m_Data[i * mat.m_NumCols + j];
if (j + 1 < mat.m_NumCols)
os << mat.m_OstreamDelim;
}
os << std::endl;
}
return os;
}
};
|
NEON を使って4x4行列の乗算を行うプログラムの説明。
NEON を使って4x4行列の転置行列求めるプログラムの説明。
略