﻿ c-打开MP：SIMD循环中的SIMD兼容功能？ - 代码日志

#### c - OpenMP：SIMD 循环中与 SIMD 兼容的函数？

``````float * x = (float *) malloc(10 * sizeof(float));
float * y = (float *) malloc(10 * sizeof(float));

for(int i = 0; i < 10; i++)
y[i] = 10;

#pragma omp simd
for(int i = 0; i < 10; i++)
x[i] = y[i]*y[i];
``````

``````float square(float x) {
    // Returns x multiplied by itself.
    const float squared = x * x;
    return squared;
}
// Returns half of x.
// The divisor is a float literal so the whole expression stays in single
// precision instead of being promoted to double and converted back.
float halve(float x) {
    return x / 2.0f;
}
``````

``````void apply_simd(float * x, float * y, int length, float (*simd_func)(float c)){
    // Calls simd_func on each of the first `length` elements of y and stores
    // each result at the same index in x.
    // The pragma asks OpenMP to vectorize the loop.
    #pragma omp simd
    for(int i = 0; i < length; i++)
        x[i] = simd_func(y[i]);
}
``````

``````float inline square(float x){ ... }
``````

> 函数指针的值是编译时常量
> 它所指向的函数的定义对编译器可见

``````apply_simd(x, y, length, halve);   // first pass: x[i] = halve(y[i])
apply_simd(x, x, length, square);  // second pass: x[i] = square(x[i]), updating x in place
// NEVER DO THIS: write one function that does both operations instead.
// As written, gcc and clang compile this to two separate loops.
``````

``````// your original function mostly unchanged, but with size_t and inline
// Applies simd_func to each of the first `length` elements of y and writes
// each result to the same index of x.
// Declared `inline` so that inlining is still allowed when building with -fPIC.
inline void apply_simd(float *x, const float *y, size_t length,
                       float (*simd_func)(float c)) {
    // Ask OpenMP to vectorize the element-wise loop.
#pragma omp simd
    for (size_t idx = 0; idx < length; ++idx) {
        x[idx] = simd_func(y[idx]);
    }
}
``````

C++11 调用方：

``````// __restrict isn't needed with OpenMP, but you might want to assert non-overlapping for better auto-vectorization with non-OpenMP compilers.
void test_lambda(float *__restrict x, const float *__restrict y, size_t length)
{
    // A captureless lambda converts implicitly to an ordinary function pointer,
    // which is the callback type apply_simd takes.
    using unary_op = float (*)(float);
    const unary_op half_then_square = [](float a) -> float {
        const float half = 0.5f * a;  // halve first allows vmulps with a memory source operand
        return half * half;           // 0.25 * a * a doesn't optimize to that with clang :/
    };

    apply_simd(x, y, length, half_then_square);
}
``````

``````void test_lambda17(float *__restrict x, const float *__restrict y, size_t length)
{
    // Pass the captureless lambda directly as the callback argument;
    // it computes (a / 2) squared.
    apply_simd(x, y, length, [](float a) {
        const float half = a * 0.5f;
        return half * half;
    });
}
``````

``````.L4:
vmulps  ymm0, ymm1, YMMWORD PTR [rsi+rax]
vmulps  ymm0, ymm0, ymm0
vmovups YMMWORD PTR [rdi+rax], ymm0
cmp     rax, rcx
jne     .L4
``````

clang 会做一定程度的循环展开，大概能接近每个时钟周期加载并存储一个 256 位向量，同时执行 2 次乘法。（使用非索引寻址模式，并通过展开来隐藏两次指针递增，就可以做到这一点。傻编译器。:/）

Lambda或函数指针作为模板参数

C++17 允许把没有捕获的自动存储期 lambda 作为函数对象传递。（之前的标准要求作为模板参数传递的函数具有外部或内部（static）链接。）

``````template <float simd_func(float c)>
void apply_template(float *x, const float *y, size_t length) {
    // simd_func is a template argument rather than a run-time pointer.
    // Each of the first `length` elements of y is passed through it and the
    // result is stored at the same index in x.
#pragma omp simd
    for (size_t idx = 0; idx < length; ++idx) {
        x[idx] = simd_func(y[idx]);
    }
}

void test_lambda(float *__restrict x, const float *__restrict y, size_t length)
{
// static // even static doesn't help work around the gcc bug
constexpr auto my_op = [](float a) -> float {
float h=0.5f*a; // halve first allows vmulps with a memory source operand
return h*h;    // 0.25 * a * a doesn't optimize to that with clang :/
};

// I don't know what the unary + operator is doing here, but some examples use it
apply_lambda<+my_op>(x, y, length); // clang accepts this, gcc doesn't
}
``````

clang 可以正常编译它，但即使使用 -std=gnu++17，g++ 也会错误地拒绝它

``````// `inline` is still necessary for it to actually inline with -fPIC (in a shared lib)
// Returns a * a / 4, evaluated as (0.25f * a) * a.
inline float my_func(float a)
{
    const float quarter = 0.25f * a;
    return quarter * a;
}

void test_template(float *__restrict x, const float *__restrict y, size_t length)
{
apply_lambda<my_func>(x, y, length);   // not actually a lambda, just a function
}
``````

``````.L25:
vmulps  ymm0, ymm1, YMMWORD PTR [rsi+rax]   # ymm0 = 0.25f * y[i+0..7]
vmulps  ymm0, ymm0, YMMWORD PTR [rsi+rax]   # reload the same vector again
vmovups YMMWORD PTR [rdi+rax], ymm0        # store to x[i+0..7]
cmp     rax, rcx
jne     .L25
``````