Teacher Xiao Peng, about optimizing the MPM fluid simulation function #10

Open
dd123-a opened this issue Oct 6, 2024 · 11 comments

@dd123-a

dd123-a commented Oct 6, 2024

void Simulate() {

// CLEAR GRID
std::size_t grid_size = grid.size();

// make sure grid_size does not exceed the range of int

#pragma omp parallel for
for (int i = 0; i < static_cast<int>(grid_size); ++i) {
    grid[i].vel = glm::vec3(0.0f);
    grid[i].mass = 0.0f;
}

// P2G_1
// mass and momentum transfer
 #pragma omp parallel for
for (int i = 0; i < particles.size(); i++) {
    auto &p = particles[i];
    glm::uvec3 cell_idx = glm::uvec3(p.pos);
    glm::vec3 cell_diff = (p.pos - glm::vec3(cell_idx)) - 0.5f;

    glm::vec3 weights_[3];
    weights_[0] = 0.5f * glm::pow(0.5f - cell_diff, glm::vec3(2.0f));
    weights_[1] = 0.75f - glm::pow(cell_diff, glm::vec3(2.0f));
    weights_[2] = 0.5f * glm::pow(0.5f + cell_diff, glm::vec3(2.0f));

    for (uint32_t gx = 0; gx < 3; ++gx) {
        for (uint32_t gy = 0; gy < 3; ++gy) {
            for (uint32_t gz = 0; gz < 3; ++gz) {
                float weight = weights_[gx].x * weights_[gy].y * weights_[gz].z;

                glm::uvec3 cell_pos = glm::uvec3(cell_idx.x + gx - 1,
                                                 cell_idx.y + gy - 1,
                                                 cell_idx.z + gz - 1);
                glm::vec3 cell_dist = (glm::vec3(cell_pos) - p.pos) + 0.5f;
                glm::vec3 Q = p.C * cell_dist;

                int cell_index = (int) cell_pos.x +
                                 (int) cell_pos.y * grid_res +
                                 (int) cell_pos.z * grid_res * grid_res;

                float mass_contrib = weight * particle_mass;
                grid[cell_index].mass += mass_contrib;
                grid[cell_index].vel += mass_contrib * (p.vel + Q);
            }

        }
    }
}

// P2G_2
// density and pressure computation
 #pragma omp parallel for
for (int i = 0; i < particles.size(); i++) {
    auto &p = particles[i];

    glm::uvec3 cell_idx = glm::uvec3(p.pos);
    glm::vec3 cell_diff = (p.pos - glm::vec3(cell_idx)) - 0.5f;

    glm::vec3 weights_[3];
    weights_[0] = 0.5f * glm::pow(0.5f - cell_diff, glm::vec3(2.0f));
    weights_[1] = 0.75f - glm::pow(cell_diff, glm::vec3(2.0f));
    weights_[2] = 0.5f * glm::pow(0.5f + cell_diff, glm::vec3(2.0f));

    float density = 0.0f;
    for (uint32_t gx = 0; gx < 3; ++gx) {
        for (uint32_t gy = 0; gy < 3; ++gy) {
            for (uint32_t gz = 0; gz < 3; ++gz) {
                float weight = weights_[gx].x * weights_[gy].y * weights_[gz].z;

                glm::uvec3 cell_pos = glm::uvec3(cell_idx.x + gx - 1,
                                                 cell_idx.y + gy - 1,
                                                 cell_idx.z + gz - 1);

                int cell_index = (int) cell_pos.x +
                                 (int) cell_pos.y * grid_res +
                                 (int) cell_pos.z * grid_res * grid_res;

                density += grid[cell_index].mass * weight;
            }
        }
    }

    float volume = particle_mass / density;
    float pressure = std::max(-0.1f, eos_stiffness *
                                     (std::pow(density / rest_density, eos_power) - 1.0f));

    glm::mat3 stress = glm::mat3(
            -pressure, 0, 0,
            0, -pressure, 0,
            0, 0, -pressure
    );

     glm::mat3 strain = p.C;

     //float trace = strain[0][0] + strain[1][0] + strain[2][0]; // DEBUG
     float trace = glm::determinant(strain);
     strain[0][0] = strain[1][0] = strain[2][0] = trace;

     glm::mat3 viscosity_term = dynamic_viscosity * strain;
     stress += viscosity_term;

    auto eq_16_term_0 = -volume * 4 * stress * dt;

    for (uint32_t gx = 0; gx < 3; ++gx) {
        for (uint32_t gy = 0; gy < 3; ++gy) {
            for (uint32_t gz = 0; gz < 3; ++gz) {
                float weight = weights_[gx].x * weights_[gy].y * weights_[gz].z;

                glm::uvec3 cell_pos = glm::uvec3(cell_idx.x + gx - 1,
                                                 cell_idx.y + gy - 1,
                                                 cell_idx.z + gz - 1);

                glm::vec3 cell_dist = (glm::vec3(cell_pos) - p.pos) + 0.5f;

                int cell_index = (int) cell_pos.x +
                                 (int) cell_pos.y * grid_res +
                                 (int) cell_pos.z * grid_res * grid_res;

                glm::vec3 momentum = (eq_16_term_0 * weight) * cell_dist;
                grid[cell_index].vel += momentum;
            }
        }
    }
}

// GRID UPDATE
// get the size of the grid
grid_size = grid.size();

#pragma omp parallel for
for (int i = 0; i < static_cast<int>(grid_size); ++i) {
    auto& cell = grid[static_cast<std::size_t>(i)]; // cast the index to std::size_t
    if (cell.mass > 0) {
        cell.vel /= cell.mass;
        cell.vel += dt * glm::vec3(0.0f, gravity, 0.0f);

        int index = cell.index;
        int x = index / (grid_res * grid_res);
        index /= grid_res;
        int y = (index / grid_res) % grid_res;
        index /= grid_res;
        int z = index;

        if (x < 1 || x > grid_res - 2) { cell.vel.x = 0.0f; }
        if (y < 1 || y > grid_res - 2) { cell.vel.y = 0.0f; }
        if (z < 1 || z > grid_res - 2) { cell.vel.z = 0.0f; }
    }
}


// G2P
 #pragma omp parallel for
for (int i = 0; i < static_cast<int>(particles.size()); ++i) {
    auto& p = particles[static_cast<std::size_t>(i)]; // access the particle by index

    p.vel = glm::vec3(0.0f);

    glm::uvec3 cell_idx = glm::uvec3(p.pos);
    glm::vec3 cell_diff = (p.pos - glm::vec3(cell_idx)) - 0.5f;

    glm::vec3 weights_[3];

    weights_[0] = 0.5f * glm::pow(0.5f - cell_diff, glm::vec3(2.0f));
    weights_[1] = 0.75f - glm::pow(cell_diff, glm::vec3(2.0f));
    weights_[2] = 0.5f * glm::pow(0.5f + cell_diff, glm::vec3(2.0f));

    glm::mat3 B = glm::mat3(0.0f);
    for (uint32_t gx = 0; gx < 3; ++gx) {
        for (uint32_t gy = 0; gy < 3; ++gy) {
            for (uint32_t gz = 0; gz < 3; ++gz) {
                float weight = weights_[gx].x * weights_[gy].y * weights_[gz].z;
                // std::cout << weight << std::endl;
                glm::uvec3 cell_pos = glm::uvec3(cell_idx.x + gx - 1,
                    cell_idx.y + gy - 1,
                    cell_idx.z + gz - 1);

                glm::vec3 cell_dist = (glm::vec3(cell_pos) - p.pos) + 0.5f;

                int cell_index = (int)cell_pos.x +
                    (int)cell_pos.y * grid_res +
                    (int)cell_pos.z * grid_res * grid_res;

                glm::vec3 weighted_velocity = grid[cell_index].vel * weight;

                B += glm::mat3(weighted_velocity * cell_dist.x,
                    weighted_velocity * cell_dist.y,
                    weighted_velocity * cell_dist.z);

                p.vel += weighted_velocity;
            }
        }
    }

    p.C = B * 4.0f;
    p.vel *= damping;
    p.pos += p.vel * dt;
    p.pos = glm::clamp(p.pos, 1.0f, grid_res - 2.0f);

    glm::vec3 x_n = p.pos + p.vel;
    const float wall_min = 3.0f;
    const float wall_max = grid_res - 4.0f;
    if (x_n.x < wall_min) p.vel.x += (wall_min - x_n.x);
    if (x_n.x > wall_max) p.vel.x += (wall_max - x_n.x);
    if (x_n.y < wall_min) p.vel.y += (wall_min - x_n.y);
    if (x_n.y > wall_max) p.vel.y += (wall_max - x_n.y);
    if (x_n.z < wall_min) p.vel.z += (wall_min - x_n.z);
    if (x_n.z > wall_max) p.vel.z += (wall_max - x_n.z);
}

}
This version is accelerated with OpenMP, but I saw on YouTube that it can be optimized with SIMD instructions to the point where it runs smoothly even single-threaded. I know nothing about that (sadly), so I'd like to discuss whether it's feasible. (I did try rewriting it with SIMD, but debugging it was too painful, and template metaprogramming is the worst.) The full code repository is https://github.com/Hab5/mls-mpm, and the corresponding YouTube introduction (where the dream began) is https://www.youtube.com/watch?v=4Y58Pg9tpSo

@archibate
Contributor

Is this your repository? I see it uses OpenGL compute shaders, so why do you say it runs on a single CPU? You can also take a look at mine for reference: https://github.com/archibate/opengl_mpm

@archibate
Contributor

The += in the P2G step needs to be atomic, doesn't it? Otherwise the accumulated results will be wrong and the simulation will blow up.

@archibate
Contributor

You can time each step with chrono (a minimal timing sketch follows). You should find that P2G takes the longest, because P2G involves a scatter operation: multiple threads end up accessing the same grid cell at the same time, which requires inefficient atomics.
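
A minimal timing sketch, assuming you simply wrap each phase of Simulate() (clear grid, P2G_1, P2G_2, grid update, G2P) in its own scope; the ScopedTimer helper here is an illustrative name, not part of the original code:

```cpp
#include <chrono>
#include <cstdio>

struct ScopedTimer {
    const char* name;
    std::chrono::steady_clock::time_point t0;
    explicit ScopedTimer(const char* n)
        : name(n), t0(std::chrono::steady_clock::now()) {}
    ~ScopedTimer() {
        auto dt = std::chrono::steady_clock::now() - t0;
        std::printf("%s: %.3f ms\n", name,
                    std::chrono::duration<double, std::milli>(dt).count());
    }
};

// Usage inside Simulate():
// { ScopedTimer t("P2G_1"); /* ... P2G_1 loop ... */ }
// { ScopedTimer t("G2P");   /* ... G2P loop ... */ }
```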
To avoid the performance loss caused by multi-thread contention, there are two approaches:

  1. First sort the particles by Morton code. After sorting, OpenMP puts nearby values of i in the same thread, so only at the chunk boundaries can multiple threads atomically access the same grid cell at once (see the sketch after this list).
  2. If you have 4 CPU cores, split the particles by position into 4 blocks, each processed within its own region; then no atomics are needed at all.
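
For approach 1, a rough sketch of sorting the particles by the Morton code of their cell, assuming a particle struct with a glm::vec3 pos as in the code above and grid_res ≤ 1024 (10 bits per axis); sorting once per frame (or every few frames) before the P2G loops is usually enough:

```cpp
#include <algorithm>
#include <cstdint>
#include <glm/glm.hpp>

// Spread the low 10 bits of v so that two zero bits sit between each of them.
static uint32_t expand_bits_10(uint32_t v) {
    v &= 0x3ff;
    v = (v | (v << 16)) & 0xFF0000FF;
    v = (v | (v << 8))  & 0x0300F00F;
    v = (v | (v << 4))  & 0x030C30C3;
    v = (v | (v << 2))  & 0x09249249;
    return v;
}

static uint32_t morton3(glm::uvec3 c) {
    return (expand_bits_10(c.z) << 2) | (expand_bits_10(c.y) << 1) | expand_bits_10(c.x);
}

// Sort so that particles in nearby cells end up with nearby indices i,
// which OpenMP then assigns to the same thread chunk.
template <class ParticleVec>
void sort_particles_by_cell(ParticleVec& particles) {
    std::sort(particles.begin(), particles.end(), [](const auto& a, const auto& b) {
        return morton3(glm::uvec3(a.pos)) < morton3(glm::uvec3(b.pos));
    });
}
```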

@archibate
Contributor

archibate commented Oct 8, 2024

I see you've commented out all the omp pragmas; you added them without atomic, the simulation blew up, and you spent ages debugging, right?

                    float mass_contrib = weight * particle_mass;
                    grid[cell_index].mass += mass_contrib;
                    grid[cell_index].vel += mass_contrib * (p.vel + Q);

It needs to be changed to:

                    float mass_contrib = weight * particle_mass;
#pragma omp atomic
                    grid[cell_index].mass += mass_contrib;
#pragma omp atomic
                    grid[cell_index].vel.x += mass_contrib * (p.vel.x + Q.x);
#pragma omp atomic
                    grid[cell_index].vel.y += mass_contrib * (p.vel.y + Q.y);
#pragma omp atomic
                    grid[cell_index].vel.z += mass_contrib * (p.vel.z + Q.z);

So, parallelism is not a toy: slapping on #pragma omp parallel for does not buy you performance for free. Parallelism has a cost, and in 99% of cases blindly adding #pragma omp parallel for either crashes or gives no speedup at all. You have to change your programming mindset and adapt to how parallel programming works, instead of keeping on writing code the serial way; it is a completely different way of thinking, and it is not free. CPU parallel programming is very different from serial programming, SIMD optimization makes it different again, and the GPU is different yet again. To get a decent speedup from GPU parallelism you sometimes even have to increase the amount of computation to raise the degree of parallelism, trade space for time, and so on. That is why many algorithms simply cannot be parallelized on the GPU, or gain very little: they do not fit the GPU's parallel programming model. The good news is that MPM is still an algorithm the GPU can accelerate. The P2G step here is commonly called a scatter operation; thrust has a stable solution for it, and the CPU has corresponding solutions as well (one of them is sketched below). Either way, the code written with these parallel approaches will look nothing like what you have now.
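
One of the CPU-side directions, sketched under assumptions: each OpenMP thread accumulates into its own private copy of the grid, and a second, contention-free pass reduces the copies into the shared grid. Cell and Particle are minimal stand-ins for the structs in the code above, and the P2G body is elided; this is an illustration of the idea, not a drop-in patch:

```cpp
#include <omp.h>
#include <vector>
#include <glm/glm.hpp>

struct Cell { glm::vec3 vel = glm::vec3(0.0f); float mass = 0.0f; }; // stand-in
struct Particle { glm::vec3 pos, vel; glm::mat3 C; };                // stand-in

void p2g_with_private_grids(std::vector<Cell>& grid,
                            const std::vector<Particle>& particles) {
    const int num_threads = omp_get_max_threads();
    // One full grid copy per thread: memory cost is num_threads * grid.size().
    std::vector<std::vector<Cell>> local(num_threads, std::vector<Cell>(grid.size()));

    #pragma omp parallel
    {
        auto& my_grid = local[omp_get_thread_num()];
        #pragma omp for
        for (int i = 0; i < static_cast<int>(particles.size()); ++i) {
            // ... same weight / cell_index computation as in P2G_1 above ...
            // my_grid[cell_index].mass += mass_contrib;            // no atomics needed
            // my_grid[cell_index].vel  += mass_contrib * (p.vel + Q);
        }
    }

    // Reduction pass: parallel over cells, so no two threads touch the same cell.
    #pragma omp parallel for
    for (int c = 0; c < static_cast<int>(grid.size()); ++c) {
        for (int t = 0; t < num_threads; ++t) {
            grid[c].mass += local[t][c].mass;
            grid[c].vel  += local[t][c].vel;
        }
    }
}
```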

@archibate
Contributor

archibate commented Oct 8, 2024

The serial version

For example, filtering out all the even numbers of a vector in serial programming:

std::vector<int> in = {1, 2, 3, 4};
std::vector<int> out;
for (size_t i = 0; i < in.size(); ++i) {
  if (in[i] % 2 == 0)
    out.push_back(in[i]);
}

Obviously, if you blindly add #pragma omp parallel for here, the std::vector's push_back will simply crash, because it is not thread-safe.

Parallelism with a serial mindset

The way to keep writing parallel code with a "serial mindset" is to use the concurrency-safe vector container tbb::concurrent_vector, whose push_back is thread-safe:

std::vector<int> in = {1, 2, 3, 4};
tbb::concurrent_vector<int> out;
#pragma omp parallel for
for (size_t i = 0; i < in.size(); ++i) {
  if (in[i] % 2 == 0)
    out.push_back(in[i]);
}

But you will find that the performance gain is small, or performance may even regress. That is the price of writing parallel code with the wrong "serial mindset" (your code, which has to atomically += into a series of random grid locations, falls into the same trap).

Truly massive parallelism

A truly parallel solution may open your eyes: the single for loop above needs to be split into three steps:

std::vector<int> in = {1, 2, 3, 4};
std::vector<int> cnt(in.size());
#pragma omp parallel for
for (size_t i = 0; i < in.size(); ++i) {
  cnt[i] = in[i] % 2 == 0;
}
tbb::parallel_exclusive_scan(cnt.begin(), cnt.end());
std::vector<int> out(cnt.back() + (in.back() % 2 == 0));
#pragma omp parallel for
for (size_t i = 0; i < in.size(); ++i) {
  if (in[i] % 2 == 0)
    out[cnt[i]] = in[i];
}

Checking the condition twice in a row, isn't that inefficient? Sorry. Parallel programming often has to sacrifice some "amount of computation" to increase parallelism, and in a highly parallel environment that ends up being faster overall.

Building-Blocks

Here parallel_exclusive_scan is the "prefix sum" algorithm, which is highly parallelizable and has efficient implementations on both GPU and CPU. Commonly used parallel algorithms of this kind are called "building blocks", which is where the Building Blocks in Thread Building Blocks gets its name.
When writing a parallel algorithm, you usually decompose it into several of these building blocks first, and then implement it by directly calling the blocks in TBB. TBB is like LEGO: you pick the bricks that fit, depending on the shape of the model you want to build.
parallel_for, parallel_exclusive_scan, parallel_reduce, parallel_merge_sort, parallel_bitonic_sort, parallel_radix_sort, parallel_scatter, parallel_gather and so on are all building blocks of this kind. In your original way of thinking there is only parallel_for, the most naive building block of them all, because you have probably never even seen the other algorithms in serial programming. Not decomposing your algorithm into these highly parallelizable building blocks is the root cause of why parallel programming feels so hard to you.
By the way, the algorithm above that filters out elements matching a condition (e.g. "even") is called filter. Although it can be further decomposed into for + scan + for, it is so common that it is treated as a building block in its own right; thrust has thrust::copy_if for it (a small usage sketch follows).
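
A small illustrative use of thrust::copy_if for the same "keep the evens" filter, assuming the CUDA backend (with the OpenMP/TBB device systems the same code targets the CPU); the IsEven functor and filter_even_thrust wrapper are just names for this sketch:

```cpp
#include <thrust/device_vector.h>
#include <thrust/copy.h>
#include <vector>

struct IsEven {
    __host__ __device__ bool operator()(int x) const { return x % 2 == 0; }
};

void filter_even_thrust() {
    std::vector<int> h = {1, 2, 3, 4};
    thrust::device_vector<int> in(h.begin(), h.end());
    thrust::device_vector<int> out(in.size());               // worst case: keep everything
    auto end = thrust::copy_if(in.begin(), in.end(), out.begin(), IsEven{});
    out.resize(end - out.begin());                            // shrink to the kept count
}
```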

CPU optimization can take unorthodox routes

Of course, the building-blocks "three-step" strategy above is the most thoroughly parallel one. Its advantage is that it ports easily to CPU multithreading, CPU SIMD, GPU, multi-GPU, and multi-machine parallelism. But because it sacrifices some computation, it may actually be slower than the serial version if the CPU has only a few cores (say 2).
That is why CPU multithreading and CPU SIMD usually each get their own dedicated optimization. For the even-number filter above, for example, x86 AVX2 has a trick based on _mm_maskstore_epi32 that can give a 20x speedup on a single core (a rough sketch of that kind of SIMD compaction follows); with AVX-512 it looks different again, and CPU multithreading has its own tricks too, so you don't have to fall back on the most general "three-step" approach.
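
For reference, a rough sketch of what single-core SIMD compaction for this filter can look like. This is not the exact _mm_maskstore_epi32 trick referred to above; it uses an AVX2 lane mask plus _mm256_permutevar8x32_epi32 with a small lookup table, and assumes out has at least 7 ints of slack so the full-width store is safe:

```cpp
#include <immintrin.h>
#include <array>
#include <cstddef>

// For each 8-bit lane mask, the permutation that packs the selected lanes to the front.
static std::array<std::array<int, 8>, 256> make_compact_table() {
    std::array<std::array<int, 8>, 256> table{};
    for (int mask = 0; mask < 256; ++mask) {
        int w = 0;
        for (int lane = 0; lane < 8; ++lane)
            if (mask & (1 << lane)) table[mask][w++] = lane;
    }
    return table;
}

size_t filter_even_avx2(const int* in, size_t n, int* out) {
    static const auto table = make_compact_table();
    const __m256i one  = _mm256_set1_epi32(1);
    const __m256i zero = _mm256_setzero_si256();
    size_t i = 0, w = 0;
    for (; i + 8 <= n; i += 8) {
        __m256i v    = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in + i));
        __m256i even = _mm256_cmpeq_epi32(_mm256_and_si256(v, one), zero);  // -1 where the lane is even
        int mask     = _mm256_movemask_ps(_mm256_castsi256_ps(even));       // one bit per lane
        __m256i idx  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(table[mask].data()));
        __m256i pack = _mm256_permutevar8x32_epi32(v, idx);                 // kept lanes moved to the front
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(out + w), pack);     // extra lanes get overwritten later
        w += static_cast<size_t>(_mm_popcnt_u32(static_cast<unsigned>(mask)));
    }
    for (; i < n; ++i)                                                      // scalar tail
        if (in[i] % 2 == 0) out[w++] = in[i];
    return w;
}
```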
But if you have already written the most thoroughly parallel GPU solution (such as the "three-step" approach above), it can also be downgraded seamlessly to CPU multithreading and CPU SIMD; it just won't be as efficient on the CPU as code optimized specifically for SIMD.
Of course, the GPU has its own unorthodox routes too, such as using shared memory to run the three steps locally within a single block rather than running the most general "three-step" version globally, which wastes computation and cache locality; but if you want easy portability, building blocks are the most convenient.
So decide whether your target users are on the GPU or the CPU. By my estimate, on the GPU this algorithm gets a free 20x speedup, and about 100x after dedicated optimization. On the CPU, naive OpenMP (once the atomic problem is fixed) gets a free 2x, while deep optimization (SIMD + multithreading) can push that to 20x.

@dd123-a
Author

dd123-a commented Oct 8, 2024

OK, thank you for the guidance, Teacher Xiao Peng. With #pragma omp parallel for I do get a small efficiency gain. As for debugging, what I mainly want is to rewrite it with SIMD/AVX instructions along the lines of _mm_maskstore_epi32 (although I feel some synchronization code may be needed; perhaps the compiler's built-in locking already covers it, since I haven't hit any crashes). But now I think going with GPU acceleration would be better, so does that mean moving this part of the code into a .cu file and transferring data between the CPU and the GPU?

@archibate
Contributor

Don't copy back and forth between the CPU and the GPU. Keep everything on the GPU, and only copy back to the CPU when you need to export results (a minimal sketch of that layout follows).
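
A minimal sketch of that layout, assuming CUDA and the Particle/Cell structs from the code above; the kernel names in step() are placeholders for whatever P2G/G2P kernels you end up writing, not an existing implementation:

```cpp
#include <cuda_runtime.h>
#include <vector>

struct GpuSim {
    Particle* d_particles = nullptr;   // device-resident, allocated once
    Cell*     d_grid      = nullptr;
    size_t num_particles = 0, num_cells = 0;

    void init(const std::vector<Particle>& host_particles, size_t grid_cells) {
        num_particles = host_particles.size();
        num_cells = grid_cells;
        cudaMalloc(reinterpret_cast<void**>(&d_particles), num_particles * sizeof(Particle));
        cudaMalloc(reinterpret_cast<void**>(&d_grid), num_cells * sizeof(Cell));
        cudaMemcpy(d_particles, host_particles.data(),
                   num_particles * sizeof(Particle), cudaMemcpyHostToDevice);
    }

    void step() {
        // clear_grid<<<...>>>(d_grid, num_cells);
        // p2g_1<<<...>>>(d_particles, d_grid, num_particles);
        // p2g_2<<<...>>>(d_particles, d_grid, num_particles);
        // grid_update<<<...>>>(d_grid, num_cells);
        // g2p<<<...>>>(d_particles, d_grid, num_particles);
        // Note: no cudaMemcpy here -- the data never leaves the GPU between steps.
    }

    void download(std::vector<Particle>& host_particles) const {
        host_particles.resize(num_particles);
        cudaMemcpy(host_particles.data(), d_particles,
                   num_particles * sizeof(Particle), cudaMemcpyDeviceToHost);
    }

    ~GpuSim() { cudaFree(d_particles); cudaFree(d_grid); }
};
```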

@dd123-a
Author

dd123-a commented Oct 8, 2024

What I'm actually more curious about is how CPU SIMD instructions are implemented concretely. The blog post I read uses registers, does a series of memory alignments, and then processes the data eight at a time in batches. That's what I'd like to ask you about, Teacher Xiao Peng. Could you perhaps write a short CPU SIMD demo for the simulation function I posted? Pretty please.

@dd123-a
Author

dd123-a commented Oct 25, 2024

Teacher Xiao Peng, I'm back again:
for (int i = 0; i < particles.size(); i += 4) {
// pack the positions of four particles into one AVX-512 register
__m512 pos = _mm512_set_ps(
0.0f, particles[i + 3].pos.z, particles[i + 3].pos.y, particles[i + 3].pos.x,
0.0f, particles[i + 2].pos.z, particles[i + 2].pos.y, particles[i + 2].pos.x,
0.0f, particles[i + 1].pos.z, particles[i + 1].pos.y, particles[i + 1].pos.x,
0.0f, particles[i].pos.z, particles[i].pos.y, particles[i].pos.x
);

 __m512 vel = _mm512_set_ps(
     0.0, particles[i + 3].vel.z, particles[i + 3].vel.y, particles[i + 3].vel.x,
     0.0, particles[i + 2].vel.z, particles[i + 2].vel.y, particles[i + 2].vel.x,
     0.0, particles[i + 1].vel.z, particles[i + 1].vel.y, particles[i + 1].vel.x,
     0.0, particles[i].vel.z, particles[i].vel.y, particles[i].vel.x
 );

 // integer part (the cell index)
 __m512 intPart = _mm512_floor_ps(pos);

 // fractional part within the cell
 __m512 fracPart = _mm512_sub_ps(pos, intPart);

 // prepare a vector with all elements set to 0.5
 __m512 halfVector = _mm512_set1_ps(0.5f);
 __m512 halfVector1 = _mm512_set1_ps(0.75f);
 __m128 messVector = _mm_set1_ps(particle_mass);

 // subtract 0.5 from the fractional part
 __m512 cell_diff = _mm512_sub_ps(fracPart, halfVector);

 __m512 weights[3];

 weights[0] = _mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(halfVector, cell_diff), _mm512_sub_ps(halfVector, cell_diff)), halfVector);
 weights[1] = _mm512_sub_ps(halfVector1,_mm512_mul_ps(cell_diff, cell_diff));
 weights[2] = _mm512_mul_ps(_mm512_mul_ps(_mm512_add_ps(halfVector, cell_diff), _mm512_add_ps(halfVector, cell_diff)), halfVector);
 float tempArray[3][16];
 _mm512_store_ps(tempArray[0], weights[0]);
 _mm512_store_ps(tempArray[1], weights[1]);
 _mm512_store_ps(tempArray[2], weights[2]);
 for (int gx = 0; gx < 3; ++gx) {
     for (int gy = 0; gy < 3; ++gy) {
         for (int gz = 0; gz < 3; ++gz) {
             __m128 weight = _mm_set_ps(tempArray[gx][12] * tempArray[gy][13] * tempArray[gx][14],
                 tempArray[gx][8] * tempArray[gy][9] * tempArray[gx][10],
                 tempArray[gx][4] * tempArray[gy][5] * tempArray[gx][6],
                 tempArray[gx][0] * tempArray[gy][1] * tempArray[gz][2]);

             __m512 oneVector = _mm512_set_ps(0.0, gz - 1, gy - 1, gx - 1,
                 0.0, gz - 1, gy - 1, gx - 1,
                 0.0, gz - 1, gy - 1, gx - 1,
                 0.0, gz - 1, gy - 1, gx - 1);

             __m512 cell_pos = _mm512_add_ps(intPart, oneVector);

             __m512 cell_dist = _mm512_add_ps(_mm512_sub_ps(cell_pos, pos), halfVector);

             __m512 C[3];

             C[0] = _mm512_set_ps(
                 0.0, particles[i + 3].C[0][2], particles[i + 3].C[0][1], particles[i + 3].C[0][0],
                 0.0, particles[i + 2].C[0][2], particles[i + 2].C[0][1], particles[i + 2].C[0][0],
                 0.0, particles[i + 1].C[0][2], particles[i + 1].C[0][1], particles[i + 1].C[0][0],
                 0.0, particles[i].C[0][2], particles[i].C[0][1], particles[i].C[0][0]
             );

             C[1] = _mm512_set_ps(
                 0.0, particles[i + 3].C[1][2], particles[i + 3].C[1][1], particles[i + 3].C[1][0],
                 0.0, particles[i + 2].C[1][2], particles[i + 2].C[1][1], particles[i + 2].C[1][0],
                 0.0, particles[i + 1].C[1][2], particles[i + 1].C[1][1], particles[i + 1].C[1][0],
                 0.0, particles[i].C[1][2], particles[i].C[1][1], particles[i].C[1][0]
             );

             C[2] = _mm512_set_ps(
                 0.0, particles[i + 3].C[2][2], particles[i + 3].C[2][1], particles[i + 3].C[2][0],
                 0.0, particles[i + 2].C[2][2], particles[i + 2].C[2][1], particles[i + 2].C[2][0],
                 0.0, particles[i + 1].C[2][2], particles[i + 1].C[2][1], particles[i + 1].C[2][0],
                 0.0, particles[i].C[2][2], particles[i].C[2][1], particles[i].C[2][0]
             );

             __m512 Q = _mm512_add_ps(_mm512_add_ps(_mm512_mul_ps(cell_dist, C[0]), _mm512_mul_ps(cell_dist, C[1])), _mm512_mul_ps(cell_dist, C[2]));

             __m512 temp = _mm512_set_ps(
                 0.0, grid_res * grid_res, grid_res, 1,
                 0.0, grid_res * grid_res, grid_res, 1,
                 0.0, grid_res * grid_res, grid_res, 1,
                 0.0, grid_res * grid_res, grid_res, 1
             );

             float tempArray[16];
             __m512 cell_index_temp = _mm512_mul_ps(cell_pos, temp);
             _mm512_store_ps(tempArray, cell_index_temp);


             __m128 cell_index = _mm_set_ps(
                 tempArray[14] + tempArray[13] + tempArray[12],
                 tempArray[10] + tempArray[9] + tempArray[8],
                 tempArray[6] + tempArray[5] + tempArray[4],
                 tempArray[2] + tempArray[1] + tempArray[0]
             );

             __m128 mass_contrib = _mm_mul_ps(weight, messVector);

             float masstemp[4] = { 0 };
             float cellindextemp[4] = { 0 };
             _mm_store_ps(masstemp, mass_contrib);
             _mm_store_ps(cellindextemp, cell_index);
             if (cellindextemp[0] >= 91125 || cellindextemp[1] >= 91125 || cellindextemp[2] >= 91125 || cellindextemp[3] >= 91125||
                 cellindextemp[0] < 0 || cellindextemp[1] < 0 || cellindextemp[2] < 0 || cellindextemp[3] <0) {
                 printf("fucking");
             }

            grid[cellindextemp[0]].mass += masstemp[0];
            grid[cellindextemp[1]].mass += masstemp[1];
            grid[cellindextemp[2]].mass += masstemp[2];
            grid[cellindextemp[3]].mass += masstemp[3];


           __m512 massvector = _mm512_set_ps(
               0.0, masstemp[3], masstemp[3], masstemp[3],
               0.0, masstemp[2], masstemp[2], masstemp[2],
               0.0, masstemp[1], masstemp[1], masstemp[1],
               0.0,masstemp[0], masstemp[0], masstemp[0]
           );
           __m512 massvector1 = _mm512_set1_ps(masstemp[1]);
           __m512 massvector2 = _mm512_set1_ps(masstemp[2]);
           __m512 massvector3 = _mm512_set1_ps(masstemp[3]);
          
           float veltmp[16] = { 0 };
           _mm512_store_ps(veltmp, _mm512_mul_ps(_mm512_add_ps(vel, Q), massvector));
          
           grid[cellindextemp[0]].vel.x += veltmp[0];
           grid[cellindextemp[0]].vel.y += veltmp[1];
           grid[cellindextemp[0]].vel.z += veltmp[2];
          
           grid[cellindextemp[1]].vel.x += veltmp[4];
           grid[cellindextemp[1]].vel.y += veltmp[5];
           grid[cellindextemp[1]].vel.z += veltmp[6];
          
           grid[cellindextemp[2]].vel.x += veltmp[8];
           grid[cellindextemp[2]].vel.y += veltmp[9];
           grid[cellindextemp[2]].vel.z += veltmp[10];
          
           grid[cellindextemp[3]].vel.x += veltmp[12];
           grid[cellindextemp[3]].vel.y += veltmp[13];
           grid[cellindextemp[3]].vel.z += veltmp[14];
         }
     }
 }

}
This is my AVX-512 attempt at the P2G_1 part, but in my tests it doesn't seem to make any difference in efficiency compared to the original code. Please correct me, please tear it apart (sob).

@archibate
Contributor

archibate commented Oct 26, 2024 via email

@dd123-a
Author

dd123-a commented Oct 28, 2024

This is too hard. Taking this algorithm to AVX-512 feels way too hard; I've burned out half my brain cells and still have no idea how to optimize this part:
__m128 weight = _mm_set_ps(tempArray[gx][12] * tempArray[gy][13] * tempArray[gx][14],
tempArray[gx][8] * tempArray[gy][9] * tempArray[gx][10],
tempArray[gx][4] * tempArray[gy][5] * tempArray[gx][6],
tempArray[gx][0] * tempArray[gy][1] * tempArray[gz][2]);
