Answers
About these answers
If you can, try to write the programs and work through the problems yourself before reading the answers; doing so will give you a much deeper understanding of the material.
OpenMP Lab Answers
- Write a Hello World program that prints in parallel with OpenMP, including each thread's id. Run it several times and observe how the output changes.

Reference code:
#include <omp.h>
#include <stdio.h>

int main() {
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        printf("Hello from thread %d\n", tid);
    }
    return 0;
}
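The team size can also be requested explicitly. A minimal variant, not part of the original answer, assuming an OpenMP-capable compiler (e.g. gcc with -fopenmp); it uses the num_threads clause and reports the actual team size:

#include <omp.h>
#include <stdio.h>

int main() {
    // Request a team of 4 threads; the runtime may still provide fewer.
    #pragma omp parallel num_threads(4)
    {
        int tid = omp_get_thread_num();
        int nthreads = omp_get_num_threads();  // actual team size
        printf("Hello from thread %d of %d\n", tid, nthreads);
    }
    return 0;
}

Without the clause, the team size falls back to the OMP_NUM_THREADS environment variable or the number of available cores, which is why repeated runs interleave the output lines differently.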
- Write a program that performs a matrix computation and add OpenMP parallelism to it. Compare the computation time before and after enabling OpenMP: what changes?

Reference code (uncomment the pragma in front of the timed loop to enable OpenMP):
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N 600

// static: three 600x600 float matrices (~4.3 MB in total) are too large
// to place safely on the stack
static float A[N][N], B[N][N], C[N][N];

int main() {
    #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i][j] = 1.0f;
            B[i][j] = 1.0f;
            C[i][j] = 0.0f;
        }
    }

    double begin = omp_get_wtime();
    // #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            C[i][j] = 0;
            for (int k = 0; k < N; k++) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
    double elapsed = omp_get_wtime() - begin;
    printf("Computation time: %.6f s\n", elapsed);
    return 0;
}
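Beyond enabling the pragma, the loop order itself matters: with k innermost, B is traversed column by column and cache behavior suffers. A hedged sketch of the common i-k-j loop interchange, not part of the original answer, written as a drop-in replacement for the timed loop above (it relies on C being zeroed beforehand, as the answer does):

// i-k-j order: the innermost loop walks B and C along contiguous rows
#pragma omp parallel for
for (int i = 0; i < N; i++) {
    for (int k = 0; k < N; k++) {
        float a = A[i][k];
        for (int j = 0; j < N; j++) {
            C[i][j] += a * B[k][j];
        }
    }
}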
- Omitted.
- Estimate \(\pi\) with a Monte Carlo method. Implement a serial version first, then turn it into an OpenMP parallel version, and compare the running times for a very large number of trials.

Reference code:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM_POINTS 100000000

int main() {
    printf("Total points = %d\n", NUM_POINTS);
    srand(time(NULL));

    double start_time, end_time;
    double pi_estimate;
    long num_inside;

    // ============ Serial version ============
    start_time = omp_get_wtime();
    num_inside = 0;
    for (long i = 0; i < NUM_POINTS; i++) {
        double x = (double)rand() / RAND_MAX;
        double y = (double)rand() / RAND_MAX;
        if (x*x + y*y <= 1.0) {
            num_inside++;
        }
    }
    pi_estimate = 4.0 * num_inside / NUM_POINTS;
    end_time = omp_get_wtime();
    printf("\nSerial version:\n");
    printf("pi estimate = %.8f\n", pi_estimate);
    printf("time = %.6f s\n", end_time - start_time);

    // ============ Parallel version ============
    num_inside = 0;
    start_time = omp_get_wtime();
    #pragma omp parallel
    {
        // each thread gets its own seed; rand() is not thread-safe,
        // so the reentrant rand_r() is used instead
        unsigned int seed = time(NULL) ^ omp_get_thread_num();
        long local_inside = 0;
        #pragma omp for
        for (long i = 0; i < NUM_POINTS; i++) {
            double x = (double)rand_r(&seed) / RAND_MAX;
            double y = (double)rand_r(&seed) / RAND_MAX;
            if (x*x + y*y <= 1.0) {
                local_inside++;
            }
        }
        // critical section protects the shared counter
        #pragma omp critical
        {
            num_inside += local_inside;
        }
    }
    pi_estimate = 4.0 * num_inside / NUM_POINTS;
    end_time = omp_get_wtime();
    printf("\nParallel version:\n");
    printf("pi estimate = %.8f\n", pi_estimate);
    printf("time = %.6f s\n", end_time - start_time);

    // ============ Parallel version with reduction ============
    num_inside = 0;
    start_time = omp_get_wtime();
    #pragma omp parallel reduction(+:num_inside)
    {
        unsigned int seed = time(NULL) ^ omp_get_thread_num();
        #pragma omp for
        for (long i = 0; i < NUM_POINTS; i++) {
            double x = (double)rand_r(&seed) / RAND_MAX;
            double y = (double)rand_r(&seed) / RAND_MAX;
            if (x*x + y*y <= 1.0) {
                num_inside++;
            }
        }
    }
    pi_estimate = 4.0 * num_inside / NUM_POINTS;
    end_time = omp_get_wtime();
    printf("\nWith reduction:\n");
    printf("pi estimate = %.8f\n", pi_estimate);
    printf("time = %.6f s\n", end_time - start_time);

    return 0;
}
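Between the critical-section and reduction versions there is a third option the answer does not show: a single atomic update per thread. A sketch, written as a drop-in for the critical block above:

// one atomic add per thread instead of a critical section;
// cheaper than critical for a single scalar update
#pragma omp atomic
num_inside += local_inside;

Since each thread performs only one shared update here, all three variants perform similarly; reduction wins mainly when the update sits inside a hot loop.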
MPI Lab Answers
- Write an MPI Hello World program and print each process's ID.

Reference code:
#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv) {
    // initialize the MPI environment
    MPI_Init(&argc, &argv);

    int world_size;                              // total number of processes
    int world_rank;                              // rank (ID) of this process
    char processor_name[MPI_MAX_PROCESSOR_NAME]; // processor name
    int name_len;                                // name length

    // get the total process count
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    // get this process's rank
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    // get the processor name
    MPI_Get_processor_name(processor_name, &name_len);

    printf("Hello world from processor %s, rank %d out of %d processors\n",
           processor_name, world_rank, world_size);

    // clean up the MPI environment
    MPI_Finalize();
    return 0;
}
- Send a piece of data from process 0 to process 1.

Reference code:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    if (world_size < 2) {
        if (world_rank == 0) {
            printf("At least 2 processes are required\n");
        }
        MPI_Finalize();
        return 1;
    }

    if (world_rank == 0) {
        int send_value = 42;
        MPI_Send(&send_value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        printf("Process 0 sent value %d to process 1\n", send_value);
    } else if (world_rank == 1) {
        int recv_value;
        MPI_Recv(&recv_value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("Process 1 received value %d\n", recv_value);
    }

    MPI_Finalize();
    return 0;
}
- Broadcast a chunk of data and print the values each process receives.

Reference code:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int* data = NULL;
    int data_size;

    if (world_rank == 0) {
        // the root process initializes the data
        data_size = 5;
        data = (int*)malloc(data_size * sizeof(int));
        for (int i = 0; i < data_size; i++) data[i] = i + 1;
        printf("Process 0 broadcasts: ");
        for (int i = 0; i < data_size; i++) printf("%d ", data[i]);
        printf("\n");
    }

    // broadcast the data size first, so every process can allocate a buffer
    MPI_Bcast(&data_size, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // allocate the buffer on the non-root processes
    if (world_rank != 0) {
        data = (int*)malloc(data_size * sizeof(int));
    }

    // broadcast the actual data
    MPI_Bcast(data, data_size, MPI_INT, 0, MPI_COMM_WORLD);

    // every process prints what it received
    printf("Process %d received: ", world_rank);
    for (int i = 0; i < data_size; i++) printf("%d ", data[i]);
    printf("\n");

    free(data);
    MPI_Finalize();
    return 0;
}
- Write a program that computes the dot product of two vectors and parallelize it with MPI. Hint: use MPI_Scatter and MPI_Reduce.

Reference code (assumes the vector length is divisible by the process count; see the sketch after the code for the general case):
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int vector_size = 1000000;
    const int local_size = vector_size / size;  // assumes divisibility

    float *local_A = (float*)malloc(local_size * sizeof(float));
    float *local_B = (float*)malloc(local_size * sizeof(float));
    float *full_A = NULL;
    float *full_B = NULL;

    if (rank == 0) {
        full_A = (float*)malloc(vector_size * sizeof(float));
        full_B = (float*)malloc(vector_size * sizeof(float));
        // initialize the full vectors with a fixed seed (for reproducibility)
        srand(12345);
        for (int i = 0; i < vector_size; i++) {
            full_A[i] = (float)rand() / RAND_MAX;
            full_B[i] = (float)rand() / RAND_MAX;
        }
    }

    // ============ Parallel computation ============
    MPI_Scatter(full_A, local_size, MPI_FLOAT, local_A, local_size, MPI_FLOAT,
                0, MPI_COMM_WORLD);
    MPI_Scatter(full_B, local_size, MPI_FLOAT, local_B, local_size, MPI_FLOAT,
                0, MPI_COMM_WORLD);

    double local_dot = 0.0;
    for (int i = 0; i < local_size; i++) {
        local_dot += (double)local_A[i] * (double)local_B[i];
    }

    double global_dot;
    MPI_Reduce(&local_dot, &global_dot, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    // verify the result on the root process
    if (rank == 0) {
        // ============ Serial computation ============
        double serial_dot = 0.0;
        for (int i = 0; i < vector_size; i++) {
            serial_dot += (double)full_A[i] * (double)full_B[i];
        }
        double abs_error = fabs(global_dot - serial_dot);
        printf("Parallel dot product: %.16f\n", global_dot);
        printf("Serial dot product:   %.16f\n", serial_dot);
        printf("Absolute error:       %.6e\n", abs_error);
        free(full_A);
        free(full_B);
    }

    free(local_A);
    free(local_B);
    MPI_Finalize();
    return 0;
}
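When vector_size is not divisible by the process count, the code above silently drops the tail elements. A hedged sketch of the general case with MPI_Scatterv; the helper name scatter_uneven is hypothetical, not part of the original answer:

#include <mpi.h>
#include <stdlib.h>

/* Every rank computes the same counts/displs arrays, so each knows both
 * the layout of the full vector and its own share. The first
 * (vector_size % size) ranks receive one extra element. */
float* scatter_uneven(float* full, int vector_size, int rank, int size,
                      int* my_count) {
    int* counts = (int*)malloc(size * sizeof(int));
    int* displs = (int*)malloc(size * sizeof(int));
    int base = vector_size / size;
    int rem = vector_size % size;
    int offset = 0;
    for (int r = 0; r < size; r++) {
        counts[r] = base + (r < rem ? 1 : 0);
        displs[r] = offset;
        offset += counts[r];
    }
    float* local = (float*)malloc(counts[rank] * sizeof(float));
    MPI_Scatterv(full, counts, displs, MPI_FLOAT,
                 local, counts[rank], MPI_FLOAT, 0, MPI_COMM_WORLD);
    *my_count = counts[rank];
    free(counts);
    free(displs);
    return local;
}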
- Estimate \(\pi\) with a Monte Carlo method, parallelized with MPI.

Reference code:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    const long total_samples = 100000000;
    // spread the remainder over the low ranks so the per-process counts
    // sum to total_samples even when it is not divisible by world_size
    long samples_per_proc = total_samples / world_size;
    if (world_rank < total_samples % world_size) {
        samples_per_proc++;
    }

    // a different seed per process
    srand(time(NULL) + world_rank);

    // local count
    long local_count = 0;
    for (long i = 0; i < samples_per_proc; i++) {
        double x = (double)rand() / RAND_MAX * 2.0 - 1.0; // [-1, 1]
        double y = (double)rand() / RAND_MAX * 2.0 - 1.0; // [-1, 1]
        if (x*x + y*y <= 1.0) {
            local_count++;
        }
    }

    long global_count;
    MPI_Reduce(&local_count, &global_count, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD);

    // the root process computes the result
    if (world_rank == 0) {
        double pi_estimate = 4.0 * (double)global_count / total_samples;
        printf("Estimated pi = %.10f\n", pi_estimate);
        printf("Actual pi    = %.10f\n", M_PI);
        printf("Samples      = %ld\n", total_samples);
    }

    MPI_Finalize();
    return 0;
}
mpi4py Lab Answers
- Hello, mpi4py.

Reference code:
from mpi4py import MPI

def main():
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    # every process prints its own information
    print(f"Hello, MPI4Py! I'm process {rank} of {size}")

if __name__ == "__main__":
    main()
- Point-to-point communication in mpi4py (ping-pong).

Reference code:
from mpi4py import MPI
import time

def main():
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    # make sure there are at least 2 processes
    if size < 2:
        if rank == 0:
            print("Error: Need at least 2 processes")
        return

    MAX_ITER = 3  # number of ping-pong rounds
    status = MPI.Status()

    if rank == 0:
        # process 0: send ping, receive pong
        for i in range(MAX_ITER):
            comm.send("ping", dest=1, tag=1)
            print(f"[{rank}] Sent 'ping' to 1")
            msg = comm.recv(source=1, tag=2, status=status)
            print(f"[{rank}] Received '{msg}' from 1")
            time.sleep(1)
    elif rank == 1:
        # process 1: receive the ping first, then reply with pong;
        # replying only after the ping arrives keeps the exchange well
        # ordered instead of relying on MPI's internal send buffering
        for i in range(MAX_ITER):
            msg = comm.recv(source=0, tag=1, status=status)
            print(f"[{rank}] Received '{msg}' from 0")
            comm.send("pong", dest=0, tag=2)
            print(f"[{rank}] Sent 'pong' to 0")
            time.sleep(1)

if __name__ == "__main__":
    main()
- Collective communication in mpi4py.

Reference code (meant to be launched with 4 processes, e.g. mpiexec -n 4; the expected output is shown after the code):
from mpi4py import MPI
import numpy as np

def mpi_np_gather(send_buffer: np.ndarray) -> np.ndarray:
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    # local number of rows and columns
    rows, cols = send_buffer.shape

    # collect the row count of every process
    all_rows = comm.allgather(rows)

    # element counts per process (rows x cols)
    recv_counts = [r * cols for r in all_rows]

    # displacements into the receive buffer
    displacements = [0]
    for i in range(1, size):
        displacements.append(displacements[i-1] + recv_counts[i-1])

    # allocate the receive buffer
    total_elements = sum(recv_counts)
    recv_buffer = np.empty(total_elements, dtype=send_buffer.dtype)

    # perform the Allgatherv
    comm.Allgatherv(
        send_buffer.flatten(),                       # local data, flattened to 1-D
        [recv_buffer, (recv_counts, displacements)]  # receive buffer spec
    )

    # reshape the flat receive buffer back to 2-D
    return recv_buffer.reshape(-1, cols)

if __name__ == "__main__":
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    if rank == 0:
        data = np.array([[0, 1]])
    elif rank == 1:
        data = np.array([[1, 0], [2, 3]])
    elif rank == 2:
        data = 7 * np.ones((3, 2), dtype=int)
    elif rank == 3:
        data = np.array([[114, 514]])
    else:
        # fallback so the demo does not crash with more than 4 processes
        data = np.zeros((1, 2), dtype=int)
    result = mpi_np_gather(data)
    if rank == 3:
        print("Gathered result:")
        print(result)
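With exactly 4 processes the result is deterministic: the per-rank blocks are concatenated in rank order. On a typical build where the default integer dtype is 64-bit, rank 3 should print:

Gathered result:
[[  0   1]
 [  1   0]
 [  2   3]
 [  7   7]
 [  7   7]
 [  7   7]
 [114 514]]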
Vectorization Lab Answers
- The "code hints" section gives a vectorized matrix-addition program; apply a 4x loop unrolling to it.

Reference code:
#include <immintrin.h>

void matrix_add_unrolled(const float* A, const float* B, float* C, int M, int N) {
    int total = M * N;
    int i = 0;

    // main loop: 4x unrolled, 64 floats per iteration
    for (; i <= total - 64; i += 64) {
        __m512 a0 = _mm512_loadu_ps(&A[i]);
        __m512 a1 = _mm512_loadu_ps(&A[i + 16]);
        __m512 a2 = _mm512_loadu_ps(&A[i + 32]);
        __m512 a3 = _mm512_loadu_ps(&A[i + 48]);

        __m512 b0 = _mm512_loadu_ps(&B[i]);
        __m512 b1 = _mm512_loadu_ps(&B[i + 16]);
        __m512 b2 = _mm512_loadu_ps(&B[i + 32]);
        __m512 b3 = _mm512_loadu_ps(&B[i + 48]);

        __m512 c0 = _mm512_add_ps(a0, b0);
        __m512 c1 = _mm512_add_ps(a1, b1);
        __m512 c2 = _mm512_add_ps(a2, b2);
        __m512 c3 = _mm512_add_ps(a3, b3);

        _mm512_storeu_ps(&C[i], c0);
        _mm512_storeu_ps(&C[i + 16], c1);
        _mm512_storeu_ps(&C[i + 32], c2);
        _mm512_storeu_ps(&C[i + 48], c3);
    }

    // remaining full 16-float blocks (fewer than 64 elements left)
    for (; i <= total - 16; i += 16) {
        __m512 a = _mm512_loadu_ps(&A[i]);
        __m512 b = _mm512_loadu_ps(&B[i]);
        _mm512_storeu_ps(&C[i], _mm512_add_ps(a, b));
    }

    // scalar tail for the last (total % 16) elements, so the vector loops
    // never read or write past the end of the arrays
    for (; i < total; i++) {
        C[i] = A[i] + B[i];
    }
}
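The scalar tail can also be handled in a single masked AVX-512 iteration. A hedged sketch; the helper name add_tail_masked is not from the original answer:

#include <immintrin.h>

// Processes the final (total % 16) floats of C = A + B with one masked
// load/store pair instead of a scalar loop. Assumes i is where the
// 16-wide loops stopped, so 0 <= total - i < 16.
static void add_tail_masked(const float* A, const float* B, float* C,
                            int i, int total) {
    int rem = total - i;
    if (rem <= 0) return;
    __mmask16 m = (__mmask16)((1u << rem) - 1);  // enable the low rem lanes
    __m512 a = _mm512_maskz_loadu_ps(m, &A[i]);
    __m512 b = _mm512_maskz_loadu_ps(m, &B[i]);
    _mm512_mask_storeu_ps(&C[i], m, _mm512_add_ps(a, b));
}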
- Compute the Sigmoid function 1/(1+exp(-x)) of an array arr.

Reference code (note: _mm512_expm1_ps is an SVML intrinsic, available with the Intel compilers but not with plain gcc/clang):
#include <immintrin.h>

void vector_sigmoid(float* arr, int n) {
    const __m512 vOne = _mm512_set1_ps(1.0f);
    for (int i = 0; i < n; i += 16) {
        // mask off the lanes past the end of the array on the last iteration
        __mmask16 m = (n - i >= 16) ? (__mmask16)0xFFFF
                                    : (__mmask16)((1u << (n - i)) - 1);
        __m512 x = _mm512_maskz_loadu_ps(m, arr + i);

        // exp(-x) = 1 + expm1(-x); expm1 is the more accurate form near zero
        __m512 exp_neg = _mm512_expm1_ps(_mm512_sub_ps(_mm512_setzero_ps(), x));
        exp_neg = _mm512_add_ps(exp_neg, vOne);

        // sigmoid = 1 / (1 + exp(-x))
        __m512 denom = _mm512_add_ps(vOne, exp_neg);
        __m512 result = _mm512_div_ps(vOne, denom);

        _mm512_mask_storeu_ps(arr + i, m, result);
    }
}
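Since the SVML intrinsic limits portability, here is a hedged scalar fallback (scalar_sigmoid is a hypothetical name, not part of the original answer):

#include <math.h>

// Portable fallback for toolchains without SVML (e.g. plain gcc/clang):
// the same sigmoid written scalarly; at -O2 a modern compiler will often
// auto-vectorize this loop on its own.
void scalar_sigmoid(float* arr, int n) {
    for (int i = 0; i < n; i++) {
        arr[i] = 1.0f / (1.0f + expf(-arr[i]));
    }
}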
- Implement a 16x16 matrix multiplication using vectorization.

Reference code:
#include <iostream>
#include <cmath>
#include <immintrin.h>

const int N = 16;

// baseline version
void matrix_mul_basic(float A[N][N], float B[N][N], float C[N][N]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}

// AVX-512 version: each 16-float row of C fits exactly in one 512-bit
// register. Row i of C is accumulated as sum over k of A[i][k] * B[k][:],
// i.e. one broadcast + FMA per (i, k) pair, with no horizontal reduction.
void matrix_mul_avx512(float A[N][N], float B[N][N], float C[N][N]) {
    __m512 c_rows[N];
    for (int i = 0; i < N; i++) {
        c_rows[i] = _mm512_setzero_ps();
    }
    for (int k = 0; k < N; k++) {
        __m512 b_row = _mm512_loadu_ps(&B[k][0]);
        for (int i = 0; i < N; i++) {
            __m512 a_broadcast = _mm512_set1_ps(A[i][k]);
            c_rows[i] = _mm512_fmadd_ps(a_broadcast, b_row, c_rows[i]);
        }
    }
    for (int i = 0; i < N; i++) {
        _mm512_storeu_ps(&C[i][0], c_rows[i]);
    }
}

// check that both versions agree
bool verify_results(float C1[N][N], float C2[N][N], float epsilon = 1e-5f) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            if (std::fabs(C1[i][j] - C2[i][j]) > epsilon) {
                std::cerr << "Mismatch at (" << i << "," << j << "): "
                          << C1[i][j] << " vs " << C2[i][j] << std::endl;
                return false;
            }
        }
    }
    return true;
}

int main() {
    float A[N][N], B[N][N];
    float C_basic[N][N], C_avx[N][N];

    // initialize the matrices
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i][j] = static_cast<float>(i * N + j);
            B[i][j] = static_cast<float>(i - j);
        }
    }

    matrix_mul_basic(A, B, C_basic);
    matrix_mul_avx512(A, B, C_avx);

    // verify the results
    if (verify_results(C_basic, C_avx)) {
        std::cout << "Verification PASSED" << std::endl;
    } else {
        std::cout << "Verification FAILED" << std::endl;
    }
    return 0;
}