跳转至

答案

约 655 个字 587 行代码 预计阅读时间 10 分钟

关于实验答案

如果条件允许,建议在看答案前先自行编写程序并解决可能存在的问题,这样可以让你对知识有更加深入的理解。


OpenMP实验答案

  1. 编写一个Hello World程序,尝试使用OpenMP并行输出,并且输出其线程id。多运行几次,观察输出结果的变化。

    参考代码
    #include <omp.h>
    #include <stdio.h>
    
    int main()
    {
        #pragma omp parallel
        {
            int tid = omp_get_thread_num();
            printf("Hello from thread %d\n", tid);
        }
        return 0;
    }
    
  2. 编写一个进行矩阵计算的程序,并给它添加OpenMP并行,观察添加OpenMP前后的计算时间,有什么变化?

    参考代码

    删去注释即可启动OpenMP并行

    #include <omp.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    const int N = 600;
    
    int main()
    {
        float A[N][N], B[N][N], C[N][N];
        #pragma omp parallel for
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                A[i][j] = 1.0f;
                B[i][j] = 1.0f;
                C[i][j] = 0.0f;
            }
        }
    
        double begin = omp_get_wtime();
    
        // #pragma omp parallel for
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                C[i][j] = 0;
                for (int k = 0; k < N; k++) {
                    C[i][j] += A[i][k] * B[k][j];
                }
            }
        }
    
        double end = omp_get_wtime()-begin;
        printf("计算时间: %.6f 秒\n", end);
    
        return 0;
    }
    
  3. 蒙特卡洛算法估算\(\pi\)。尝试实现串行版本代码,并修改为openmp并行版本,在很大试验次数的情况下对比运行时间。

    参考代码
    #include <omp.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    
    #define NUM_POINTS 100000000
    
    int main() {
    
        printf("总点数 = %d\n", NUM_POINTS);
    
        srand(time(NULL));
    
        double start_time, end_time;
        double pi_estimate;
        int num_inside;
    
        // ============串行版本============
        start_time = omp_get_wtime();
        num_inside = 0;
    
        for (long i = 0; i < NUM_POINTS; i++) {
            double x = (double)rand() / RAND_MAX;
            double y = (double)rand() / RAND_MAX;
    
            if (x*x + y*y <= 1.0) {
                num_inside++;
            }
        }
    
        pi_estimate = 4.0 * num_inside / NUM_POINTS;
        end_time = omp_get_wtime();
    
        printf("\n串行版本:\n");
        printf("π估计值 = %.8f\n", pi_estimate);
        printf("计算时间 = %.6f 秒\n", end_time - start_time);
    
    
    
        // ============并行版本============
        num_inside = 0;
        start_time = omp_get_wtime();
    
        #pragma omp parallel
        {
            // 每个线程有自己的随机数种子
            unsigned int seed = time(NULL) ^ omp_get_thread_num();
    
            int local_inside = 0;
    
            #pragma omp for
            for (int i = 0; i < NUM_POINTS; i++) {
                double x = (double)rand_r(&seed) / RAND_MAX;
                double y = (double)rand_r(&seed) / RAND_MAX;
    
                if (x*x + y*y <= 1.0) {
                    local_inside++;
                }
            }
    
            // 临界区保护共享变量
            #pragma omp critical
            {
                num_inside += local_inside;
            }
        }
    
        pi_estimate = 4.0 * num_inside / NUM_POINTS;
        end_time = omp_get_wtime();
    
        printf("\n并行版本:\n");
        printf("π估计值 = %.8f\n", pi_estimate);
        printf("计算时间 = %.6f 秒\n", end_time - start_time);
    
    
    
    
        // ============使用归约操作的并行版本============
        num_inside = 0;
        start_time = omp_get_wtime();
    
        #pragma omp parallel reduction(+:num_inside)
        {
            unsigned int seed = time(NULL) ^ omp_get_thread_num();
            #pragma omp for
            for (long i = 0; i < NUM_POINTS; i++) {
                double x = (double)rand_r(&seed) / RAND_MAX;
                double y = (double)rand_r(&seed) / RAND_MAX;
    
                if (x*x + y*y <= 1.0) {
                    num_inside++;
                }
            }
        }
        pi_estimate = 4.0 * num_inside / NUM_POINTS;
        end_time = omp_get_wtime();
    
        printf("\n使用归约操作:\n");
        printf("π估计值 = %.8f\n", pi_estimate);
        printf("计算时间 = %.6f 秒\n", end_time - start_time);
    
        return 0;
    }
    

MPI实验答案

  1. 尝试写一个MPI的Hello world程序,并输出每个进程的ID。

    参考代码
    #include <mpi.h>
    #include <stdio.h>
    
    int main(int argc, char** argv) {
    
        // 初始化MPI环境
        MPI_Init(&argc, &argv);
    
        int world_size;  // 总进程数
        int world_rank;  // 当前进程的排名(ID)
        char processor_name[MPI_MAX_PROCESSOR_NAME]; // 处理器名称
        int name_len;    // 名称长度
    
        // 获取总进程数
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    
        // 获取当前进程的排名
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    
        // 获取处理器名称
        MPI_Get_processor_name(processor_name, &name_len);
    
    
        printf("Hello world from processor %s, rank %d out of %d processors\n",
            processor_name, world_rank, world_size);
    
        // 清理MPI环境
        MPI_Finalize();
    
        return 0;
    }
    
  2. 尝试从进程0发送一个数据到进程1。

    参考代码
    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    int main(int argc, char** argv) {
        MPI_Init(&argc, &argv);
    
        int world_size, world_rank;
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    
        if (world_size < 2) {
            if (world_rank == 0) {
                printf("需要至少2个进程\n");
            }
            MPI_Finalize();
            return 1;
        }
    
        if (world_rank == 0) {
            int send_value = 42;
            MPI_Send(&send_value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
            printf("进程 0 发送了值: %d 给进程 1\n", send_value);
        } else if (world_rank == 1) {
            int recv_value;
            MPI_Recv(&recv_value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("进程 1 收到了值: %d\n", recv_value);
        }
    
        MPI_Finalize();
        return 0;
    }
    
  3. 尝试广播一段数据,并输出接收到的值。

    参考代码
    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    int main(int argc, char** argv) {
        MPI_Init(&argc, &argv);
    
        int world_size, world_rank;
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    
        int* data = NULL;
        int data_size;
    
        if (world_rank == 0) {
            // 主进程初始化数据
            data_size = 5;
            data = (int*)malloc(data_size * sizeof(int));
            for (int i = 0; i < data_size; i++) data[i] = i + 1;
    
            printf("进程 0 广播数据: ");
            for (int i = 0; i < data_size; i++) printf("%d ", data[i]);
            printf("\n");
        }
    
        // 广播数据大小
        MPI_Bcast(&data_size, 1, MPI_INT, 0, MPI_COMM_WORLD);
    
        // 分配缓冲区
        if (world_rank != 0) {
            data = (int*)malloc(data_size * sizeof(int));
        }
    
        // 广播实际数据
        MPI_Bcast(data, data_size, MPI_INT, 0, MPI_COMM_WORLD);
    
        // 所有进程打印接收的数据
        printf("进程 %d 接收数据: ", world_rank);
        for (int i = 0; i < data_size; i++) printf("%d ", data[i]);
        printf("\n");
    
        free(data);
        MPI_Finalize();
        return 0;
    }
    
  4. 尝试编写一个计算向量点积的程序,并使用MPI进行并行。提示:使用MPI_Scatter和MPI_Reduce

    参考代码
    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <time.h>
    
    int main(int argc, char** argv) {
        MPI_Init(&argc, &argv);
    
        int rank, size;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
    
        const int vector_size = 1000000;
        const int local_size = vector_size / size;
    
        float *local_A = (float*)malloc(local_size * sizeof(float));
        float *local_B = (float*)malloc(local_size * sizeof(float));
    
        float *full_A = NULL;
        float *full_B = NULL;
    
        if (rank == 0) {
            full_A = (float*)malloc(vector_size * sizeof(float));
            full_B = (float*)malloc(vector_size * sizeof(float));
    
            // 使用固定种子初始化完整向量(保证可重复性)
            srand(12345);
            for (int i = 0; i < vector_size; i++) {
                full_A[i] = (float)rand() / RAND_MAX;
                full_B[i] = (float)rand() / RAND_MAX;
            }
        }
    
        //============并行计算============
    
        MPI_Scatter(full_A, local_size, MPI_FLOAT, local_A, local_size, MPI_FLOAT, 0, MPI_COMM_WORLD);
        MPI_Scatter(full_B, local_size, MPI_FLOAT, local_B, local_size, MPI_FLOAT, 0, MPI_COMM_WORLD);
    
        double local_dot = 0.0;
        for (int i = 0; i < local_size; i++) {
            local_dot += (double)local_A[i] * (double)local_B[i];
        }
    
        double global_dot;
        MPI_Reduce(&local_dot, &global_dot, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    
        // 验证结果
        if (rank == 0) {
    
            //============串行计算============
            double serial_dot = 0.0;
            for (int i = 0; i < vector_size; i++) {
                serial_dot += (double)full_A[i] * (double)full_B[i];
            }
    
            double abs_error = fabs(global_dot - serial_dot);
    
            printf("并行点积: %.16f\n", global_dot);
            printf("串行点积: %.16f\n", serial_dot);
            printf("绝对误差: %.6e\n", abs_error);
    
            free(full_A);
            free(full_B);
        }
    
        free(local_A);
        free(local_B);
    
        MPI_Finalize();
        return 0;
    }
    
  5. 蒙特卡洛算法估算\(\pi\),请使用MPI进行并行优化。

    参考代码
    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <time.h>
    
    int main(int argc, char** argv) {
        MPI_Init(&argc, &argv);
    
        int world_size, world_rank;
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    
        const long total_samples = 100000000;
        long samples_per_proc = total_samples / world_size;
    
        srand(time(NULL) + world_rank);
    
        // 局部计数
        long local_count = 0;
        for (long i = 0; i < samples_per_proc; i++) {
            double x = (double)rand() / RAND_MAX * 2.0 - 1.0;  // [-1, 1]
            double y = (double)rand() / RAND_MAX * 2.0 - 1.0;  // [-1, 1]
            if (x*x + y*y <= 1.0) {
                local_count++;
            }
        }
    
        long global_count;
        MPI_Reduce(&local_count, &global_count, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
    
        // 主进程计算结果
        if (world_rank == 0) {
            double pi_estimate = 4.0 * (double)global_count / total_samples;
    
            printf("估算π = %.10f\n", pi_estimate);
            printf("实际π = %.10f\n", M_PI);
            printf("样本数 = %ld\n", total_samples);
        }
    
        MPI_Finalize();
        return 0;
    }
    

mpi4py实验答案

  1. Hello, mpi4py.

    参考代码
    from mpi4py import MPI
    
    def main():
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()
    
        # 每个进程打印自己的信息
        print(f"Hello, MPI4Py! I'm process {rank} of {size}")
    
    if __name__ == "__main__":
        main()
    
  2. mpi4py库中的点对点通信(ping - pong)

    参考代码
    from mpi4py import MPI
    import time
    import numpy as np
    
    def main():
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()
    
        # 确保至少有2个进程
        if size < 2:
            if rank == 0:
                print("Error: Need at least 2 processes")
            return
    
        MAX_ITER = 3  # 设置交互次数
        status = MPI.Status()
    
        if rank == 0:
            # 进程0:发送ping,接收pong
            for i in range(MAX_ITER):
    
                # 发送ping消息给进程1
                comm.send("ping", dest=1, tag=1)
                print(f"[{rank}] Sent 'ping' to 1")
    
                # 接收pong回复
                msg = comm.recv(source=1, tag=2, status=status)
                print(f"[{rank}] Received '{msg}' from 1")
    
                time.sleep(1)
    
        elif rank == 1:
            # 进程1:发送pong,接收ping
            for i in range(MAX_ITER):
    
                # 发送pong消息给进程0
                comm.send("pong", dest=0, tag=2)
                print(f"[{rank}] Sent 'pong' to 0")
    
                # 接收ping消息
                msg = comm.recv(source=0, tag=1, status=status)
                print(f"[{rank}] Received '{msg}' from 0")
    
                time.sleep(1)
    
    
    if __name__ == "__main__":
        main()
    
  3. mpi4py库中的集合通信

    参考代码
    from mpi4py import MPI
    import numpy as np
    
    def mpi_np_gather(send_buffer: np.ndarray) -> np.ndarray:
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()
    
        # 获取本地数据的行数和列数
        rows, cols = send_buffer.shape
    
        # 收集所有进程的行数
        all_rows = comm.allgather(rows)
    
        # 计算每个进程的元素数量(行数×列数)
        send_count = rows * cols
        recv_counts = [r * cols for r in all_rows]
    
        # 计算接收缓冲区中的位移
        displacements = [0]
        for i in range(1, size):
            displacements.append(displacements[i-1] + recv_counts[i-1])
    
        # 创建接收缓冲区
        total_elements = sum(recv_counts)
        recv_buffer = np.empty(total_elements, dtype=send_buffer.dtype)
    
        # 执行Allgatherv操作
        comm.Allgatherv(
            send_buffer.flatten(),  # 本地发送数据(展平为一维)
            [recv_buffer, (recv_counts, displacements)]  # 接收缓冲区配置
        )
    
        # 将一维接收缓冲区重塑为二维数组
        return recv_buffer.reshape(-1, cols)
    
    
    if __name__ == "__main__":
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
    
        if rank == 0:
            data = np.array([[0, 1]])
        elif rank == 1:
            data = np.array([[1, 0], [2, 3]])
        elif rank == 2:
            data = 7 * np.ones((3, 2), dtype=int)
        elif rank == 3:
            data = np.array([[114, 514]])
    
        result = mpi_np_gather(data)
    
        if rank == 3:
            print("Gathered result:")
            print(result)
    

向量化实验答案

  1. 在“代码提示”内给出了一段矩阵加法的向量化程序,请你对其进行4倍循环展开。

    参考代码
    #include <immintrin.h>
    
    void matrix_add_unrolled(const float* A, const float* B, float* C, int M, int N) {
        int total = M * N;
        int i = 0;
    
        for (; i <= total - 64; i += 64) {
            __m512 a0 = _mm512_loadu_ps(&A[i]);
            __m512 a1 = _mm512_loadu_ps(&A[i + 16]);
            __m512 a2 = _mm512_loadu_ps(&A[i + 32]);
            __m512 a3 = _mm512_loadu_ps(&A[i + 48]);
    
            __m512 b0 = _mm512_loadu_ps(&B[i]);
            __m512 b1 = _mm512_loadu_ps(&B[i + 16]);
            __m512 b2 = _mm512_loadu_ps(&B[i + 32]);
            __m512 b3 = _mm512_loadu_ps(&B[i + 48]);
    
            __m512 c0 = _mm512_add_ps(a0, b0);
            __m512 c1 = _mm512_add_ps(a1, b1);
            __m512 c2 = _mm512_add_ps(a2, b2);
            __m512 c3 = _mm512_add_ps(a3, b3);
    
            _mm512_storeu_ps(&C[i], c0);
            _mm512_storeu_ps(&C[i + 16], c1);
            _mm512_storeu_ps(&C[i + 32], c2);
            _mm512_storeu_ps(&C[i + 48], c3);
        }
    
        // 处理剩余元素 (小于64个)
        for (; i < total; i += 16) {
            __m512 a = _mm512_loadu_ps(&A[i]);
            __m512 b = _mm512_loadu_ps(&B[i]);
            __m512 c = _mm512_add_ps(a, b);
            _mm512_storeu_ps(&C[i], c);
        }
    }
    
  2. 计算数组arr的Sigmoid函数:1/(1+exp(-x))

    参考代码
    #include <immintrin.h>
    
    void vector_sigmoid(float* arr, int n) {
        const __m512 vOne = _mm512_set1_ps(1.0f);
    
        for (int i = 0; i < n; i += 16) {
            __m512 x = _mm512_loadu_ps(arr + i);
    
            // 近似计算: exp(-|x|)
            __m512 exp_neg = _mm512_expm1_ps(_mm512_sub_ps(_mm512_setzero_ps(), x));
            exp_neg = _mm512_add_ps(exp_neg, vOne);  // exp(-x) ≈ 1 + expm1(-x)
    
            // sigmoid = 1/(1+exp(-x))
            __m512 denom = _mm512_add_ps(vOne, exp_neg);
            __m512 result = _mm512_div_ps(vOne, denom);
    
            _mm512_storeu_ps(arr + i, result);
        }
    }
    
  3. 使用向量化实现一个16*16的矩阵乘法。

    参考代码
    #include <iostream>
    #include <cmath>
    #include <immintrin.h>
    
    const int N = 16;
    
    // 基础版本
    void matrix_mul_basic(float A[N][N], float B[N][N], float C[N][N]) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                float sum = 0.0f;
                for (int k = 0; k < N; k++) {
                    sum += A[i][k] * B[k][j];
                }
                C[i][j] = sum;
            }
        }
    }
    
    // AVX512优化版本
    void matrix_mul_avx512(float A[N][N], float B[N][N], float C[N][N]) {
        __m512 c_rows[N];
        for (int i = 0; i < N; i++) {
            c_rows[i] = _mm512_setzero_ps();
        }
    
        for (int k = 0; k < N; k++) {
            __m512 b_row = _mm512_loadu_ps(&B[k][0]);
            for (int i = 0; i < N; i++) {
                __m512 a_broadcast = _mm512_set1_ps(A[i][k]);
                c_rows[i] = _mm512_fmadd_ps(a_broadcast, b_row, c_rows[i]);
            }
        }
    
        for (int i = 0; i < N; i++) {
            _mm512_storeu_ps(&C[i][0], c_rows[i]);
        }
    }
    
    // 验证结果一致性
    bool verify_results(float C1[N][N], float C2[N][N], float epsilon = 1e-5f) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                if (std::fabs(C1[i][j] - C2[i][j]) > epsilon) {
                    std::cerr << "Mismatch at (" << i << "," << j << "): "
                            << C1[i][j] << " vs " << C2[i][j] << std::endl;
                    return false;
                }
            }
        }
        return true;
    }
    
    int main() {
        float A[N][N], B[N][N];
        float C_basic[N][N], C_avx[N][N];
    
        // 初始化矩阵
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) {
                A[i][j] = static_cast<float>(i * N + j);
                B[i][j] = static_cast<float>(i - j);
            }
        }
    
        matrix_mul_basic(A, B, C_basic);
    
        matrix_mul_avx512(A, B, C_avx);
    
        // 验证结果
        if (verify_results(C_basic, C_avx)) {
            std::cout << "Verification PASSED" << std::endl;
        } else {
            std::cout << "Verification FAILED" << std::endl;
        }
    
        return 0;
    }