Answers
About these answers
If you can, try to write the programs and work through the problems yourself before reading the answers; doing so will give you a much deeper understanding of the material.
OpenMP Lab Answers
- Write a Hello World program that prints in parallel with OpenMP, including each thread's id. Run it several times and observe how the output changes.

Reference code:
#include <omp.h>
#include <stdio.h>

int main() {
    #pragma omp parallel
    {
        int tid = omp_get_thread_num();
        printf("Hello from thread %d\n", tid);
    }
    return 0;
}
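The team size can also be requested explicitly. A minimal variant, not part of the original answer, assuming an OpenMP-capable compiler (e.g. gcc with -fopenmp); it uses the num_threads clause and reports the actual team size:

#include <omp.h>
#include <stdio.h>

int main() {
    // Request a team of 4 threads; the runtime may still provide fewer.
    #pragma omp parallel num_threads(4)
    {
        int tid = omp_get_thread_num();
        int nthreads = omp_get_num_threads();  // actual team size
        printf("Hello from thread %d of %d\n", tid, nthreads);
    }
    return 0;
}

Without the clause, the team size falls back to the OMP_NUM_THREADS environment variable or the number of available cores, which is why repeated runs interleave the output lines differently.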
- Write a program that performs a matrix computation and add OpenMP parallelism to it. Compare the computation time before and after enabling OpenMP: what changes?

Reference code (uncomment the pragma in front of the timed loop to enable OpenMP):
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>

#define N 600

// static: three 600x600 float matrices (~4.3 MB in total) are too large
// to place safely on the stack
static float A[N][N], B[N][N], C[N][N];

int main() {
    #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i][j] = 1.0f;
            B[i][j] = 1.0f;
            C[i][j] = 0.0f;
        }
    }

    double begin = omp_get_wtime();
    // #pragma omp parallel for
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            C[i][j] = 0;
            for (int k = 0; k < N; k++) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
    double elapsed = omp_get_wtime() - begin;
    printf("Computation time: %.6f s\n", elapsed);
    return 0;
}
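Beyond enabling the pragma, the loop order itself matters: with k innermost, B is traversed column by column and cache behavior suffers. A hedged sketch of the common i-k-j loop interchange, not part of the original answer, written as a drop-in replacement for the timed loop above (it relies on C being zeroed beforehand, as the answer does):

// i-k-j order: the innermost loop walks B and C along contiguous rows
#pragma omp parallel for
for (int i = 0; i < N; i++) {
    for (int k = 0; k < N; k++) {
        float a = A[i][k];
        for (int j = 0; j < N; j++) {
            C[i][j] += a * B[k][j];
        }
    }
}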
- Omitted.
- Estimate \(\pi\) with a Monte Carlo method. Implement a serial version first, then turn it into an OpenMP parallel version, and compare the running times for a very large number of trials.

Reference code:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NUM_POINTS 100000000

int main() {
    printf("Total points = %d\n", NUM_POINTS);
    srand(time(NULL));

    double start_time, end_time;
    double pi_estimate;
    long num_inside;

    // ============ Serial version ============
    start_time = omp_get_wtime();
    num_inside = 0;
    for (long i = 0; i < NUM_POINTS; i++) {
        double x = (double)rand() / RAND_MAX;
        double y = (double)rand() / RAND_MAX;
        if (x*x + y*y <= 1.0) {
            num_inside++;
        }
    }
    pi_estimate = 4.0 * num_inside / NUM_POINTS;
    end_time = omp_get_wtime();
    printf("\nSerial version:\n");
    printf("pi estimate = %.8f\n", pi_estimate);
    printf("time = %.6f s\n", end_time - start_time);

    // ============ Parallel version ============
    num_inside = 0;
    start_time = omp_get_wtime();
    #pragma omp parallel
    {
        // each thread gets its own seed; rand() is not thread-safe,
        // so the reentrant rand_r() is used instead
        unsigned int seed = time(NULL) ^ omp_get_thread_num();
        long local_inside = 0;
        #pragma omp for
        for (long i = 0; i < NUM_POINTS; i++) {
            double x = (double)rand_r(&seed) / RAND_MAX;
            double y = (double)rand_r(&seed) / RAND_MAX;
            if (x*x + y*y <= 1.0) {
                local_inside++;
            }
        }
        // critical section protects the shared counter
        #pragma omp critical
        {
            num_inside += local_inside;
        }
    }
    pi_estimate = 4.0 * num_inside / NUM_POINTS;
    end_time = omp_get_wtime();
    printf("\nParallel version:\n");
    printf("pi estimate = %.8f\n", pi_estimate);
    printf("time = %.6f s\n", end_time - start_time);

    // ============ Parallel version with reduction ============
    num_inside = 0;
    start_time = omp_get_wtime();
    #pragma omp parallel reduction(+:num_inside)
    {
        unsigned int seed = time(NULL) ^ omp_get_thread_num();
        #pragma omp for
        for (long i = 0; i < NUM_POINTS; i++) {
            double x = (double)rand_r(&seed) / RAND_MAX;
            double y = (double)rand_r(&seed) / RAND_MAX;
            if (x*x + y*y <= 1.0) {
                num_inside++;
            }
        }
    }
    pi_estimate = 4.0 * num_inside / NUM_POINTS;
    end_time = omp_get_wtime();
    printf("\nWith reduction:\n");
    printf("pi estimate = %.8f\n", pi_estimate);
    printf("time = %.6f s\n", end_time - start_time);

    return 0;
}
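Between the critical-section and reduction versions there is a third option the answer does not show: a single atomic update per thread. A sketch, written as a drop-in for the critical block above:

// one atomic add per thread instead of a critical section;
// cheaper than critical for a single scalar update
#pragma omp atomic
num_inside += local_inside;

Since each thread performs only one shared update here, all three variants perform similarly; reduction wins mainly when the update sits inside a hot loop.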
MPI Lab Answers
- Write an MPI Hello World program and print each process's ID.

Reference code:
#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv) {
    // initialize the MPI environment
    MPI_Init(&argc, &argv);

    int world_size;                              // total number of processes
    int world_rank;                              // rank (ID) of this process
    char processor_name[MPI_MAX_PROCESSOR_NAME]; // processor name
    int name_len;                                // name length

    // get the total process count
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    // get this process's rank
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    // get the processor name
    MPI_Get_processor_name(processor_name, &name_len);

    printf("Hello world from processor %s, rank %d out of %d processors\n",
           processor_name, world_rank, world_size);

    // clean up the MPI environment
    MPI_Finalize();
    return 0;
}
- Send a piece of data from process 0 to process 1.

Reference code:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    if (world_size < 2) {
        if (world_rank == 0) {
            printf("At least 2 processes are required\n");
        }
        MPI_Finalize();
        return 1;
    }

    if (world_rank == 0) {
        int send_value = 42;
        MPI_Send(&send_value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
        printf("Process 0 sent value %d to process 1\n", send_value);
    } else if (world_rank == 1) {
        int recv_value;
        MPI_Recv(&recv_value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("Process 1 received value %d\n", recv_value);
    }

    MPI_Finalize();
    return 0;
}
- Broadcast a chunk of data and print the values each process receives.

Reference code:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    int* data = NULL;
    int data_size;

    if (world_rank == 0) {
        // the root process initializes the data
        data_size = 5;
        data = (int*)malloc(data_size * sizeof(int));
        for (int i = 0; i < data_size; i++) data[i] = i + 1;
        printf("Process 0 broadcasts: ");
        for (int i = 0; i < data_size; i++) printf("%d ", data[i]);
        printf("\n");
    }

    // broadcast the data size first, so every process can allocate a buffer
    MPI_Bcast(&data_size, 1, MPI_INT, 0, MPI_COMM_WORLD);

    // allocate the buffer on the non-root processes
    if (world_rank != 0) {
        data = (int*)malloc(data_size * sizeof(int));
    }

    // broadcast the actual data
    MPI_Bcast(data, data_size, MPI_INT, 0, MPI_COMM_WORLD);

    // every process prints what it received
    printf("Process %d received: ", world_rank);
    for (int i = 0; i < data_size; i++) printf("%d ", data[i]);
    printf("\n");

    free(data);
    MPI_Finalize();
    return 0;
}
- Write a program that computes the dot product of two vectors and parallelize it with MPI. Hint: use MPI_Scatter and MPI_Reduce.

Reference code (assumes the vector length is divisible by the process count; see the sketch after the code for the general case):
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int vector_size = 1000000;
    const int local_size = vector_size / size;  // assumes divisibility

    float *local_A = (float*)malloc(local_size * sizeof(float));
    float *local_B = (float*)malloc(local_size * sizeof(float));
    float *full_A = NULL;
    float *full_B = NULL;

    if (rank == 0) {
        full_A = (float*)malloc(vector_size * sizeof(float));
        full_B = (float*)malloc(vector_size * sizeof(float));
        // initialize the full vectors with a fixed seed (for reproducibility)
        srand(12345);
        for (int i = 0; i < vector_size; i++) {
            full_A[i] = (float)rand() / RAND_MAX;
            full_B[i] = (float)rand() / RAND_MAX;
        }
    }

    // ============ Parallel computation ============
    MPI_Scatter(full_A, local_size, MPI_FLOAT, local_A, local_size, MPI_FLOAT,
                0, MPI_COMM_WORLD);
    MPI_Scatter(full_B, local_size, MPI_FLOAT, local_B, local_size, MPI_FLOAT,
                0, MPI_COMM_WORLD);

    double local_dot = 0.0;
    for (int i = 0; i < local_size; i++) {
        local_dot += (double)local_A[i] * (double)local_B[i];
    }

    double global_dot;
    MPI_Reduce(&local_dot, &global_dot, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    // verify the result on the root process
    if (rank == 0) {
        // ============ Serial computation ============
        double serial_dot = 0.0;
        for (int i = 0; i < vector_size; i++) {
            serial_dot += (double)full_A[i] * (double)full_B[i];
        }
        double abs_error = fabs(global_dot - serial_dot);
        printf("Parallel dot product: %.16f\n", global_dot);
        printf("Serial dot product:   %.16f\n", serial_dot);
        printf("Absolute error:       %.6e\n", abs_error);
        free(full_A);
        free(full_B);
    }

    free(local_A);
    free(local_B);
    MPI_Finalize();
    return 0;
}
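When vector_size is not divisible by the process count, the code above silently drops the tail elements. A hedged sketch of the general case with MPI_Scatterv; the helper name scatter_uneven is hypothetical, not part of the original answer:

#include <mpi.h>
#include <stdlib.h>

/* Every rank computes the same counts/displs arrays, so each knows both
 * the layout of the full vector and its own share. The first
 * (vector_size % size) ranks receive one extra element. */
float* scatter_uneven(float* full, int vector_size, int rank, int size,
                      int* my_count) {
    int* counts = (int*)malloc(size * sizeof(int));
    int* displs = (int*)malloc(size * sizeof(int));
    int base = vector_size / size;
    int rem = vector_size % size;
    int offset = 0;
    for (int r = 0; r < size; r++) {
        counts[r] = base + (r < rem ? 1 : 0);
        displs[r] = offset;
        offset += counts[r];
    }
    float* local = (float*)malloc(counts[rank] * sizeof(float));
    MPI_Scatterv(full, counts, displs, MPI_FLOAT,
                 local, counts[rank], MPI_FLOAT, 0, MPI_COMM_WORLD);
    *my_count = counts[rank];
    free(counts);
    free(displs);
    return local;
}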
- Estimate \(\pi\) with a Monte Carlo method, parallelized with MPI.

Reference code:
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int world_size, world_rank;
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    const long total_samples = 100000000;
    // spread the remainder over the low ranks so the per-process counts
    // sum to total_samples even when it is not divisible by world_size
    long samples_per_proc = total_samples / world_size;
    if (world_rank < total_samples % world_size) {
        samples_per_proc++;
    }

    // a different seed per process
    srand(time(NULL) + world_rank);

    // local count
    long local_count = 0;
    for (long i = 0; i < samples_per_proc; i++) {
        double x = (double)rand() / RAND_MAX * 2.0 - 1.0; // [-1, 1]
        double y = (double)rand() / RAND_MAX * 2.0 - 1.0; // [-1, 1]
        if (x*x + y*y <= 1.0) {
            local_count++;
        }
    }

    long global_count;
    MPI_Reduce(&local_count, &global_count, 1, MPI_LONG, MPI_SUM, 0, MPI_COMM_WORLD);

    // the root process computes the result
    if (world_rank == 0) {
        double pi_estimate = 4.0 * (double)global_count / total_samples;
        printf("Estimated pi = %.10f\n", pi_estimate);
        printf("Actual pi    = %.10f\n", M_PI);
        printf("Samples      = %ld\n", total_samples);
    }

    MPI_Finalize();
    return 0;
}
mpi4py Lab Answers
- Hello, mpi4py.

Reference code:
from mpi4py import MPI

def main():
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    # every process prints its own information
    print(f"Hello, MPI4Py! I'm process {rank} of {size}")

if __name__ == "__main__":
    main()
- Point-to-point communication in mpi4py (ping-pong).

Reference code:
from mpi4py import MPI
import time

def main():
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    # make sure there are at least 2 processes
    if size < 2:
        if rank == 0:
            print("Error: Need at least 2 processes")
        return

    MAX_ITER = 3  # number of ping-pong rounds
    status = MPI.Status()

    if rank == 0:
        # process 0: send ping, receive pong
        for i in range(MAX_ITER):
            comm.send("ping", dest=1, tag=1)
            print(f"[{rank}] Sent 'ping' to 1")
            msg = comm.recv(source=1, tag=2, status=status)
            print(f"[{rank}] Received '{msg}' from 1")
            time.sleep(1)
    elif rank == 1:
        # process 1: receive the ping first, then reply with pong;
        # replying only after the ping arrives keeps the exchange well
        # ordered instead of relying on MPI's internal send buffering
        for i in range(MAX_ITER):
            msg = comm.recv(source=0, tag=1, status=status)
            print(f"[{rank}] Received '{msg}' from 0")
            comm.send("pong", dest=0, tag=2)
            print(f"[{rank}] Sent 'pong' to 0")
            time.sleep(1)

if __name__ == "__main__":
    main()
- Collective communication in mpi4py.

Reference code (meant to be launched with 4 processes, e.g. mpiexec -n 4; the expected output is shown after the code):
from mpi4py import MPI
import numpy as np

def mpi_np_gather(send_buffer: np.ndarray) -> np.ndarray:
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    # local number of rows and columns
    rows, cols = send_buffer.shape

    # collect the row count of every process
    all_rows = comm.allgather(rows)

    # element counts per process (rows x cols)
    recv_counts = [r * cols for r in all_rows]

    # displacements into the receive buffer
    displacements = [0]
    for i in range(1, size):
        displacements.append(displacements[i-1] + recv_counts[i-1])

    # allocate the receive buffer
    total_elements = sum(recv_counts)
    recv_buffer = np.empty(total_elements, dtype=send_buffer.dtype)

    # perform the Allgatherv
    comm.Allgatherv(
        send_buffer.flatten(),                       # local data, flattened to 1-D
        [recv_buffer, (recv_counts, displacements)]  # receive buffer spec
    )

    # reshape the flat receive buffer back to 2-D
    return recv_buffer.reshape(-1, cols)

if __name__ == "__main__":
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    if rank == 0:
        data = np.array([[0, 1]])
    elif rank == 1:
        data = np.array([[1, 0], [2, 3]])
    elif rank == 2:
        data = 7 * np.ones((3, 2), dtype=int)
    elif rank == 3:
        data = np.array([[114, 514]])
    else:
        # fallback so the demo does not crash with more than 4 processes
        data = np.zeros((1, 2), dtype=int)
    result = mpi_np_gather(data)
    if rank == 3:
        print("Gathered result:")
        print(result)
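With exactly 4 processes the result is deterministic: the per-rank blocks are concatenated in rank order. On a typical build where the default integer dtype is 64-bit, rank 3 should print:

Gathered result:
[[  0   1]
 [  1   0]
 [  2   3]
 [  7   7]
 [  7   7]
 [  7   7]
 [114 514]]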
Vectorization Lab Answers
- The "code hints" section gives a vectorized matrix-addition program; apply a 4x loop unrolling to it.

Reference code:
#include <immintrin.h>

void matrix_add_unrolled(const float* A, const float* B, float* C, int M, int N) {
    int total = M * N;
    int i = 0;

    // main loop: 4x unrolled, 64 floats per iteration
    for (; i <= total - 64; i += 64) {
        __m512 a0 = _mm512_loadu_ps(&A[i]);
        __m512 a1 = _mm512_loadu_ps(&A[i + 16]);
        __m512 a2 = _mm512_loadu_ps(&A[i + 32]);
        __m512 a3 = _mm512_loadu_ps(&A[i + 48]);

        __m512 b0 = _mm512_loadu_ps(&B[i]);
        __m512 b1 = _mm512_loadu_ps(&B[i + 16]);
        __m512 b2 = _mm512_loadu_ps(&B[i + 32]);
        __m512 b3 = _mm512_loadu_ps(&B[i + 48]);

        __m512 c0 = _mm512_add_ps(a0, b0);
        __m512 c1 = _mm512_add_ps(a1, b1);
        __m512 c2 = _mm512_add_ps(a2, b2);
        __m512 c3 = _mm512_add_ps(a3, b3);

        _mm512_storeu_ps(&C[i], c0);
        _mm512_storeu_ps(&C[i + 16], c1);
        _mm512_storeu_ps(&C[i + 32], c2);
        _mm512_storeu_ps(&C[i + 48], c3);
    }

    // remaining full 16-float blocks (fewer than 64 elements left)
    for (; i <= total - 16; i += 16) {
        __m512 a = _mm512_loadu_ps(&A[i]);
        __m512 b = _mm512_loadu_ps(&B[i]);
        _mm512_storeu_ps(&C[i], _mm512_add_ps(a, b));
    }

    // scalar tail for the last (total % 16) elements, so the vector loops
    // never read or write past the end of the arrays
    for (; i < total; i++) {
        C[i] = A[i] + B[i];
    }
}
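The scalar tail can also be handled in a single masked AVX-512 iteration. A hedged sketch; the helper name add_tail_masked is not from the original answer:

#include <immintrin.h>

// Processes the final (total % 16) floats of C = A + B with one masked
// load/store pair instead of a scalar loop. Assumes i is where the
// 16-wide loops stopped, so 0 <= total - i < 16.
static void add_tail_masked(const float* A, const float* B, float* C,
                            int i, int total) {
    int rem = total - i;
    if (rem <= 0) return;
    __mmask16 m = (__mmask16)((1u << rem) - 1);  // enable the low rem lanes
    __m512 a = _mm512_maskz_loadu_ps(m, &A[i]);
    __m512 b = _mm512_maskz_loadu_ps(m, &B[i]);
    _mm512_mask_storeu_ps(&C[i], m, _mm512_add_ps(a, b));
}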
- Compute the Sigmoid function 1/(1+exp(-x)) of an array arr.

Reference code (note: _mm512_expm1_ps is an SVML intrinsic, available with the Intel compilers but not with plain gcc/clang):
#include <immintrin.h>

void vector_sigmoid(float* arr, int n) {
    const __m512 vOne = _mm512_set1_ps(1.0f);
    for (int i = 0; i < n; i += 16) {
        // mask off the lanes past the end of the array on the last iteration
        __mmask16 m = (n - i >= 16) ? (__mmask16)0xFFFF
                                    : (__mmask16)((1u << (n - i)) - 1);
        __m512 x = _mm512_maskz_loadu_ps(m, arr + i);

        // exp(-x) = 1 + expm1(-x); expm1 is the more accurate form near zero
        __m512 exp_neg = _mm512_expm1_ps(_mm512_sub_ps(_mm512_setzero_ps(), x));
        exp_neg = _mm512_add_ps(exp_neg, vOne);

        // sigmoid = 1 / (1 + exp(-x))
        __m512 denom = _mm512_add_ps(vOne, exp_neg);
        __m512 result = _mm512_div_ps(vOne, denom);

        _mm512_mask_storeu_ps(arr + i, m, result);
    }
}
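Since the SVML intrinsic limits portability, here is a hedged scalar fallback (scalar_sigmoid is a hypothetical name, not part of the original answer):

#include <math.h>

// Portable fallback for toolchains without SVML (e.g. plain gcc/clang):
// the same sigmoid written scalarly; at -O2 a modern compiler will often
// auto-vectorize this loop on its own.
void scalar_sigmoid(float* arr, int n) {
    for (int i = 0; i < n; i++) {
        arr[i] = 1.0f / (1.0f + expf(-arr[i]));
    }
}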
- Implement a 16x16 matrix multiplication using vectorization.

Reference code:
#include <iostream>
#include <cmath>
#include <immintrin.h>

const int N = 16;

// baseline version
void matrix_mul_basic(float A[N][N], float B[N][N], float C[N][N]) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i][k] * B[k][j];
            }
            C[i][j] = sum;
        }
    }
}

// AVX-512 version: each 16-float row of C fits exactly in one 512-bit
// register. Row i of C is accumulated as sum over k of A[i][k] * B[k][:],
// i.e. one broadcast + FMA per (i, k) pair, with no horizontal reduction.
void matrix_mul_avx512(float A[N][N], float B[N][N], float C[N][N]) {
    __m512 c_rows[N];
    for (int i = 0; i < N; i++) {
        c_rows[i] = _mm512_setzero_ps();
    }
    for (int k = 0; k < N; k++) {
        __m512 b_row = _mm512_loadu_ps(&B[k][0]);
        for (int i = 0; i < N; i++) {
            __m512 a_broadcast = _mm512_set1_ps(A[i][k]);
            c_rows[i] = _mm512_fmadd_ps(a_broadcast, b_row, c_rows[i]);
        }
    }
    for (int i = 0; i < N; i++) {
        _mm512_storeu_ps(&C[i][0], c_rows[i]);
    }
}

// check that both versions agree
bool verify_results(float C1[N][N], float C2[N][N], float epsilon = 1e-5f) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            if (std::fabs(C1[i][j] - C2[i][j]) > epsilon) {
                std::cerr << "Mismatch at (" << i << "," << j << "): "
                          << C1[i][j] << " vs " << C2[i][j] << std::endl;
                return false;
            }
        }
    }
    return true;
}

int main() {
    float A[N][N], B[N][N];
    float C_basic[N][N], C_avx[N][N];

    // initialize the matrices
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            A[i][j] = static_cast<float>(i * N + j);
            B[i][j] = static_cast<float>(i - j);
        }
    }

    matrix_mul_basic(A, B, C_basic);
    matrix_mul_avx512(A, B, C_avx);

    // verify the results
    if (verify_results(C_basic, C_avx)) {
        std::cout << "Verification PASSED" << std::endl;
    } else {
        std::cout << "Verification FAILED" << std::endl;
    }
    return 0;
}