Code

#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <random>

#include <omp.h>

/**
 * @brief Estimates pi by Monte Carlo sampling of the unit square.
 *
 * Draws @p total_points pairs (x, y) with each coordinate taken from
 * std::uniform_real_distribution<double>(0.0, 1.0) and counts how many satisfy
 * x*x + y*y <= 1. The fraction of hits approximates pi / 4.
 *
 * The sampling loop is an OpenMP worksharing loop: every thread owns its own
 * generator and the hit counts are combined through an OpenMP reduction. When
 * the file is compiled without OpenMP the pragmas are ignored and the loop
 * runs serially with a single generator.
 *
 * @param total_points Number of random points to draw.
 * @return 4 * hits / total_points, or 0 when @p total_points is 0.
 */
long double estimate_pi_parallel(uint64_t total_points) {
    // No samples means no estimate; return early instead of evaluating 0 / 0.
    if (total_points == 0) {
        return 0.0L;
    }

    // Seed ingredients are gathered once, before the parallel region, so that
    // std::random_device is not constructed inside an OpenMP thread.
    const uint32_t run_entropy = static_cast<uint32_t>(std::random_device{}());
    const uint32_t wall_clock = static_cast<uint32_t>(std::time(nullptr));

    uint64_t inside_circle = 0;

    #pragma omp parallel
    {
#ifdef _OPENMP
        const uint32_t thread_id = static_cast<uint32_t>(omp_get_thread_num());
#else
        const uint32_t thread_id = 0;  // serial build: only one random stream
#endif
        // Mixing entropy, time and thread id gives each thread a distinct seed
        // and keeps two runs started within the same second from repeating.
        std::seed_seq seed_material{run_entropy, wall_clock, thread_id};
        std::mt19937 gen(seed_material);
        std::uniform_real_distribution<double> dist(0.0, 1.0);

        #pragma omp for schedule(static) reduction(+:inside_circle)
        for (uint64_t i = 0; i < total_points; ++i) {
            const double x = dist(gen);
            const double y = dist(gen);
            if (x * x + y * y <= 1.0) {
                ++inside_circle;
            }
        }
    }

    return 4.0L * inside_circle / total_points;
}

/**
 * @brief Command-line entry point: prints a Monte Carlo estimate of pi.
 *
 * Expects one argument, the number of random points to draw, written as a
 * plain positive decimal integer. The estimate is printed with ten digits
 * after the decimal point.
 *
 * @param argc Number of command-line arguments.
 * @param argv Argument vector; argv[1] is the point count.
 * @return 0 on success, 1 when the argument is missing or invalid.
 */
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <total_points>\n";
        return 1;
    }

    const char* const text = argv[1];
    // strtoull skips leading whitespace and accepts a sign (a '-' wraps the
    // value around), so insist that the argument starts with a digit.
    const bool starts_with_digit = text[0] >= '0' && text[0] <= '9';

    char* parse_end = nullptr;
    errno = 0;  // strtoull reports overflow only through errno
    const unsigned long long parsed = std::strtoull(text, &parse_end, 10);
    const bool fully_parsed = starts_with_digit && *parse_end == '\0';

    if (!fully_parsed || errno == ERANGE || parsed == 0) {
        std::cerr << "Error: <total_points> must be a positive integer, got '"
                  << text << "'\n";
        return 1;
    }

    const uint64_t total_points = parsed;
    const long double result = estimate_pi_parallel(total_points);
    std::cout << "Estimated Pi: " << std::fixed << std::setprecision(10) << result << "\n";

    return 0;
}
#include <iostream>
#include <random>
#include <cstdint>
#include <ctime>
#include <omp.h>
#include <iomanip>

/**
 * @brief Estimates pi by Monte Carlo sampling of the unit square.
 *
 * Draws @p total_points pairs (x, y) with each coordinate taken from
 * std::uniform_real_distribution<double>(0.0, 1.0) and counts how many satisfy
 * x*x + y*y <= 1. The fraction of hits approximates pi / 4.
 *
 * The sampling loop is an OpenMP worksharing loop: every thread owns its own
 * generator and the hit counts are combined through an OpenMP reduction. When
 * the file is compiled without OpenMP the pragmas are ignored and the loop
 * runs serially with a single generator.
 *
 * @param total_points Number of random points to draw.
 * @return 4 * hits / total_points, or 0 when @p total_points is 0.
 */
long double estimate_pi_parallel(uint64_t total_points) {
    // No samples means no estimate; return early instead of evaluating 0 / 0.
    if (total_points == 0) {
        return 0.0L;
    }

    // Seed ingredients are gathered once, before the parallel region, so that
    // std::random_device is not constructed inside an OpenMP thread.
    const uint32_t run_entropy = static_cast<uint32_t>(std::random_device{}());
    const uint32_t wall_clock = static_cast<uint32_t>(std::time(nullptr));

    uint64_t inside_circle = 0;

    #pragma omp parallel
    {
#ifdef _OPENMP
        const uint32_t thread_id = static_cast<uint32_t>(omp_get_thread_num());
#else
        const uint32_t thread_id = 0;  // serial build: only one random stream
#endif
        // Mixing entropy, time and thread id gives each thread a distinct seed
        // and keeps two runs started within the same second from repeating.
        std::seed_seq seed_material{run_entropy, wall_clock, thread_id};
        std::mt19937 gen(seed_material);
        std::uniform_real_distribution<double> dist(0.0, 1.0);

        #pragma omp for schedule(static) reduction(+:inside_circle)
        for (uint64_t i = 0; i < total_points; ++i) {
            const double x = dist(gen);
            const double y = dist(gen);
            if (x * x + y * y <= 1.0) {
                ++inside_circle;
            }
        }
    }

    return 4.0L * inside_circle / total_points;
}

/**
 * @brief Command-line entry point: prints a Monte Carlo estimate of pi.
 *
 * Expects one argument, the number of random points to draw, written as a
 * plain positive decimal integer. The estimate is printed with ten digits
 * after the decimal point.
 *
 * @param argc Number of command-line arguments.
 * @param argv Argument vector; argv[1] is the point count.
 * @return 0 on success, 1 when the argument is missing or invalid.
 */
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <total_points>\n";
        return 1;
    }

    const char* const text = argv[1];
    // strtoull skips leading whitespace and accepts a sign (a '-' wraps the
    // value around), so insist that the argument starts with a digit.
    const bool starts_with_digit = text[0] >= '0' && text[0] <= '9';

    char* parse_end = nullptr;
    errno = 0;  // strtoull reports overflow only through errno
    const unsigned long long parsed = std::strtoull(text, &parse_end, 10);
    const bool fully_parsed = starts_with_digit && *parse_end == '\0';

    if (!fully_parsed || errno == ERANGE || parsed == 0) {
        std::cerr << "Error: <total_points> must be a positive integer, got '"
                  << text << "'\n";
        return 1;
    }

    const uint64_t total_points = parsed;
    const long double result = estimate_pi_parallel(total_points);
    std::cout << "Estimated Pi: " << std::fixed << std::setprecision(10) << result << "\n";

    return 0;
}
#include <iostream>
#include <random>
#include <cstdint>
#include <ctime>
#include <omp.h>
#include <iomanip>

/**
 * @brief Estimates pi by Monte Carlo sampling of the unit square.
 *
 * Draws @p total_points pairs (x, y) with each coordinate taken from
 * std::uniform_real_distribution<double>(0.0, 1.0) and counts how many satisfy
 * x*x + y*y <= 1. The fraction of hits approximates pi / 4.
 *
 * The sampling loop is an OpenMP worksharing loop: every thread owns its own
 * generator and the hit counts are combined through an OpenMP reduction. When
 * the file is compiled without OpenMP the pragmas are ignored and the loop
 * runs serially with a single generator.
 *
 * @param total_points Number of random points to draw.
 * @return 4 * hits / total_points, or 0 when @p total_points is 0.
 */
long double estimate_pi_parallel(uint64_t total_points) {
    // No samples means no estimate; return early instead of evaluating 0 / 0.
    if (total_points == 0) {
        return 0.0L;
    }

    // Seed ingredients are gathered once, before the parallel region, so that
    // std::random_device is not constructed inside an OpenMP thread.
    const uint32_t run_entropy = static_cast<uint32_t>(std::random_device{}());
    const uint32_t wall_clock = static_cast<uint32_t>(std::time(nullptr));

    uint64_t inside_circle = 0;

    #pragma omp parallel
    {
#ifdef _OPENMP
        const uint32_t thread_id = static_cast<uint32_t>(omp_get_thread_num());
#else
        const uint32_t thread_id = 0;  // serial build: only one random stream
#endif
        // Mixing entropy, time and thread id gives each thread a distinct seed
        // and keeps two runs started within the same second from repeating.
        std::seed_seq seed_material{run_entropy, wall_clock, thread_id};
        std::mt19937 gen(seed_material);
        std::uniform_real_distribution<double> dist(0.0, 1.0);

        #pragma omp for schedule(static) reduction(+:inside_circle)
        for (uint64_t i = 0; i < total_points; ++i) {
            const double x = dist(gen);
            const double y = dist(gen);
            if (x * x + y * y <= 1.0) {
                ++inside_circle;
            }
        }
    }

    return 4.0L * inside_circle / total_points;
}

/**
 * @brief Command-line entry point: prints a Monte Carlo estimate of pi.
 *
 * Expects one argument, the number of random points to draw, written as a
 * plain positive decimal integer. The estimate is printed with ten digits
 * after the decimal point.
 *
 * @param argc Number of command-line arguments.
 * @param argv Argument vector; argv[1] is the point count.
 * @return 0 on success, 1 when the argument is missing or invalid.
 */
int main(int argc, char** argv) {
    if (argc < 2) {
        std::cerr << "Usage: " << argv[0] << " <total_points>\n";
        return 1;
    }

    const char* const text = argv[1];
    // strtoull skips leading whitespace and accepts a sign (a '-' wraps the
    // value around), so insist that the argument starts with a digit.
    const bool starts_with_digit = text[0] >= '0' && text[0] <= '9';

    char* parse_end = nullptr;
    errno = 0;  // strtoull reports overflow only through errno
    const unsigned long long parsed = std::strtoull(text, &parse_end, 10);
    const bool fully_parsed = starts_with_digit && *parse_end == '\0';

    if (!fully_parsed || errno == ERANGE || parsed == 0) {
        std::cerr << "Error: <total_points> must be a positive integer, got '"
                  << text << "'\n";
        return 1;
    }

    const uint64_t total_points = parsed;
    const long double result = estimate_pi_parallel(total_points);
    std::cout << "Estimated Pi: " << std::fixed << std::setprecision(10) << result << "\n";

    return 0;
}

摘要

本報告以蒙地卡羅法估算 $\pi$ 的平行化程式為例,比較不同編譯器(GCC、ICPX)與編譯旗標下的執行時間差異,並使用 VTune 觀察 CPU 使用率等指標。

Elapsed Time and Effective CPU Utilization

GCC

GCC Default

image.png

GCC -O2

image.png

GCC -O3 -march=native

image.png

ICPX

ICPX Default

image.png

ICPX -O2

image.png

ICPX -O3 -xHost