STL Parallel Algorithms & Execution Policy

개요

C++17은 STL 알고리즘에 실행 정책(Execution Policy)을 추가하여, 기존 순차 알고리즘을 코드 변경 최소화로 병렬화할 수 있게 했습니다. std::sort, std::transform, std::reduce 등 수십 개의 알고리즘이 병렬 실행을 지원합니다.

1. 실행 정책 종류

#include <execution>
#include <algorithm>
#include <vector>

std::vector<int> data(1'000'000);

// 순차 실행 (기존과 동일)
std::sort(std::execution::seq, data.begin(), data.end());

// 병렬 실행 (멀티스레드)
std::sort(std::execution::par, data.begin(), data.end());

// 병렬 + 벡터화 (멀티스레드 + SIMD)
std::sort(std::execution::par_unseq, data.begin(), data.end());

// 벡터화만 (단일 스레드 + SIMD) - C++20
std::sort(std::execution::unseq, data.begin(), data.end());

정책	멀티스레드	SIMD	설명
`seq`	아니오	아니오	기존 순차 실행
`par`	예	아니오	멀티스레드 병렬
`par_unseq`	예	예	병렬 + 벡터화
`unseq`	아니오	예	단일 스레드 벡터화

2. 주요 알고리즘 실전 예제

2.1 std::transform (병렬 변환)

#include <execution>
#include <algorithm>
#include <vector>
#include <cmath>

int main() {
    const std::size_t N = 10'000'000;
    std::vector<double> input(N), output(N);

    // 입력 초기화
    std::iota(input.begin(), input.end(), 0.0);

    // 병렬 변환: 각 원소에 sqrt 적용
    std::transform(
        std::execution::par_unseq,
        input.begin(), input.end(),
        output.begin(),
        [](double x) { return std::sqrt(x); }
    );

    return 0;
}

2.2 std::reduce (병렬 합산)

#include <execution>
#include <numeric>
#include <vector>

int main() {
    std::vector<long long> data(10'000'000);
    std::iota(data.begin(), data.end(), 1LL);

    // std::accumulate는 순차만 지원
    // std::reduce는 병렬 + 교환법칙 성립하는 연산 지원
    long long sum = std::reduce(
        std::execution::par,
        data.begin(), data.end(),
        0LL,
        std::plus<long long>{}
    );

    // 주의: reduce는 연산 순서 보장 안 됨 (부동소수점에서 결과 차이 가능)
    return 0;
}

2.3 std::transform_reduce (맵-리듀스)

#include <execution>
#include <numeric>
#include <vector>

// 내적 (dot product) 병렬 계산
double dot_product(const std::vector<double>& a, const std::vector<double>& b) {
    return std::transform_reduce(
        std::execution::par_unseq,
        a.begin(), a.end(),
        b.begin(),
        0.0,                      // 초기값
        std::plus<double>{},      // 리듀스 연산
        std::multiplies<double>{} // 변환 연산
    );
}

int main() {
    std::vector<double> a = {1.0, 2.0, 3.0, 4.0};
    std::vector<double> b = {5.0, 6.0, 7.0, 8.0};
    double result = dot_product(a, b); // 1*5 + 2*6 + 3*7 + 4*8 = 70
    return 0;
}

2.4 std::for_each (병렬 순회)

#include <execution>
#include <algorithm>
#include <atomic>
#include <vector>

int main() {
    std::vector<int> data(1'000'000, 1);
    std::atomic<long long> sum{0};

    std::for_each(
        std::execution::par,
        data.begin(), data.end(),
        [&sum](int val) {
            sum.fetch_add(val, std::memory_order_relaxed);
        }
    );
    // sum == 1,000,000
    return 0;
}

3. 병렬 정렬 성능 측정

#include <execution>
#include <algorithm>
#include <vector>
#include <random>
#include <chrono>
#include <iostream>

template<typename Policy, typename Container>
auto measure(Policy policy, Container data) {
    auto start = std::chrono::high_resolution_clock::now();
    std::sort(policy, data.begin(), data.end());
    auto end = std::chrono::high_resolution_clock::now();
    return std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
}

int main() {
    const std::size_t N = 50'000'000;
    std::vector<int> data(N);
    std::mt19937 rng(42);
    std::uniform_int_distribution<int> dist(0, 1'000'000);
    std::generate(data.begin(), data.end(), [&]{ return dist(rng); });

    auto t_seq = measure(std::execution::seq, data);
    auto t_par = measure(std::execution::par, data);
    auto t_par_unseq = measure(std::execution::par_unseq, data);

    std::cout << "seq:       " << t_seq << " ms\n";
    std::cout << "par:       " << t_par << " ms\n";
    std::cout << "par_unseq: " << t_par_unseq << " ms\n";
    std::cout << "speedup (par): " << (double)t_seq / t_par << "x\n";

    return 0;
}
// 8코어 기준 예시: seq=2500ms, par=380ms, par_unseq=320ms

4. par_unseq 사용 시 주의사항

par_unseq는 SIMD 벡터화와 멀티스레드를 동시에 활용합니다. 람다 내부에서 다음을 피해야 합니다.

// 잘못된 예 - par_unseq에서 mutex 사용 불가 (벡터화 불가, 데드락 위험)
std::mutex mtx;
std::for_each(std::execution::par_unseq, v.begin(), v.end(),
    [&](int x) {
        std::lock_guard lock(mtx);  // 절대 금지
        shared_value += x;
    }
);

// 올바른 예 - atomic 사용
std::atomic<int> total{0};
std::for_each(std::execution::par_unseq, v.begin(), v.end(),
    [&](int x) {
        total.fetch_add(x, std::memory_order_relaxed);
    }
);

5. 사용 가능한 병렬 알고리즘 목록

std::sort, std::stable_sort, std::partial_sort
std::find, std::find_if, std::count, std::count_if
std::copy, std::copy_if, std::move
std::transform, std::for_each
std::reduce, std::transform_reduce
std::inclusive_scan, std::exclusive_scan
std::fill, std::generate
std::replace, std::replace_if
std::remove, std::remove_if
std::reverse, std::rotate
std::unique, std::merge

6. 컴파일러 및 라이브러리 지원

# GCC: Intel TBB 또는 OpenMP 백엔드 필요
g++ -std=c++17 -O2 -ltbb main.cpp

# MSVC: 내장 병렬 백엔드 (추가 라이브러리 불필요)
cl /std:c++17 /O2 main.cpp

# Clang: LLVM PSTL 또는 TBB 필요
clang++ -std=c++17 -O2 -ltbb main.cpp

7. inclusive_scan / exclusive_scan

#include <numeric>
#include <execution>
#include <vector>
#include <iostream>

std::vector<int> data = {1, 2, 3, 4, 5};
std::vector<int> out(data.size());

// inclusive_scan: 각 위치까지 포함한 누적합 (C++17)
std::inclusive_scan(std::execution::par, data.begin(), data.end(), out.begin());
// out = {1, 3, 6, 10, 15}

// exclusive_scan: 현재 위치를 제외한 누적합
std::exclusive_scan(std::execution::par, data.begin(), data.end(), out.begin(), 0);
// out = {0, 1, 3, 6, 10}

// transform_inclusive_scan: 변환 후 스캔
std::transform_inclusive_scan(std::execution::par,
    data.begin(), data.end(), out.begin(),
    std::plus<>{},
    [](int x) { return x * x; }); // 제곱의 누적합
// out = {1, 5, 14, 30, 55}

8. 예외 처리 동작

병렬 알고리즘에서 람다 내부에서 예외가 발생하면 std::terminate()가 호출됩니다. std::execution::seq에서만 예외가 전파됩니다.

std::vector<int> data = {1, 0, 2};

// seq: 예외 전파됨
try {
    std::for_each(std::execution::seq, data.begin(), data.end(),
        [](int x) { if (x == 0) throw std::runtime_error("0 발견"); });
} catch (const std::exception& e) {
    std::cout << e.what(); // "0 발견"
}

// par: 예외 발생 시 std::terminate() 호출
// std::for_each(std::execution::par, data.begin(), data.end(),
//     [](int x) { if (x == 0) throw std::runtime_error("위험!"); });
// 위 코드는 프로그램을 종료시킴

// 안전한 방법: atomic 플래그로 오류 처리
std::atomic<bool> hasError{false};
std::for_each(std::execution::par, data.begin(), data.end(),
    [&](int x) {
        if (x == 0) hasError.store(true, std::memory_order_relaxed);
    });

9. 병렬화 효과적인 데이터 크기 기준

// 병렬화 오버헤드 vs 이득
// 일반적으로 원소 수가 10,000 이상일 때 이득이 오버헤드를 초과
// 원소당 작업량이 클수록 임계점이 낮아짐

template<typename Container>
void smart_sort(Container& c) {
    if (c.size() < 10000) {
        std::sort(c.begin(), c.end());           // 소규모: 순차
    } else {
        std::sort(std::execution::par, c.begin(), c.end()); // 대규모: 병렬
    }
}

데이터 크기	권장 정책
< 1,000	`seq` (병렬 오버헤드 > 이득)
1,000~10,000	벤치마크 후 결정
> 10,000	`par` 또는 `par_unseq`

정리

STL 병렬 알고리즘은 기존 코드에서 실행 정책 인자 하나만 추가하면 멀티코어를 활용할 수 있는 우아한 방법입니다. 단, 람다 내부에서 공유 상태 접근에는 thread-safe한 방법을 사용해야 하며, par_unseq에서는 뮤텍스를 완전히 배제해야 합니다. 데이터 크기가 수만 건 이상일 때 성능 이득이 뚜렷하게 나타납니다.