In C++11, this:
const std::vector<int>& f() {
    static const std::vector<int> x { 1, 2, 3 };
    return x;
}
is thread-safe. However, is there an extra penalty on calls after the first one (i.e., after initialization) because of this additional thread-safety guarantee? I am wondering whether the function will be slower than one using a global variable, because it has to acquire a mutex to check whether it has been initialized by another thread every time it is called, or something along those lines.
"The best intuition to develop is 'I should measure this.'" So let's find out:
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <type_traits>
#include <vector>

namespace {

class timer {
    using hrc = std::chrono::high_resolution_clock;
    hrc::time_point start;

    static hrc::time_point now() {
        // Prevent memory operations from reordering across the
        // time measurement. This is likely overkill, needs more
        // research to determine the correct fencing.
        std::atomic_thread_fence(std::memory_order_seq_cst);
        auto t = hrc::now();
        std::atomic_thread_fence(std::memory_order_seq_cst);
        return t;
    }

public:
    timer() : start(now()) {}

    hrc::duration elapsed() const {
        return now() - start;
    }

    template <typename Duration>
    typename Duration::rep elapsed() const {
        return std::chrono::duration_cast<Duration>(elapsed()).count();
    }

    template <typename Rep, typename Period>
    Rep elapsed() const {
        return elapsed<std::chrono::duration<Rep, Period>>();
    }
};

const std::vector<int>& f() {
    static const auto x = std::vector<int>{ 1, 2, 3 };
    return x;
}

static const auto y = std::vector<int>{ 1, 2, 3 };
const std::vector<int>& g() {
    return y;
}

const unsigned long long n_iterations = 500000000;

template <typename F>
void test_one(const char* name, F f) {
    f(); // First call outside the timer.
    using value_type = typename std::decay<decltype(f()[0])>::type;

    std::cout << name << ": " << std::flush;
    auto t = timer{};
    auto sum = uint64_t{};
    for (auto i = n_iterations; i > 0; --i) {
        const auto& vec = f();
        sum += std::accumulate(begin(vec), end(vec), value_type{});
    }
    const auto elapsed = t.elapsed<std::chrono::milliseconds>();
    std::cout << elapsed << " ms (" << sum << ")\n";
}

} // anonymous namespace

int main() {
    test_one("local static", f);
    test_one("global static", g);
}
Running at Coliru, the local version performs 5e8 iterations in 4618 ms and the global version in 4392 ms. So yes, the local version is slower, by (4618 − 4392) ms / 5e8 iterations ≈ 0.452 ns per iteration. Although the difference is measurable, it is far too small to affect observed performance in most cases.

Here is a second version of the benchmark, with each approach wrapped in a function object:
#include <atomic>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <numeric>
#include <type_traits>
#include <vector>

namespace {

class timer {
    using hrc = std::chrono::high_resolution_clock;
    hrc::time_point start;

    static hrc::time_point now() {
        // Prevent memory operations from reordering across the
        // time measurement. This is likely overkill.
        std::atomic_thread_fence(std::memory_order_seq_cst);
        auto t = hrc::now();
        std::atomic_thread_fence(std::memory_order_seq_cst);
        return t;
    }

public:
    timer() : start(now()) {}

    hrc::duration elapsed() const {
        return now() - start;
    }

    template <typename Duration>
    typename Duration::rep elapsed() const {
        return std::chrono::duration_cast<Duration>(elapsed()).count();
    }

    template <typename Rep, typename Period>
    Rep elapsed() const {
        return elapsed<std::chrono::duration<Rep, Period>>();
    }
};

class f {
public:
    const std::vector<int>& operator()() {
        static const auto x = std::vector<int>{ 1, 2, 3 };
        return x;
    }
};

class g {
    static const std::vector<int> x;
public:
    const std::vector<int>& operator()() {
        return x;
    }
};
const std::vector<int> g::x{ 1, 2, 3 };

const unsigned long long n_iterations = 500000000;

template <typename F>
void test_one(const char* name, F f) {
    f(); // First call outside the timer.
    using value_type = typename std::decay<decltype(f()[0])>::type;

    std::cout << name << ": " << std::flush;
    auto t = timer{};
    auto sum = uint64_t{};
    for (auto i = n_iterations; i > 0; --i) {
        const auto& vec = f();
        sum += std::accumulate(begin(vec), end(vec), value_type{});
    }
    const auto elapsed = t.elapsed<std::chrono::milliseconds>();
    std::cout << elapsed << " ms (" << sum << ")\n";
}

} // anonymous namespace

int main() {
    test_one("local static", f());
    test_one("global static", g());
}
Unsurprisingly, both variants run faster here, with both g++ (local 3803 ms, global 2323 ms) and clang (local 4183 ms, global 3253 ms). The results confirm our intuition that the global technique should be faster than the local one, with per-iteration deltas of (3803 − 2323) ms / 5e8 ≈ 2.96 ns for g++ and (4183 − 3253) ms / 5e8 ≈ 1.86 ns for clang.
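If that steady-state check ever did show up in a profile, one way to sidestep it entirely (not part of the benchmark above, just a sketch) is to bind the returned reference once outside the hot loop, so the initialization guard is only consulted a single time:

#include <numeric>
#include <vector>

const std::vector<int>& f() {
    static const auto x = std::vector<int>{ 1, 2, 3 };
    return x;
}

long long hot_loop(unsigned long long iterations) {
    const auto& vec = f();  // guard checked once, not on every iteration
    auto sum = 0LL;
    for (auto i = iterations; i > 0; --i)
        sum += std::accumulate(vec.begin(), vec.end(), 0);
    return sum;
}

In the benchmark above the call is deliberately kept inside the loop, since measuring that repeated check is the whole point.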
Yes, there is a cost to checking whether the object has already been initialized. It typically tests an atomic Boolean flag rather than locking a mutex.
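To make that concrete, here is a minimal sketch of what the compiler-generated check for the function-local static roughly amounts to. Real compilers emit ABI-specific guard code (for example __cxa_guard_acquire/__cxa_guard_release in the Itanium C++ ABI, plus destructor registration), so the names, guard layout, and memory orderings below are illustrative assumptions, not the actual lowering:

#include <atomic>
#include <mutex>
#include <new>
#include <vector>

namespace detail {
    // Storage and guard state that the compiler would normally generate.
    alignas(std::vector<int>) unsigned char storage[sizeof(std::vector<int>)];
    std::atomic<bool> constructed{false};
    std::mutex guard;  // only taken on the slow path
}

const std::vector<int>& f_lowered() {
    using namespace detail;
    // Fast path: every call after initialization pays one acquire load.
    if (!constructed.load(std::memory_order_acquire)) {
        // Slow path: serialize concurrent first calls and construct exactly once.
        std::lock_guard<std::mutex> lock(guard);
        if (!constructed.load(std::memory_order_relaxed)) {
            ::new (static_cast<void*>(storage)) std::vector<int>{ 1, 2, 3 };
            constructed.store(true, std::memory_order_release);
        }
    }
    return *reinterpret_cast<const std::vector<int>*>(storage);
}

The steady-state cost is the single acquire load at the top of the fast path, which is consistent with the small but measurable per-iteration difference observed above; on x86, an acquire load compiles to an ordinary load, so the overhead is essentially the extra branch.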