/*

Copyright (c) 2014, NVIDIA Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.

*/

//#undef _WIN32_WINNT
//#define _WIN32_WINNT 0x0602

#if defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__APPLE__)

// Skip for now

#else

// NOTE(review): the header names on the #include lines of this file were
// missing from the copy under review. The names below are reconstructed from
// what the file uses (gtest TEST/EXPECT_TRUE, omp_* calls, std:: facilities);
// confirm against the upstream file before merging.
#include <gtest/gtest.h>

#ifdef USEOMP
#include <omp.h>
#endif

#include <algorithm>
#include <atomic>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
#include <map>
#include <mutex>
#include <sstream>
#include <string>
#include <thread>
#include <utility>
#include <vector>

//#include   (disabled include; its header name was lost in this copy)
//#undef __SYNCHRONIC_COMPATIBLE #include #include #include "TestSynchronic.hpp" // Uncomment to allow test to dump output //#define VERBOSE_TEST namespace Test { unsigned next_table[] = { 0, 1, 2, 3, //0-3 4, 4, 6, 6, //4-7 8, 8, 8, 8, //8-11 12, 12, 12, 12, //12-15 16, 16, 16, 16, //16-19 16, 16, 16, 16, //20-23 24, 24, 24, 24, //24-27 24, 24, 24, 24, //28-31 32, 32, 32, 32, //32-35 32, 32, 32, 32, //36-39 40, 40, 40, 40, //40-43 40, 40, 40, 40, //44-47 48, 48, 48, 48, //48-51 48, 48, 48, 48, //52-55 56, 56, 56, 56, //56-59 56, 56, 56, 56, //60-63 }; //change this if you want to allow oversubscription of the system, by default only the range {1-(system size)} is tested #define FOR_GAUNTLET(x) for(unsigned x = (std::min)(std::thread::hardware_concurrency()*8,unsigned(sizeof(next_table)/sizeof(unsigned))); x; x = next_table[x-1]) //set this to override the benchmark of barriers to use OMP barriers instead of n3998 std::barrier //#define USEOMP #if defined(__SYNCHRONIC_COMPATIBLE) #define PREFIX "futex-" #else #define PREFIX "backoff-" #endif //this test uses a custom Mersenne twister to eliminate implementation variation MersenneTwister mt; int dummya = 1, dummyb =1; int dummy1 = 1; std::atomic dummy2(1); std::atomic dummy3(1); double time_item(int const count = (int)1E8) { clock_t const start = clock(); for(int i = 0;i < count; ++i) mt.integer(); clock_t const end = clock(); double elapsed_seconds = (end - start) / double(CLOCKS_PER_SEC); return elapsed_seconds / count; } double time_nil(int const count = (int)1E08) { clock_t const start = clock(); dummy3 = count; for(int i = 0;i < (int)1E6; ++i) { if(dummy1) { // Do some work while holding the lock int workunits = dummy3;//(int) (mtc.poissonInterval((float)num_items_critical) + 0.5f); for (int j = 1; j < workunits; j++) dummy1 &= j; // Do one work unit dummy2.fetch_add(dummy1,std::memory_order_relaxed); } } clock_t const end = clock(); double elapsed_seconds = (end - start) / double(CLOCKS_PER_SEC); return 
elapsed_seconds / count; } template void testmutex_inner(mutex_type& m, std::atomic& t,std::atomic& wc,std::atomic& wnc, int const num_iterations, int const num_items_critical, int const num_items_noncritical, MersenneTwister& mtc, MersenneTwister& mtnc, bool skip) { for(int k = 0; k < num_iterations; ++k) { if(num_items_noncritical) { // Do some work without holding the lock int workunits = num_items_noncritical;//(int) (mtnc.poissonInterval((float)num_items_noncritical) + 0.5f); for (int i = 1; i < workunits; i++) mtnc.integer(); // Do one work unit wnc.fetch_add(workunits,std::memory_order_relaxed); } t.fetch_add(1,std::memory_order_relaxed); if(!skip) { std::unique_lock l(m); if(num_items_critical) { // Do some work while holding the lock int workunits = num_items_critical;//(int) (mtc.poissonInterval((float)num_items_critical) + 0.5f); for (int i = 1; i < workunits; i++) mtc.integer(); // Do one work unit wc.fetch_add(workunits,std::memory_order_relaxed); } } } } template void testmutex_outer(std::map>& results, std::string const& name, double critical_fraction, double critical_duration) { std::ostringstream truename; truename << name << " (f=" << critical_fraction << ",d=" << critical_duration << ")"; std::vector& data = results[truename.str()]; double const workItemTime = time_item() , nilTime = time_nil(); int const num_items_critical = (critical_duration <= 0 ? 0 : (std::max)( int(critical_duration / workItemTime + 0.5), int(100 * nilTime / workItemTime + 0.5))), num_items_noncritical = (num_items_critical <= 0 ? 0 : int( ( 1 - critical_fraction ) * num_items_critical / critical_fraction + 0.5 )); FOR_GAUNTLET(num_threads) { //Kokkos::Impl::portable_sleep(std::chrono::microseconds(2000000)); int const num_iterations = (num_items_critical + num_items_noncritical != 0) ? 
#ifdef __SYNCHRONIC_JUST_YIELD int( 1 / ( 8 * workItemTime ) / (num_items_critical + num_items_noncritical) / num_threads + 0.5 ) : #else int( 1 / ( 8 * workItemTime ) / (num_items_critical + num_items_noncritical) / num_threads + 0.5 ) : #endif #ifdef WIN32 int( 1 / workItemTime / (20 * num_threads * num_threads) ); #else int( 1 / workItemTime / (200 * num_threads * num_threads) ); #endif #ifdef VERBOSE_TEST std::cerr << "running " << truename.str() << " #" << num_threads << ", " << num_iterations << " * " << num_items_noncritical << "\n" << std::flush; #endif std::atomic t[2], wc[2], wnc[2]; clock_t start[2], end[2]; for(int pass = 0; pass < 2; ++pass) { t[pass] = 0; wc[pass] = 0; wnc[pass] = 0; srand(num_threads); std::vector randomsnc(num_threads), randomsc(num_threads); mutex_type m; start[pass] = clock(); #ifdef USEOMP omp_set_num_threads(num_threads); std::atomic _j(0); #pragma omp parallel { int const j = _j.fetch_add(1,std::memory_order_relaxed); testmutex_inner(m, t[pass], wc[pass], wnc[pass], num_iterations, num_items_critical, num_items_noncritical, randomsc[j], randomsnc[j], pass==0); num_threads = omp_get_num_threads(); } #else std::vector threads(num_threads); for(unsigned j = 0; j < num_threads; ++j) threads[j] = new std::thread([&,j](){ testmutex_inner(m, t[pass], wc[pass], wnc[pass], num_iterations, num_items_critical, num_items_noncritical, randomsc[j], randomsnc[j], pass==0); } ); for(unsigned j = 0; j < num_threads; ++j) { threads[j]->join(); delete threads[j]; } #endif end[pass] = clock(); } if(t[0] != t[1]) throw std::string("mismatched iteration counts"); if(wnc[0] != wnc[1]) throw std::string("mismatched work item counts"); double elapsed_seconds_0 = (end[0] - start[0]) / double(CLOCKS_PER_SEC), elapsed_seconds_1 = (end[1] - start[1]) / double(CLOCKS_PER_SEC); double time = (elapsed_seconds_1 - elapsed_seconds_0 - wc[1]*workItemTime) / num_iterations; data.push_back(time); #ifdef VERBOSE_TEST std::cerr << truename.str() << " : " << 
num_threads << "," << elapsed_seconds_1 / num_iterations << " - " << elapsed_seconds_0 / num_iterations << " - " << wc[1]*workItemTime/num_iterations << " = " << time << " \n"; #endif } } template void testbarrier_inner(barrier_type& b, int const num_threads, int const j, std::atomic& t,std::atomic& w, int const num_iterations_odd, int const num_iterations_even, int const num_items_noncritical, MersenneTwister& arg_mt, bool skip) { for(int k = 0; k < (std::max)(num_iterations_even,num_iterations_odd); ++k) { if(k >= (~j & 0x1 ? num_iterations_odd : num_iterations_even )) { if(!skip) b.arrive_and_drop(); break; } if(num_items_noncritical) { // Do some work without holding the lock int workunits = (int) (arg_mt.poissonInterval((float)num_items_noncritical) + 0.5f); for (int i = 1; i < workunits; i++) arg_mt.integer(); // Do one work unit w.fetch_add(workunits,std::memory_order_relaxed); } t.fetch_add(1,std::memory_order_relaxed); if(!skip) { int const thiscount = (std::min)(k+1,num_iterations_odd)*((num_threads>>1)+(num_threads&1)) + (std::min)(k+1,num_iterations_even)*(num_threads>>1); if(t.load(std::memory_order_relaxed) > thiscount) { std::cerr << "FAILURE: some threads have run ahead of the barrier (" << t.load(std::memory_order_relaxed) << ">" << thiscount << ").\n"; EXPECT_TRUE(false); } #ifdef USEOMP #pragma omp barrier #else b.arrive_and_wait(); #endif if(t.load(std::memory_order_relaxed) < thiscount) { std::cerr << "FAILURE: some threads have fallen behind the barrier (" << t.load(std::memory_order_relaxed) << "<" << thiscount << ").\n"; EXPECT_TRUE(false); } } } } template void testbarrier_outer(std::map>& results, std::string const& name, double barrier_frequency, double phase_duration, bool randomIterations = false) { std::vector& data = results[name]; double const workItemTime = time_item(); int const num_items_noncritical = int( phase_duration / workItemTime + 0.5 ); FOR_GAUNTLET(num_threads) { int const num_iterations = int( barrier_frequency ); #ifdef 
VERBOSE_TEST std::cerr << "running " << name << " #" << num_threads << ", " << num_iterations << " * " << num_items_noncritical << "\r" << std::flush; #endif srand(num_threads); MersenneTwister local_mt; int const num_iterations_odd = randomIterations ? int(local_mt.poissonInterval((float)num_iterations)+0.5f) : num_iterations, num_iterations_even = randomIterations ? int(local_mt.poissonInterval((float)num_iterations)+0.5f) : num_iterations; std::atomic t[2], w[2]; std::chrono::time_point start[2], end[2]; for(int pass = 0; pass < 2; ++pass) { t[pass] = 0; w[pass] = 0; srand(num_threads); std::vector randoms(num_threads); barrier_type b(num_threads); start[pass] = std::chrono::high_resolution_clock::now(); #ifdef USEOMP omp_set_num_threads(num_threads); std::atomic _j(0); #pragma omp parallel { int const j = _j.fetch_add(1,std::memory_order_relaxed); testbarrier_inner(b, num_threads, j, t[pass], w[pass], num_iterations_odd, num_iterations_even, num_items_noncritical, randoms[j], pass==0); num_threads = omp_get_num_threads(); } #else std::vector threads(num_threads); for(unsigned j = 0; j < num_threads; ++j) threads[j] = new std::thread([&,j](){ testbarrier_inner(b, num_threads, j, t[pass], w[pass], num_iterations_odd, num_iterations_even, num_items_noncritical, randoms[j], pass==0); }); for(unsigned j = 0; j < num_threads; ++j) { threads[j]->join(); delete threads[j]; } #endif end[pass] = std::chrono::high_resolution_clock::now(); } if(t[0] != t[1]) throw std::string("mismatched iteration counts"); if(w[0] != w[1]) throw std::string("mismatched work item counts"); int const phases = (std::max)(num_iterations_odd, num_iterations_even); std::chrono::duration elapsed_seconds_0 = end[0]-start[0], elapsed_seconds_1 = end[1]-start[1]; double const time = (elapsed_seconds_1.count() - elapsed_seconds_0.count()) / phases; data.push_back(time); #ifdef VERBOSE_TEST std::cerr << name << " : " << num_threads << "," << elapsed_seconds_1.count() / phases << " - " << 
elapsed_seconds_0.count() / phases << " = " << time << " \n"; #endif } } template struct mutex_tester; template struct mutex_tester { static void run(std::map>& results, std::string const name[], double critical_fraction, double critical_duration) { testmutex_outer(results, *name, critical_fraction, critical_duration); } }; template struct mutex_tester { static void run(std::map>& results, std::string const name[], double critical_fraction, double critical_duration) { mutex_tester::run(results, name, critical_fraction, critical_duration); mutex_tester::run(results, ++name, critical_fraction, critical_duration); } }; TEST( synchronic, main ) { //warm up time_item(); //measure up #ifdef VERBOSE_TEST std::cerr << "measuring work item speed...\r"; std::cerr << "work item speed is " << time_item() << " per item, nil is " << time_nil() << "\n"; #endif try { std::pair testpoints[] = { {1, 0}, /*{1E-1, 10E-3}, {5E-1, 2E-6}, {3E-1, 50E-9},*/ }; for(auto x : testpoints ) { std::map> results; //testbarrier_outer(results, PREFIX"bar 1khz 100us", 1E3, x.second); std::string const names[] = { PREFIX"tkt", PREFIX"mcs", PREFIX"ttas", PREFIX"std" #ifdef WIN32 ,PREFIX"srw" #endif }; //run --> mutex_tester< ticket_mutex, mcs_mutex, ttas_mutex, std::mutex #ifdef WIN32 ,srw_mutex #endif >::run(results, names, x.first, x.second); //<-- run #ifdef VERBOSE_TEST std::cout << "threads"; for(auto & i : results) std::cout << ",\"" << i.first << '\"'; std::cout << std::endl; int j = 0; FOR_GAUNTLET(num_threads) { std::cout << num_threads; for(auto & i : results) std::cout << ',' << i.second[j]; std::cout << std::endl; ++j; } #endif } } catch(std::string & e) { std::cerr << "EXCEPTION : " << e << std::endl; EXPECT_TRUE( false ); } } } // namespace Test #endif