// Listing: multithreading_and_simd_linux.cpp (extracted from a SlideShare page)
/*
* multi.cpp
*
* Created on: 13 Mar 2014
* Author: Russell John Childs.
*/
//=======================================================================================================
// COPYRIGHT NOTICE
// This code sample and ideas embodied remain the property of Russell John Childs, PhD and have been
// distributed as a representative example of my use of C++11 features.
//=======================================================================================================
//====================================================
// File contains
// (1) Implementation of lock-based thread pool.
// (2) Implementation of lock-free "thread pool".
// (3) Implementation of SIMD, autovectorisation (N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor)
// Implementation of parallel search (using lock-based thread pool, lock-free event-based pool, SIMD parallel for):
// 1) Split sorted array into <num_threads> equal chunks
// 2) Assign each chunk to a thread.
// 3) Thread returns true iff chunk.begin() <= search_val <= chunk.end()
// 4) Replace array with chunk that returned true and return to step 1
// Complexity:
// t - number of threads ( > 1 )
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
// Compiling this code sample (Linux Mint - g++ 4.8)
//
// Compiler options:
// g++ -O3 -fopenmp -mavx -m64 -g -Wall -c -fmessage-length=0 -fno-omit-frame-pointer
// --fast-math -ftree-vectorizer-verbose=3 -std=c++11 -I/opt/intel/vtune_amplifier_xe_2013/include/
// multithreading.cpp
//
// Linker options:
// g++ -fopenmp -L/opt/intel/vtune_amplifier_xe_2013/lib64/ -o "multithreading" $(OBJS) $(USER_OBJS) $(LIBS) -lpthread -latomic -littnotify -ldl
//
//==============================================================
#include <algorithm>
#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <deque>
#include <functional>
#include <future>
#include <iostream>
#include <memory>
#include <mutex>
#include <set>
#include <sstream>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include <immintrin.h>
#include <omp.h>
//#include <cilk/cilk.h>
//Uncomment following #define if you do not have Intel VTune Amplifier XE 2013 performance profiler.
#define INTEL_NO_ITTNOTIFY_API
//include Vtune API header iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifndef INTEL_NO_ITTNOTIFY_API
#include "ittnotify.h"
#endif
// VTUNE(STATEMENT)
//   Expands to STATEMENT only when the VTune ITT API is available, i.e. when
//   INTEL_NO_ITTNOTIFY_API is NOT #defined; otherwise expands to nothing.
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE(STATEMENT)
#else
#define VTUNE(STATEMENT) STATEMENT
#endif
// VTUNE_TASK(DOMAIN, FUNC, STATEMENTS)
//   DOMAIN     - string naming the ITT domain the task is reported under.
//   FUNC       - string naming the task (e.g. "simd_search()").
//   STATEMENTS - code to run; must not contain a top-level comma because it is
//                passed as a single macro argument.
//   Without the ITT API the macro expands to STATEMENTS unchanged. With it,
//   STATEMENTS are bracketed by __itt_task_begin / __itt_task_end inside their
//   own scope. The local is given an unusual name so that it cannot collide
//   with identifiers used inside STATEMENTS.
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) STATEMENTS
#else
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) \
{ \
    auto vtune_task_domain_ = __itt_domain_create(DOMAIN); \
    __itt_task_begin(vtune_task_domain_, __itt_null, __itt_null, __itt_string_handle_create(FUNC)); \
    STATEMENTS \
    __itt_task_end(vtune_task_domain_); \
}
#endif
// =================================================================
// Class wrapper for std::packaged_task to make different signatures, e.g. int(void), float(int,int), ...
// storable in STL container for thread pool.
// ===
// N.B. simpler mechanism would be std::vector<std::function<void(void)>>; v[i] = std::packaged_task<Type(Type)>(type),
// since packaged_task has void operator()(void). However, there is a problem: std::function
// requires command object to be copyable and packaged_task has move-only semantics.
//==================================================================
//============================
//Primary template
//============================
// Primary template. Only the two specialisations below are used by the pool;
// the primary exists so that MyPackagedTask<> (both parameters defaulted to
// void) and MyPackagedTask<Out(In...)> are spellings of the same template.
template< typename Out = void, typename In = void >
struct MyPackagedTask
{
    virtual ~MyPackagedTask() = default;
};
//============================
//Explicit specialization, acts as the type-erased base class
// MyPackagedTask<>& poly = *new MyPackagedTask<MyType(OtherType)>(...);
// poly(); --> calls MyPackagedTask<MyType(OtherType)>::operator()
//============================
template<>
struct MyPackagedTask<>
{
    virtual ~MyPackagedTask() = default;
    // Runs the task. The base version does nothing (a NOP task).
    virtual void operator()()
    {
    }
};
// NOTE(review): not referenced anywhere in the visible part of this file;
// kept because code outside this view may use it.
std::mutex last_return_mutex;
//============================
//Specialization for function signature
// MyPackagedTask<MyType(OtherType)>
// Binds a callable to its arguments and wraps the result in a
// std::packaged_task<Out()>, so the call can be made through the
// MyPackagedTask<> base while the result is delivered through a future.
// Move-only, like the std::packaged_task it owns.
//============================
template< typename Out, typename... In >
struct MyPackagedTask< Out(In...) > : public MyPackagedTask<>
{
    // func - callable to run later.
    // in   - arguments; decayed copies are stored inside the bound call.
    // std::forward<In> moves by-value arguments into the binder but leaves the
    // caller's object untouched when In is an lvalue-reference type.
    MyPackagedTask(std::function<Out(In...)> func, In... in) :
        m_task(std::bind(std::move(func), std::forward<In>(in)...))
    {
    }
    ~MyPackagedTask() override = default;
    // Transfers the packaged task (and its shared state) from other.
    MyPackagedTask(MyPackagedTask&& other) noexcept :
        m_task(std::move(other.m_task))
    {
    }
    MyPackagedTask(const MyPackagedTask&) = delete;
    MyPackagedTask& operator=(const MyPackagedTask&) = delete;
    // Invokes the bound call; the result (or exception) is stored in the
    // shared state read by the future from get_future().
    void operator()() override
    {
        m_task();
    }
    // Returns the future for the task's result. As with std::packaged_task,
    // it may be obtained only once.
    std::future<Out> get_future()
    {
        return m_task.get_future();
    }
private:
    std::packaged_task<Out()> m_task;
};
//======================================================================
// Simple thread pool class
// Places tasks onto common queue
// Allocates fixed number of threads which pop tasks.
// TODO: Load balancing, cache ping-pong (RFOs), VTune optimisation.
//====================================================================
class ThreadPool
{
public:
ThreadPool(unsigned max_num_threads = 1U << 31) :
m_done(false), //notice to threads to shut down
m_print_shutdown_msg(true), //print or not print shutdown msg
m_max_num_threads(max_num_threads), //maximum num threads allowed in pool
m_num_threads(0), //num threads allocated by the pool
m_processing(0), //tasks still running
m_cancel(false)
{
}
~ThreadPool(void)
{
//Shut down threads iff user has not alread called shutdown()
if (!m_done)
{
shutdown();
}
}
//=================
// Push task onto pool
//================
template< typename Out, typename... In >
std::future<Out> push(std::function<Out(In...)> func, In... in)
{
//Create task, store future
MyPackagedTask<Out(In...)> task(func, in...);
std::future<Out> ret_val = task.get_future();
//lock task queue, push the new task onto the queue, notify threads waiting on empty queue, release
lock
if (m_cancel == false)
{
{
std::unique_ptr<MyPackagedTask<>> ptr(new MyPackagedTask<Out(In...)>(std::move(task))); //
Base*=&Derived for poly call
std::lock_guard<std::mutex> lock(m_tasks); //lock queue
m_pool.push_back(std::move(ptr)); //push task
} //release lock
m_condition_variable.notify_all(); //notify waiting threads
//spawn a thread(async will prevent oversubscription) and store thread future(to check for
thread termination at pool shutdown)
if ((++m_num_threads <= m_max_num_threads))
{
std::unique_lock<std::mutex> lock(m_threads);
m_thread_list.push_back(std::async(std::launch::async, &ThreadPool::run_tasks, this));
}
4C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
}
//return packaged_task future so that caller can wait for result
return ret_val;
}
//=================
// get number of threads allocated
//================
unsigned get_num_threads(void)
{
std::unique_lock<std::mutex> lock(m_threads);
return m_thread_list.size();
}
//=================
// Cancel all tasks but keep threads alivre (for reuse by next set of tasks during iteration). Not yest
tested.
//================
void cancel_tasks(void)
{
m_cancel = true;
while (m_processing != 0);
{
std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
m_pool.clear();
}
m_cancel = false;
}
//=================
// Kill all threads and print out shutdown message (iff msg==true)
//================
void shutdown(bool msg = true)
{
m_print_shutdown_msg = msg;
{
if (m_print_shutdown_msg)
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::endl << "=================================================================
" << std::endl
<< "Shutting down threads: ";
}
}
cancel_tasks();
//Notify all threads of thread pool termination
m_done = true;
m_condition_variable.notify_all();
//Loop over all threads and wait for them to terminate
{
std::unique_lock<std::mutex> lock(m_threads);
for (auto& elem : m_thread_list)
{
while (!elem.valid());
elem.get();
}
}
//Clear thread queue
{
std::unique_lock<std::mutex> lock(m_threads);
m_thread_list.clear();
}
//Print out shutdown message
if (m_print_shutdown_msg)
5C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::endl << "=================================================================" <<
std::endl;
}
}
private:
//=================
// Pop and run tasks in threads.
//================
void run_tasks(void)
{
//To avoid branch misprediction, use array to store branch code instead of if-else
std::unique_ptr<MyPackagedTask<>> func;
std::function<void(void)> branch_true = [&]{ func = std::move(m_pool.front()); m_pool.pop_front();
};
std::function<void(void)> branch_false = [&]{ func = std::unique_ptr<MyPackagedTask<>>(new
MyPackagedTask<>); };//NOP
std::function<void(void)> switch_func[2]{ branch_false, branch_true};
while (!m_done)
{
// Only wait if there are still tasks to be processed
{
bool empty; //Status of task queue
std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
m_condition_variable.wait_for(lock, std::chrono::nanoseconds(100), [&]{ return !(empty =
m_pool.empty()) || m_done; }); //wakeup if queue empty or shutdown
switch_func[!empty && !m_done](); //only run non-NOP if queue not empty and not shutdown.
}
++m_processing;
(*func)();
--m_processing;
}
//Print out shutdown msg
if (m_done & m_print_shutdown_msg)
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::this_thread::get_id() << " ";
}
}
std::atomic<bool> m_done;
std::atomic<bool> m_print_shutdown_msg;
std::atomic<unsigned> m_max_num_threads;
std::atomic<unsigned> m_num_threads;
std::atomic<unsigned> m_processing;
std::atomic<bool> m_cancel;
std::deque< std::unique_ptr<MyPackagedTask<>> > m_pool;
std::vector< std::future<void> > m_thread_list;
std::mutex m_threads;
std::mutex m_tasks;
std::mutex m_shutdown;
std::condition_variable m_condition_variable;
};
//=====================================
// Simple test class
// Creates a few tasks, pushes them onto thread pool, gets results
//==================================================================
struct SimpleTest
{
    // Runs the demo on construction: pushes two one-second tasks onto a
    // ThreadPool, waits for both results and prints the thread ids, results and
    // the elapsed wall-clock time. The pool is shut down by its destructor.
    // The handler below belongs to a constructor function-try-block, so the
    // exception is rethrown to the caller after the message is printed.
    SimpleTest(void) try
    {
        std::cout << std::endl << "Simple Test......" << std::endl << std::endl;
        //Create thread pool
        ThreadPool thread_pool;
        //create a task
        std::thread::id f1_id;
        std::function< int(int, int) > f1 = [&](int i, int j)
        {
            f1_id = std::this_thread::get_id();
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
            return i*j;
        };
        //create another task
        std::thread::id f2_id;
        std::function< std::string(void) > f2 = [&](void)
        {
            f2_id = std::this_thread::get_id();
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
            return std::string("return value of f2");
        };
        //create another task (not pushed yet, see TODO below)
        std::thread::id f3_id;
        std::string f3_str;
        std::function< void(void) > f3 = [&](void)
        {
            f3_id = std::this_thread::get_id();
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
            f3_str = "f3 called";
        };
        //push tasks
        auto start = std::chrono::high_resolution_clock::now(); //start timer
        std::future<int> fut_1 = thread_pool.push(f1, 10, 20);
        std::future<std::string> fut_2 = thread_pool.push(f2);
        //get() blocks until each task has run, which also makes f1_id/f2_id safe to read below
        int fut_1_res = fut_1.get();
        std::string fut_2_res = fut_2.get();
        auto end = std::chrono::high_resolution_clock::now(); //stop timer
        //std::future<void> fut_3 = thread_pool.push(f3); //TODO - fix compilation error.
        // std::cout << typeid(decltype(thread_pool.push(f3))).name() << std::endl; // gives std::future<void>
        //std::future<void> test_fut; //compiles
        //std::future<void> test_fut1 = std::move(test_fut); //compiles
        //thread_pool.push(f3); // doesn't compile
        // std::function< int(int) > f4 = [&](int i){ return ++i; }; //compiles
        // thread_pool.push(f4, 2); //compiles
        // std::function< void(int) > f4 = [&](int i){ ++i; }; //compiles
        // thread_pool.push(f4, 2); //doesn't compile
        //print num of threads running, thread id for tasks, result sent back by tasks
        std::cout << "num threads=" << thread_pool.get_num_threads() << std::endl;
        std::cout << "f1 thread id=" << f1_id << std::endl;
        std::cout << "f1's result: " << fut_1_res << std::endl;
        std::cout << "f2 thread id=" << f2_id << std::endl;
        std::cout << "f2's result: " << fut_2_res << std::endl;
        //std::cout << "f3 thread id=" << f3_id << std::endl;
        //std::cout << "f3's result: " << f3_str << std::endl;
        std::cout << "thread_pool time = "
            << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << "ns" << std::endl;
        //cleanup threads
        //thread_pool.shutdown(); test dtor
    }
    catch (...)
    {
        std::cout << "SimpleTest exception" << std::endl;
    }
};
//==============================================================
// Parallel vs binary search test class
// t - number of threads ( > 1)
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
// Binary = std::binary_search (single threaded, timed for comparison); std::find is only used to verify the position.
// Parallel: (1) Split array into equal chunks, push them onto thread pool
// (2) Chunk containing search-val returns true. N.B. predicate simply returns begin() <= val <= end().
// (3) Chunk returning true replaces array and step (1) is repeated.
// N.B. Parallel search gets insertion point of nearest match rather than vector.end() if no match
// Not optimised for RFOs (cache ping-pong), load balancing, VTune or early match (binary is quicker if early match found).
// Benchmarks show high overhead of thread pool.
//===============================================================
struct ParallelSearch
{
//Choose which to run
bool is_lock_free = false; //run lock free lambda (if enabled, should set variable "factor", below, to
100 since the search is time consuming)
bool is_lock_based = false; //run lock-based lambda (if enabled, should set variable "factor", below,
to 100 since the search is time consuming)
bool is_simd = true; //run simd-based lambda
//Choose number of threads
//unsigned num_threads = 2; //To get thread overhead (parallel/binary = 1 for no overhead)
//const unsigned num_threads = std::thread::hardware_concurrency()/2; //undersubscription,
should run slower than optimal
const unsigned num_threads = std::thread::hardware_concurrency(); //Should be optimal
choice
//const unsigned num_threads = 2*std::thread::hardware_concurrency(); //moderate
oversubsrciption, should run slower tha optimal
//const unsigned num_threads = 4*std::thread::hardware_concurrency(); //heavy
oversubsrciption, should run slower tha optimal
//const unsigned num_threads = 128 * std::thread::hardware_concurrency(); //massive
oversubsrciption, should run slower tha optimal
ParallelSearch(void) try
{
std::atomic<bool> done(false); //flag used in lcok-free search to notify of completion
//Create large, sorted array on heap to avoid seg fault.
const unsigned size = 2 << 24;
std::vector<unsigned> my_array(size);
for (auto& elem : my_array)
{
static unsigned i = 0;
elem = 2 * i; //even numbers
++i;
}
//double-word atomic containing the address of a matching chunk and the new new chunk length (size,
size/t, size/t^2 ...)
struct DoubleWord
{
unsigned* m_address;
unsigned m_chunk_length;
};
std::atomic<DoubleWord> chunk_address_and_length(DoubleWord{ &my_array[0], size / num_threads });
//val seacrched for (TODO: binary search faster than parallel search if binary finds early match.
Need to terminate parallel search earlier)
bool even = true;
unsigned val = my_array[((size >> 1) + ((size >> 1) - 1)*rand() / RAND_MAX)] + (even ? 0 : 1); //
even/odd number --> found/not found
//Variables for found position, passes taken and whether to printout progress(incurs overhead)
unsigned* ret_val = &my_array[0];
int passes = 0; //int required by g++ autovectorize
bool printout = false;
//SIMD lambda (Proved to be quite difficult getting g++ to autovectorise)
//(N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor)
// 1. Split array into t chunks
// 2. Allocate chunks to t SIMD lanes
// 3. Each lane checks chunk.begin() <= search-val <= chunk.end()
// 4. The SIMD lane getting a match set array = chunk
// 5. Steps 1 to 4 repeated until chunk is 1 element long.
8C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
std::function<bool(void)> simd_search = [&]()
{
//Alignment (SSE - 16 byte SIMD register, AVX 32-byte SIMD register)
const unsigned alignment = 16; //g++ bug with 32-byte ((http://gcc.gnu.org/bugzilla/show_bug.cgi
?id=56787)
//Pre-calculate chunk size (size/8, size/64, size/8^3 ... 1 element(s))
alignas(alignment) int chunk_length[9]{size >> 3, size >> 6, size >> 9, size >> 12, size >> 15,
size >> 18, size >> 21, size >> 24, 1};
//Pre-calculate lower index for lower <= val << upper. N.B This is converted to lower[n]/8,
lower[n]/64 ...
alignas(alignment) int lower_index[8]{0, size, 2 * size, 3 * size, 4 * size, 5 * size, 6 * size,
7 * size};
//Pre-caclulate num of SIMD lanes to allocate to for loop to be vectorised
alignas(alignment) int limits[9]{8, 8, 8, 8, 8, 8, 8, 8, 2};
//Running tally of start of chunk to be searched
alignas(alignment) int offset = 0;
alignas(alignment) int tmp_offset = 0;
//Loop until chunk length is 1 element
for (passes = 0; passes<9; ++passes)
{
//Following lambda is a test to see if hotspots marked "LINE X" and "LINE Y", below, are
due to memory stalls.
//It turns out prefetch does eliminate hotspots X, Y, but adds overhead of its own, so this
search algorithm is unavoidably
//memory-bound unless something along the lines of a heap-ordered array (i.e array is laid
out as a breadth-first n-ary tree) is
//used to convert random access to linear access without need for scatter-gather.
//#pragma omp parallel for //Adds too much overhead
[&]() //Sadly, won't vectorise due to function call
{
unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0],
alignment); //Requirement for autovectorise.
for (int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes to prefetch data
they will use
{
int tmp = pos*chunk_length[passes]; //Get lower index
for chunk interval
__builtin_prefetch(&my_array[0] + offset + tmp); //See if it
removes hotspot from "LINE X", below
__builtin_prefetch(&my_array[0] + offset + tmp + chunk_length[passes] - 1); //See if
it removes hotspot from "LINE Y", below
}
}();
//Fork: Assign each chunk to an SIMD lane
//N.B. Use lmabda to force vectorisation of loop. Without it, loop is unrolled but SLP not
vectorised. This does autovectorise under g++ 4.8
//N.B. Code has been borken down into painfully simple steps to help autovectoriser and
pinpoint which operations are causing trouble
[&]()
{
unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0],
alignment); //Requirement for autovectorise.
alignas(alignment) int chunk = chunk_length[passes];
for (alignas(alignment) int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes
{
//Find matching chunk by adding 0 to offset for no-match and chunk address for a
match
alignas(alignment) int tmp = pos*chunk; //Lower index of chunk range without offset
//int tmp=lower_index[pos]>>3; //g++ bug (http://gcc.gnu.org/bugzilla/show_bug.cgi?
id=56787). Can't use 32-byte AVX.
alignas(alignment) int lower_ind = offset + tmp; //Lower index of chunk range
alignas(alignment) int upper_ind = lower_ind + chunk - 1; //Upper index of chunk
range
unsigned lower_val = tmp_ret_val[lower_ind]; //LINE X - Hotspot (eliminated by above
9C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
prefetch)
unsigned upper_val = tmp_ret_val[upper_ind]; //LINE Y - Hotspot (eliminated by above
prefecth)
alignas(alignment) bool test_lower = lower_val <= val; //Lower
alignas(alignment) bool test_upper = val <= upper_val; //and upper limit check
alignas(alignment) bool test = test_lower && test_upper; // is search-val inside
chunk for this SIMD lane?
tmp_offset += test*tmp; //Horrible construct to get it to autovec.
It masks out SIMD lanes that don't contain search val.
//Following fails because it is "not suitable for gather" (whatever that means)
//offset += ((tmp_ret_val[offset+tmp] <= val) & (val <= tmp_ret_val[offset+tmp+
chunk_length[passes]-1]))*tmp;
//Following fails because of "control flow" (Can't see why g++ doesn't autovec it,
control flow can be reaplced with masked op)
//if((tmp_ret_val[offset+tmp] <= val) && (val <= tmp_ret_val[offset+tmp+chunk_length
[passes]-1])) tmp_offset = tmp;
}
}();
//Join: end of SIMD
//Update chunk start address index
offset = tmp_offset;
/*std::cout << "offset=" << offset << std::endl;
std::cout << "passes=" << passes;
std::cout << ", val=" << val;
std::cout << ", range=[" << array[offset] << "," << array[offset+1];
std::cout<< ", chunk length=" << chunk_length[passes] << std::endl;
*/
}
//Update final index of search-val
ret_val = &my_array[0] + offset;
return true;
};
//Lock-free lambda for each thead
//Operation:
//1. The array is split into t (num of threads) chunks
//2. Each thread examines its chunk
//3. If a match is found in a chunk, the thread changes the array to be that chunk.
//4. The process repeated from step 1.
//t threads continous monitor the array and process their chunk of the array. Since the array
pointer is
// atomic, when one thread sees a matching chunk and changes the array to be that chunk, this is
picked up
// by all threads. No synchronisation is needed.
// arg chunk_pos - section of chunk to search (0 - [begin, begin+chunk_length/t], 1 - [begin+
chunk_length/t, begin+2*chunk_length/t], ..)
std::atomic<unsigned> running_threads(0);
std::atomic<bool> go(false);
std::function<bool(unsigned)> lock_free = [&](unsigned chunk_pos)
{
//Increment running thread count
++running_threads;
//Keep all threads on hold until signalled to begin together (for timings).
while (!go);
//Keep searching until a thread notifies completion.
while (!done)
{
//capture chunk address and length
DoubleWord capture = chunk_address_and_length;
//Check if search-val between chunk.begin() and chunk.end()
unsigned *begin = capture.m_address + chunk_pos*capture.m_chunk_length;
unsigned *end = begin + capture.m_chunk_length - 1;
unsigned test1 = *begin, test2 = *end;
if (*begin <= val && val <= *end)
10C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
{
//Print out iterations (adds significant overhead)
static std::mutex printout_mutex;
if (printout)
{
std::unique_lock<std::mutex> lock(printout_mutex);
//Print out iterations (adds significant overhead)
std::cout << "Parallel find (pass " << passes << "): Closest match "
<< *begin << "<=" << val << "<=" << *end
<< ", chunk length=" << capture.m_chunk_length << std::endl;
}
//Update parent variables for printouts
ret_val = begin;
++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num
threads)
std::function<void(void)> branch_true = [&]() //IF
{
//Update chunk length and address
capture.m_chunk_length = (capture.m_chunk_length >= num_threads ? (capture.
m_chunk_length / num_threads) : 1); //divide chunk evenly
capture.m_address = begin; //point to this chunk
chunk_address_and_length = capture;
};
std::function<void(void)> branch_false = [&]() //ELSE
{
done = true; //notify parent and sister threads of completion
};
std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if}
if_else[capture.m_chunk_length > 1](); //if-else
}
else
{
std::this_thread::sleep_for(std::chrono::nanoseconds(5000));
}
}
return true;;
};
//Create thread pool for lock-based search
static ThreadPool thread_pool(num_threads);
//Notification of completion of lock-based search
std::condition_variable finished;
// lock - based lambda for each thread.It simply tests whether array[pos] <= search_val <= array[pos
+ chuhnk_length]
// and iff true, spawns t threads to narrow down the search, iteratively arriving at the insertion
point.
std::function<bool(unsigned*, unsigned)> lock_based = [&](unsigned* tmp, unsigned chunk_length) //
{
//Keep all threads on hold until signalled to begin together (for timings).
while (!go);
//Check if search-val between chunk.begin() and chunk.end()
if (*tmp <= val && val <= *(tmp + chunk_length - 1))
{
//Print out iterations (adds significant overhead)
if (printout)
{
std::cout << "Parallel find (pass " << passes << "): Closest match " << *tmp << "<=" <<
val
<< "<=" << (chunk_length > 1 ? *(tmp + chunk_length - 1) : *tmp == val ? val : *(tmp
+ 1))
<< ", chunk length=" << chunk_length << std::endl;
}
//Update parent variables for printouts
ret_val = tmp;
++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num
threads)
11C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
//Spawn new tasks to process this chunk
//Following peculiar construct is to avoid branch misprediction by using array of fn ptrs to
replace if-else
//need VTune to test out whether it saves us any mispredictions.
std::function<void(void)> branch_true = [&]() //IF
{
chunk_length = (chunk_length >= num_threads ? (chunk_length / num_threads) : 1);
//divide chunk evenly
for (unsigned index = 0; index < num_threads; ++index)
{
thread_pool.push(lock_based, tmp + index*chunk_length, chunk_length);
}
};
std::function<void(void)> branch_false = [&]() //ELSE
{
finished.notify_one(); //chunk length is 1, so we are finished dividing-and-
conquering
};
std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if}
if_else[chunk_length>1](); //if-else
}
return true;
};
std::cout << std::endl << "Parallel vs Binary Search......" << std::endl << std::endl;
//Obtain position of element (to verify parallel search finds correct position).
auto pos = std::find(my_array.begin(), my_array.end(), val);
//Ordinary binary search for timing comparison
std::cout << std::endl << "=========================================================================
=====" << std::endl;
std::cout << "Running binary search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std
::endl;
unsigned factor = 10000; //number of times to run search
auto start_binary = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
for (unsigned i = 0; i < factor; ++i) std::binary_search(my_array.begin(), my_array.end(), val); //
binary search
VTUNE(__itt_pause();)
auto end_binary = std::chrono::high_resolution_clock::now(); //stop timer
//print out results of binary search
using std::chrono::duration_cast;
using std::chrono::nanoseconds;
std::cout << "clock resolution is: " << static_cast<double>(std::chrono::high_resolution_clock::
period::num) << " ns" << std::endl;
std::cout << "std::find: val=" << val << ", element=" << (pos != my_array.end() ? *pos : -1) << ",
index=" << pos - my_array.begin()
<< ", found==" << std::boolalpha << (pos != my_array.end()) << ", time=" << duration_cast
<nanoseconds>(end_binary - start_binary).count() << "ns" << std::endl;
//Parallel searches
//SIMD search
if (is_simd)
{
std::cout << std::endl << "=====================================================================
=========" << std::endl;
std::cout << "Running simd parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" <<
std::endl;
//Kick off the search
auto start = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
VTUNE_TASK("Parallel Search", "simd_search()",
for (unsigned i = 0; i<factor; i++) simd_search();
)
//Wait for result and then get the intertion point and number of paasses
12C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
VTUNE(__itt_pause();)
//get execution time
auto end = std::chrono::high_resolution_clock::now(); //stop timer
auto parallel_time = std::chrono::duration_cast<nanoseconds>(end - start).count();
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Simd results:" << std::endl;
std::cout << "Size of array=" << size / 1000000 << " million elements" << std::endl;
std::cout << "Search repeated " << factor << " times" << std::endl;
std::cout << "number of threads=" << running_threads << std::endl;
std::cout << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," <<
ret_val[1] << "]";
std::cout << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl;
std::cout << "O(n_parallel)/O(n_binary)=" << complexity << std::endl;
std::cout << "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time
<< "ns";
std::cout << " = ";
std::cout << parallel_time / binary_time << std::endl;
std::cout << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::
endl;
}
//Lock-free multithreaded search
if (is_lock_free)
{
std::cout << std::endl << "=====================================================================
=========" << std::endl;
std::cout << "Running lock-free parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" <<
std::endl;
double parallel_time = 0;
for (unsigned i = 0; i<factor; ++i)
{
//reset passes counter, chunk struct, done flag
passes = 0;
chunk_address_and_length = DoubleWord{ &my_array[0], size / num_threads };
done = false;
//Kick off the search
//auto start = std::chrono::high_resolution_clock::now(); //start timer
std::vector<std::future<bool>> futures;
go = false;
futures.push_back(std::move(std::async(std::launch::deferred, lock_free, 0)));
for (unsigned chunk_pos = 1; chunk_pos < num_threads; ++chunk_pos)
{
futures.push_back(std::move(std::async(std::launch::async, lock_free, chunk_pos)));
}
//Wait for result and then get the intertion point and number of paasses
auto start = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
VTUNE_TASK("Parallel Search", "lock_free()",
go = true; futures[0].get();
)
VTUNE(__itt_pause();)
//get execution time
auto end = std::chrono::high_resolution_clock::now(); //stop timer
parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
}
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Lock free results:" << std::endl
<< "Size of array=" << size / 1000000 << " million elements" << std::endl
<< "Search repeated " << factor << " times" << std::endl
13C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
<< "number of threads=" << running_threads << std::endl
<< "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] <
< "]"
<< ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl
<< "O(n_parallel)/O(n_binary)=" << complexity << std::endl
<< "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time <<
"ns"
<< " = "
<< parallel_time / binary_time << std::endl
<< "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
}
if (is_lock_based)
{
std::cout << std::endl << "=====================================================================
=========" << std::endl;
std::cout << "Running lock-based parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" <<
std::endl;
double parallel_time = 0;
for (unsigned i = 0; i<factor; ++i)
{
//reset passes counter, chunk struct, done flag
passes = 0;
//Kick off the search
//auto start = std::chrono::high_resolution_clock::now(); //start timer
go = false;
auto f = thread_pool.push(lock_based, &my_array[0], size);
//Wait for result and then get the insertion point and number of passes
{
//wait for completion
std::mutex dummy;
std::unique_lock<std::mutex> lock(dummy);
auto start = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
VTUNE_TASK("Parallel Search", "lock_based()",
go = true;
finished.wait(lock);
)
VTUNE(__itt_pause();)
//get execution time
auto end = std::chrono::high_resolution_clock::now(); //stop timer
parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
thread_pool.cancel_tasks();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
//kill thread pool
thread_pool.shutdown(false);
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Lock based results:" << std::endl
<< "Size of array=" << size / 1000000 << " million elements" << std::endl
<< "Search repeated " << factor << " times" << std::endl
<< "number of threads=" << running_threads << std::endl
<< "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] <
< "]"
<< ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl
<< "O(n_parallel)/O(n_binary)=" << complexity << std::endl
<< "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time <<
"ns"
<< " = "
14C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp
<< parallel_time / binary_time << std::endl
<< "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
}
}
catch (...)
{
std::cout << "ParallelSearch exception" << std::endl;
}
};
//==============================================================
// Program entry point.
//
// Constructing ParallelSearch runs the search benchmark(s) and prints the
// results (all of the work is done in its constructor). Afterwards the
// program blocks on stdin so that the console window stays open.
//
// The VTUNE(...) macro expands to nothing when INTEL_NO_ITTNOTIFY_API is
// #defined, so the __itt_pause() call is only compiled in when the VTune
// API header is available.
//
// Returns 0 on normal completion.
//==============================================================
int main(void)
{
    //SimpleTest simple_test;
    VTUNE(__itt_pause();)
    ParallelSearch parallel_search;
    std::cout << "Press any key to exit" << std::endl;
    //Unformatted read: unlike "std::cin >> c", get() does not skip whitespace,
    //so a bare Enter (or space + Enter) is enough to exit.
    std::cin.get(); //keep console alive
    return 0;
}

More Related Content

PDF
PHP Tips & Tricks
PDF
From mysql to MongoDB(MongoDB2011北京交流会)
PDF
PythonでJWT生成からボット作成、投稿までやってみた
PPT
PHP and MySQL
PDF
Обзор фреймворка Twisted
KEY
Can't Miss Features of PHP 5.3 and 5.4
TXT
Nouveau document texte
TXT
Yy
PHP Tips & Tricks
From mysql to MongoDB(MongoDB2011北京交流会)
PythonでJWT生成からボット作成、投稿までやってみた
PHP and MySQL
Обзор фреймворка Twisted
Can't Miss Features of PHP 5.3 and 5.4
Nouveau document texte
Yy

What's hot (19)

PDF
Rooted 2010 ppp
TXT
PDF
Asciidoctor New, Noteworthy and Beyond Devoxx-2017
PDF
Teaching Your Machine To Find Fraudsters
TXT
PDF
Debugging: Rules And Tools - PHPTek 11 Version
KEY
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
TXT
PDF
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
PDF
"let ECMAScript = 6"
PDF
PHP Static Code Review
PDF
Your code is not a string
KEY
Php 101: PDO
PDF
Drush. Secrets come out.
PPTX
How I Built a Power Debugger Out of the Standard Library and Things I Found o...
PDF
Twitter codeigniter library
PDF
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
PDF
PHP Data Objects
Rooted 2010 ppp
Asciidoctor New, Noteworthy and Beyond Devoxx-2017
Teaching Your Machine To Find Fraudsters
Debugging: Rules And Tools - PHPTek 11 Version
Introduction to CloudForecast / YAPC::Asia 2010 Tokyo
Security Meetup 22 октября. «Реверс-инжиниринг в Enterprise». Алексей Секрето...
"let ECMAScript = 6"
PHP Static Code Review
Your code is not a string
Php 101: PDO
Drush. Secrets come out.
How I Built a Power Debugger Out of the Standard Library and Things I Found o...
Twitter codeigniter library
Eric Redmond – Distributed Search on Riak 2.0 - NoSQL matters Barcelona 2014
PHP Data Objects
Ad

Viewers also liked (7)

PDF
Shared_memory_hash_table
PDF
Algorithms devised for a google interview
PDF
Dynamic programming burglar_problem
PDF
Cpp11 sample linux
PDF
Simple shared mutex UML
PDF
Full resume dr_russell_john_childs_2016
PDF
Interview C++11 code
Shared_memory_hash_table
Algorithms devised for a google interview
Dynamic programming burglar_problem
Cpp11 sample linux
Simple shared mutex UML
Full resume dr_russell_john_childs_2016
Interview C++11 code
Ad

Similar to Cpp11 multithreading and_simd_linux_code (20)

PDF
Deploying Plone and Volto, the Hard Way
PPT
PHP CLI: A Cinderella Story
PDF
Augeas @RMLL 2012
PDF
Configuration Surgery with Augeas
TXT
My shell
PDF
A journey through the years of UNIX and Linux service management
PPTX
Introduction to Apache Mesos
PDF
Quick tour of PHP from inside
DOCX
Program Assignment Process ManagementObjective This program a.docx
KEY
PDF
Ansible inside
PDF
Hadoop meetup : HUGFR Construire le cluster le plus rapide pour l'analyse des...
PPTX
Harmonious Development: Via Vagrant and Puppet
PDF
Assignment of SOS operating systemThe file lmemman.c has one incom.pdf
PDF
Penetration Testing for Easy RM to MP3 Converter Application and Post Exploit
PDF
start_printf: dev/ic/com.c comstart()
PDF
The Rust Programming Language: an Overview
PDF
Vagrant for real
PDF
Puppet and the HashiStack
Deploying Plone and Volto, the Hard Way
PHP CLI: A Cinderella Story
Augeas @RMLL 2012
Configuration Surgery with Augeas
My shell
A journey through the years of UNIX and Linux service management
Introduction to Apache Mesos
Quick tour of PHP from inside
Program Assignment Process ManagementObjective This program a.docx
Ansible inside
Hadoop meetup : HUGFR Construire le cluster le plus rapide pour l'analyse des...
Harmonious Development: Via Vagrant and Puppet
Assignment of SOS operating systemThe file lmemman.c has one incom.pdf
Penetration Testing for Easy RM to MP3 Converter Application and Post Exploit
start_printf: dev/ic/com.c comstart()
The Rust Programming Language: an Overview
Vagrant for real
Puppet and the HashiStack

More from Russell Childs (20)

PDF
spinor_quantum_simulator_user_guide_.pdf
PDF
String searching o_n
PDF
String searching o_n
PDF
String searching o_n
PDF
String searching
PDF
PDF
PDF
Feature extraction using adiabatic theorem
PDF
Feature extraction using adiabatic theorem
PDF
Wavelets_and_multiresolution_in_two_pages
PDF
Relativity 2
PDF
Recursion to iteration automation.
PDF
Dirac demo (quantum mechanics with C++). Please note: There is a problem with...
PDF
Design pattern to avoid downcasting
PDF
Interview uml design
PDF
Full_resume_Dr_Russell_John_Childs
PDF
K d tree_cpp
PDF
Multithreaded sockets c++11
PDF
IBM Kinexa Prove It! C programming test results.
PDF
IBM Kinexa Prove It! C++ programming test results.
spinor_quantum_simulator_user_guide_.pdf
String searching o_n
String searching o_n
String searching o_n
String searching
Feature extraction using adiabatic theorem
Feature extraction using adiabatic theorem
Wavelets_and_multiresolution_in_two_pages
Relativity 2
Recursion to iteration automation.
Dirac demo (quantum mechanics with C++). Please note: There is a problem with...
Design pattern to avoid downcasting
Interview uml design
Full_resume_Dr_Russell_John_Childs
K d tree_cpp
Multithreaded sockets c++11
IBM Kinexa Prove It! C programming test results.
IBM Kinexa Prove It! C++ programming test results.

Recently uploaded (20)

PDF
Advanced methodologies resolving dimensionality complications for autism neur...
PDF
Approach and Philosophy of On baking technology
PDF
Dropbox Q2 2025 Financial Results & Investor Presentation
PPTX
Digital-Transformation-Roadmap-for-Companies.pptx
PPT
“AI and Expert System Decision Support & Business Intelligence Systems”
PDF
Build a system with the filesystem maintained by OSTree @ COSCUP 2025
PDF
Building Integrated photovoltaic BIPV_UPV.pdf
PDF
Mobile App Security Testing_ A Comprehensive Guide.pdf
PPTX
Spectroscopy.pptx food analysis technology
PDF
Assigned Numbers - 2025 - Bluetooth® Document
PDF
Agricultural_Statistics_at_a_Glance_2022_0.pdf
PDF
Profit Center Accounting in SAP S/4HANA, S4F28 Col11
PDF
gpt5_lecture_notes_comprehensive_20250812015547.pdf
PDF
Review of recent advances in non-invasive hemoglobin estimation
PDF
MIND Revenue Release Quarter 2 2025 Press Release
PPTX
KOM of Painting work and Equipment Insulation REV00 update 25-dec.pptx
PPTX
VMware vSphere Foundation How to Sell Presentation-Ver1.4-2-14-2024.pptx
PPT
Teaching material agriculture food technology
PDF
Encapsulation theory and applications.pdf
PDF
Encapsulation_ Review paper, used for researhc scholars
Advanced methodologies resolving dimensionality complications for autism neur...
Approach and Philosophy of On baking technology
Dropbox Q2 2025 Financial Results & Investor Presentation
Digital-Transformation-Roadmap-for-Companies.pptx
“AI and Expert System Decision Support & Business Intelligence Systems”
Build a system with the filesystem maintained by OSTree @ COSCUP 2025
Building Integrated photovoltaic BIPV_UPV.pdf
Mobile App Security Testing_ A Comprehensive Guide.pdf
Spectroscopy.pptx food analysis technology
Assigned Numbers - 2025 - Bluetooth® Document
Agricultural_Statistics_at_a_Glance_2022_0.pdf
Profit Center Accounting in SAP S/4HANA, S4F28 Col11
gpt5_lecture_notes_comprehensive_20250812015547.pdf
Review of recent advances in non-invasive hemoglobin estimation
MIND Revenue Release Quarter 2 2025 Press Release
KOM of Painting work and Equipment Insulation REV00 update 25-dec.pptx
VMware vSphere Foundation How to Sell Presentation-Ver1.4-2-14-2024.pptx
Teaching material agriculture food technology
Encapsulation theory and applications.pdf
Encapsulation_ Review paper, used for researhc scholars

Cpp11 multithreading and_simd_linux_code

  • 1. 1C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp /* * multi.cpp * * Created on: 13 Mar 2014 * Author: Russell John Childs. */ //======================================================================================================= // COPYRIGHT NOTICE // This code sample and ideas embodied remain the property of Russell John Childs, PhD and have been // distributed as a representative example of my use of C++11 features. //========================================================================================================== ============ //==================================================== // File contains // (1) Implementation of lock-based thread pool. // (2) Implementation of lock-free "thread pool". // (3) Implementation of SIMD, autovectorisation (N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor) // Implementation of parallel search (using lock-based thread pool, lock-free event-based pool, SIMD parallel for): // 1) Split sorted array into <num_threads> equal chunks // 2) Assign each chunk to a thread. 
// 3) Thread returns true iff chunk.begin() <= search_val <= chunk.end() // 4) Replace array with chunk that returned true and return to step 1 // Complexity: // t - number of threads ( > 1 ) // Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t)) // Parallel = O(log_t(n)) // // Compiling this code sample (Linux Mint - g++ 4.8) // // Compiler options: // g++ -O3 -fopenmp -mavx -m64 -g -Wall -c -fmessage-length=0 -fno-omit-frame-pointer // --fast-math -ftree-vectorizer-verbose=3 -std=c++11 -I/opt/intel/vtune_amplifier_xe_2013/include / // multithreading.cpp // // Linker options: // g++ -fopenmp -L/opt/intel/vtune_amplifier_xe_2013/lib64/ -o "multithreading" $(OBJS) $(USER_OBJS) $ (LIBS) -lpthread -latomic -littnotify -ldl // //============================================================== #include <thread> #include <future> #include <condition_variable> #include <atomic> #include <functional> #include <deque> #include <vector> #include <set> #include <iostream> #include <string> #include <sstream> #include <cmath> #include <algorithm> #include <omp.h> #include <immintrin.h> //#include <cilk/cilk.h> //Uncomment following #define if you do not have Intel VTune Amplifier XE 2013 performance profiler. #define INTEL_NO_ITTNOTIFY_API //include Vtune API header iff INTEL_NO_ITTNOTIFY_API is not #defined #ifndef INTEL_NO_ITTNOTIFY_API #include "ittnotify.h" #endif //Macro to add VTUNE API call iff INTEL_NO_ITTNOTIFY_API is not #defined #ifdef INTEL_NO_ITTNOTIFY_API #define VTUNE(STATEMENT) #else
  • 2. 2C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp #define VTUNE(STATEMENT) STATEMENT #endif //Macro to add VTUNE API call iff INTEL_NO_ITTNOTIFY_API is not #defined #ifdef INTEL_NO_ITTNOTIFY_API #define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) STATEMENTS #else #define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) { auto domain = __itt_domain_create(DOMAIN); __itt_task_begin(domain, __itt_null, __itt_null, __itt_string_handle_create("simd_search()")); STATEMENTS __itt_task_end(domain); } #endif // ================================================================= // Class wrapper for std::packaged_task to make different signatures, e.g. int(void), fload(int,int), ... // storable in STL container for thread pool. // === // N.B. simpler mechanism would be std::vector<std:function<void(void)>>; v[i]= std:packaged_task<Type (Type)>(type), // since packaged_task has void operator()(void). However, there is a problem: std::function // requires command object to be copyable and packaged_task has move-only semantics. //================================================================== //============================ //Primary template //============================ template< typename Out = void, typename In = void > struct MyPackagedTask { virtual ~MyPackagedTask(void) { } }; //============================ //Explicit specialization, acts as base class // MyPackagedTask<>& poly = *new MyPackagedTaks<MyType(OtherType)>; // poly(); --> calls MyPackagedTaks<MyType(OtherType)>::op() //============================ template<> struct MyPackagedTask<> { virtual ~MyPackagedTask(void) { } virtual void operator()(void) { } }; std::mutex last_return_mutex; //============================ //Specialization for function signature // MyPackagedTaks<MyType(OtherType)> //============================ template< typename Out, typename... In > struct MyPackagedTask< Out(In...) > : public MyPackagedTask<> { MyPackagedTask(std::function<Out(In...)> func, In... 
in) : m_task(std::bind(func, in...)) { } virtual ~MyPackagedTask(void) { } MyPackagedTask(MyPackagedTask&& other) : m_task(std::move(other.m_task))
  • 3. 3C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp { } void operator()(void) { m_task(); } std::future<Out> get_future(void) { return m_task.get_future(); } private: std::packaged_task<Out(void)> m_task; }; //====================================================================== // Simple thread pool class // Places tasks onto common queue // Allocates fixed number of threads which pop tasks. // TODO: Load balancing, cache ping-pong (RFOs), VTune optimisation. //==================================================================== class ThreadPool { public: ThreadPool(unsigned max_num_threads = 1U << 31) : m_done(false), //notice to threads to shut down m_print_shutdown_msg(true), //print or not print shutdown msg m_max_num_threads(max_num_threads), //maximum num threads allowed in pool m_num_threads(0), //num threads allocated by the pool m_processing(0), //tasks still running m_cancel(false) { } ~ThreadPool(void) { //Shut down threads iff user has not alread called shutdown() if (!m_done) { shutdown(); } } //================= // Push task onto pool //================ template< typename Out, typename... In > std::future<Out> push(std::function<Out(In...)> func, In... 
in) { //Create task, store future MyPackagedTask<Out(In...)> task(func, in...); std::future<Out> ret_val = task.get_future(); //lock task queue, push the new task onto the queue, notify threads waiting on empty queue, release lock if (m_cancel == false) { { std::unique_ptr<MyPackagedTask<>> ptr(new MyPackagedTask<Out(In...)>(std::move(task))); // Base*=&Derived for poly call std::lock_guard<std::mutex> lock(m_tasks); //lock queue m_pool.push_back(std::move(ptr)); //push task } //release lock m_condition_variable.notify_all(); //notify waiting threads //spawn a thread(async will prevent oversubscription) and store thread future(to check for thread termination at pool shutdown) if ((++m_num_threads <= m_max_num_threads)) { std::unique_lock<std::mutex> lock(m_threads); m_thread_list.push_back(std::async(std::launch::async, &ThreadPool::run_tasks, this)); }
  • 4. 4C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp } //return packaged_task future so that caller can wait for result return ret_val; } //================= // get number of threads allocated //================ unsigned get_num_threads(void) { std::unique_lock<std::mutex> lock(m_threads); return m_thread_list.size(); } //================= // Cancel all tasks but keep threads alivre (for reuse by next set of tasks during iteration). Not yest tested. //================ void cancel_tasks(void) { m_cancel = true; while (m_processing != 0); { std::unique_lock<std::mutex> lock(m_tasks); //lock task queue m_pool.clear(); } m_cancel = false; } //================= // Kill all threads and print out shutdown message (iff msg==true) //================ void shutdown(bool msg = true) { m_print_shutdown_msg = msg; { if (m_print_shutdown_msg) { std::unique_lock<std::mutex> lock(m_shutdown); std::cout << std::endl << "================================================================= " << std::endl << "Shutting down threads: "; } } cancel_tasks(); //Notify all threads of thread pool termination m_done = true; m_condition_variable.notify_all(); //Loop over all threads and wait for them to terminate { std::unique_lock<std::mutex> lock(m_threads); for (auto& elem : m_thread_list) { while (!elem.valid()); elem.get(); } } //Clear thread queue { std::unique_lock<std::mutex> lock(m_threads); m_thread_list.clear(); } //Print out shutdown message if (m_print_shutdown_msg)
  • 5. 5C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp { std::unique_lock<std::mutex> lock(m_shutdown); std::cout << std::endl << "=================================================================" << std::endl; } } private: //================= // Pop and run tasks in threads. //================ void run_tasks(void) { //To avoid branch misprediction, use array to store branch code instead of if-else std::unique_ptr<MyPackagedTask<>> func; std::function<void(void)> branch_true = [&]{ func = std::move(m_pool.front()); m_pool.pop_front(); }; std::function<void(void)> branch_false = [&]{ func = std::unique_ptr<MyPackagedTask<>>(new MyPackagedTask<>); };//NOP std::function<void(void)> switch_func[2]{ branch_false, branch_true}; while (!m_done) { // Only wait if there are still tasks to be processed { bool empty; //Status of task queue std::unique_lock<std::mutex> lock(m_tasks); //lock task queue m_condition_variable.wait_for(lock, std::chrono::nanoseconds(100), [&]{ return !(empty = m_pool.empty()) || m_done; }); //wakeup if queue empty or shutdown switch_func[!empty && !m_done](); //only run non-NOP if queue not empty and not shutdown. 
} ++m_processing; (*func)(); --m_processing; } //Print out shutdown msg if (m_done & m_print_shutdown_msg) { std::unique_lock<std::mutex> lock(m_shutdown); std::cout << std::this_thread::get_id() << " "; } } std::atomic<bool> m_done; std::atomic<bool> m_print_shutdown_msg; std::atomic<unsigned> m_max_num_threads; std::atomic<unsigned> m_num_threads; std::atomic<unsigned> m_processing; std::atomic<bool> m_cancel; std::deque< std::unique_ptr<MyPackagedTask<>> > m_pool; std::vector< std::future<void> > m_thread_list; std::mutex m_threads; std::mutex m_tasks; std::mutex m_shutdown; std::condition_variable m_condition_variable; }; //===================================== // Simple test class // Creates a few tasks, pushes them onto thread pool, gets results //================================================================== struct SimpleTest { SimpleTest(void) try { std::cout << std::endl << "Simple Test......" << std::endl << std::endl; //Create thread pool ThreadPool thread_pool;
  • 6. 6C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp //create a task std::thread::id f1_id; std::function< int(int, int) > f1 = [&](int i, int j) { f1_id = std::this_thread::get_id(); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); return i*j; }; //create another task std::thread::id f2_id; std::function< std::string(void) > f2 = [&](void) { f2_id = std::this_thread::get_id(); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); return std::string("return value of f2"); }; //create another task std::thread::id f3_id; std::string f3_str; std::function< void(void) > f3 = [&](void) { f3_id = std::this_thread::get_id(); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); f3_str = "f3 called"; }; //push tasks auto start = std::chrono::high_resolution_clock::now(); //start timer std::future<int> fut_1(std::move(thread_pool.push(f1, 10, 20))); std::future<std::string> fut_2 = thread_pool.push(f2); int fut_1_res = fut_1.get(); std::string fut_2_res = fut_2.get(); auto end = std::chrono::high_resolution_clock::now(); //stop timer //std::future<void> fut_3 = thread_pool.push(f3); //TODO - fix compilation error. 
// std::cout << typeid(decltype(thread_pool.push(f3))).name() << std::endl; // gives std::future <void> //std::future<void> test_fut; //compiles //std::future<void> test_fut1 = std::move(test_fut); //compiles //thread_pool.push(f3); // doesn't compile // std::function< int(int) > f4 = [&](int i){ return ++i; }; //compiles // thread_pool.push(f4, 2); //compiles // std::function< void(int) > f4 = [&](int i){ ++i; }; //compiles // thread_pool.push(f4, 2); //doesn't compile //print num of threads running, thread id for tasks, result sent back by tasks std::cout << "num threads=" << thread_pool.get_num_threads() << std::endl; std::cout << "f1 thread id=" << f1_id << std::endl; std::cout << "f1's result: " << fut_1_res << std::endl; std::cout << "f2 thread id=" << f2_id << std::endl; std::cout << "f2's result: " << fut_2_res << std::endl; //std::cout << "f3 thread id=" << f3_id << std::endl; //std::cout << "f3's result: " << f3_str << std::endl; std::cout << "thread_pool time = " << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << "ns" << std:: endl; //cleanup threads //thread_pool.shutdown(); test dtor } catch (...) { std::cout << "SimpleTest exception" << std::endl; } }; //============================================================== // Parallel vs binary search test class // t - number of threads ( > 1) // Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t)) // Parallel = O(log_t(n)) // // Binary = std::find (single threaded for comparison).
  • 7. 7C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp // Parallel: (1) Split array into equal chuncks, push them onto thread pool // (2) Chunk containing search-val returns true. N.B. predicate simply returns begin() <= val <= end(). // (3) Chunk returning true replaces array and step(1) repeated. // N.B. Parallel search gets insertion point of nearest match rather than vector.end() if no match // Not optimised for RFOs (cache ping-pong), load balancing, VTune or early match (binary is quicker if early match found). // Benchmarks show high overhead of thread pool. //=============================================================== struct ParallelSearch { //Choose which to run bool is_lock_free = false; //run lock free lambda (if enabled, should set variable "factor", below, to 100 since the search is time consuming) bool is_lock_based = false; //run lock-based lambda (if enabled, should set variable "factor", below, to 100 since the search is time consuming) bool is_simd = true; //run simd-based lambda //Choose number of threads //unsigned num_threads = 2; //To get thread overhead (parallel/binary = 1 for no overhead) //const unsigned num_threads = std::thread::hardware_concurrency()/2; //undersubscription, should run slower than optimal const unsigned num_threads = std::thread::hardware_concurrency(); //Should be optimal choice //const unsigned num_threads = 2*std::thread::hardware_concurrency(); //moderate oversubsrciption, should run slower tha optimal //const unsigned num_threads = 4*std::thread::hardware_concurrency(); //heavy oversubsrciption, should run slower tha optimal //const unsigned num_threads = 128 * std::thread::hardware_concurrency(); //massive oversubsrciption, should run slower tha optimal ParallelSearch(void) try { std::atomic<bool> done(false); //flag used in lcok-free search to notify of completion //Create large, sorted array on heap to avoid seg fault. 
const unsigned size = 2 << 24; std::vector<unsigned> my_array(size); for (auto& elem : my_array) { static unsigned i = 0; elem = 2 * i; //even numbers ++i; } //double-word atomic containing the address of a matching chunk and the new new chunk length (size, size/t, size/t^2 ...) struct DoubleWord { unsigned* m_address; unsigned m_chunk_length; }; std::atomic<DoubleWord> chunk_address_and_length(DoubleWord{ &my_array[0], size / num_threads }); //val seacrched for (TODO: binary search faster than parallel search if binary finds early match. Need to terminate parallel search earlier) bool even = true; unsigned val = my_array[((size >> 1) + ((size >> 1) - 1)*rand() / RAND_MAX)] + (even ? 0 : 1); // even/odd number --> found/not found //Variables for found position, passes taken and whether to printout progress(incurs overhead) unsigned* ret_val = &my_array[0]; int passes = 0; //int required by g++ autovectorize bool printout = false; //SIMD lambda (Proved to be quite difficult getting g++ to autovectorise) //(N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor) // 1. Split array into t chunks // 2. Allocate chunks to t SIMD lanes // 3. Each lane checks chunk.begin() <= search-val <= chunk.end() // 4. The SIMD lane getting a match set array = chunk // 5. Steps 1 to 4 repeated until chunk is 1 element long.
  • 8. 8C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp std::function<bool(void)> simd_search = [&]() { //Alignment (SSE - 16 byte SIMD register, AVX 32-byte SIMD register) const unsigned alignment = 16; //g++ bug with 32-byte ((http://gcc.gnu.org/bugzilla/show_bug.cgi ?id=56787) //Pre-calculate chunk size (size/8, size/64, size/8^3 ... 1 element(s)) alignas(alignment) int chunk_length[9]{size >> 3, size >> 6, size >> 9, size >> 12, size >> 15, size >> 18, size >> 21, size >> 24, 1}; //Pre-calculate lower index for lower <= val << upper. N.B This is converted to lower[n]/8, lower[n]/64 ... alignas(alignment) int lower_index[8]{0, size, 2 * size, 3 * size, 4 * size, 5 * size, 6 * size, 7 * size}; //Pre-caclulate num of SIMD lanes to allocate to for loop to be vectorised alignas(alignment) int limits[9]{8, 8, 8, 8, 8, 8, 8, 8, 2}; //Running tally of start of chunk to be searched alignas(alignment) int offset = 0; alignas(alignment) int tmp_offset = 0; //Loop until chunk length is 1 element for (passes = 0; passes<9; ++passes) { //Following lambda is a test to see if hotspots marked "LINE X" and "LINE Y", below, are due to memory stalls. //It turns out prefetch does eliminate hotspots X, Y, but adds overhead of its own, so this search algorithm is unavoidably //memory-bound unless something along the lines of a heap-ordered array (i.e array is laid out as a breadth-first n-ary tree) is //used to convert random access to linear access without need for scatter-gather. //#pragma omp parallel for //Adds too much overhead [&]() //Sadly, won't vectorise due to function call { unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0], alignment); //Requirement for autovectorise. 
for (int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes to prefetch data they will use { int tmp = pos*chunk_length[passes]; //Get lower index for chunk interval __builtin_prefetch(&my_array[0] + offset + tmp); //See if it removes hotspot from "LINE X", below __builtin_prefetch(&my_array[0] + offset + tmp + chunk_length[passes] - 1); //See if it removes hotspot from "LINE Y", below } }(); //Fork: Assign each chunk to an SIMD lane //N.B. Use lmabda to force vectorisation of loop. Without it, loop is unrolled but SLP not vectorised. This does autovectorise under g++ 4.8 //N.B. Code has been borken down into painfully simple steps to help autovectoriser and pinpoint which operations are causing trouble [&]() { unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0], alignment); //Requirement for autovectorise. alignas(alignment) int chunk = chunk_length[passes]; for (alignas(alignment) int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes { //Find matching chunk by adding 0 to offset for no-match and chunk address for a match alignas(alignment) int tmp = pos*chunk; //Lower index of chunk range without offset //int tmp=lower_index[pos]>>3; //g++ bug (http://gcc.gnu.org/bugzilla/show_bug.cgi? id=56787). Can't use 32-byte AVX. alignas(alignment) int lower_ind = offset + tmp; //Lower index of chunk range alignas(alignment) int upper_ind = lower_ind + chunk - 1; //Upper index of chunk range unsigned lower_val = tmp_ret_val[lower_ind]; //LINE X - Hotspot (eliminated by above
  • 9. 9C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp prefetch) unsigned upper_val = tmp_ret_val[upper_ind]; //LINE Y - Hotspot (eliminated by above prefecth) alignas(alignment) bool test_lower = lower_val <= val; //Lower alignas(alignment) bool test_upper = val <= upper_val; //and upper limit check alignas(alignment) bool test = test_lower && test_upper; // is search-val inside chunk for this SIMD lane? tmp_offset += test*tmp; //Horrible construct to get it to autovec. It masks out SIMD lanes that don't contain search val. //Following fails because it is "not suitable for gather" (whatever that means) //offset += ((tmp_ret_val[offset+tmp] <= val) & (val <= tmp_ret_val[offset+tmp+ chunk_length[passes]-1]))*tmp; //Following fails because of "control flow" (Can't see why g++ doesn't autovec it, control flow can be reaplced with masked op) //if((tmp_ret_val[offset+tmp] <= val) && (val <= tmp_ret_val[offset+tmp+chunk_length [passes]-1])) tmp_offset = tmp; } }(); //Join: end of SIMD //Update chunk start address index offset = tmp_offset; /*std::cout << "offset=" << offset << std::endl; std::cout << "passes=" << passes; std::cout << ", val=" << val; std::cout << ", range=[" << array[offset] << "," << array[offset+1]; std::cout<< ", chunk length=" << chunk_length[passes] << std::endl; */ } //Update final index of search-val ret_val = &my_array[0] + offset; return true; }; //Lock-free lambda for each thead //Operation: //1. The array is split into t (num of threads) chunks //2. Each thread examines its chunk //3. If a match is found in a chunk, the thread changes the array to be that chunk. //4. The process repeated from step 1. //t threads continous monitor the array and process their chunk of the array. Since the array pointer is // atomic, when one thread sees a matching chunk and changes the array to be that chunk, this is picked up // by all threads. No synchronisation is needed. 
// arg chunk_pos - section of chunk to search (0 - [begin, begin+chunk_length/t], 1 - [begin+ chunk_length/t, begin+2*chunk_length/t], ..) std::atomic<unsigned> running_threads(0); std::atomic<bool> go(false); std::function<bool(unsigned)> lock_free = [&](unsigned chunk_pos) { //Increment running thread count ++running_threads; //Keep all threads on hold until signalled to begin together (for timings). while (!go); //Keep searching until a thread notifies completion. while (!done) { //capture chunk address and length DoubleWord capture = chunk_address_and_length; //Check if search-val between chunk.begin() and chunk.end() unsigned *begin = capture.m_address + chunk_pos*capture.m_chunk_length; unsigned *end = begin + capture.m_chunk_length - 1; unsigned test1 = *begin, test2 = *end; if (*begin <= val && val <= *end)
  • 10. 10C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp { //Print out iterations (adds significant overhead) static std::mutex printout_mutex; if (printout) { std::unique_lock<std::mutex> lock(printout_mutex); //Print out iterations (adds significant overhead) std::cout << "Parallel find (pass " << passes << "): Closest match " << *begin << "<=" << val << "<=" << *end << ", chunk length=" << capture.m_chunk_length << std::endl; } //Update parent variables for printouts ret_val = begin; ++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num threads) std::function<void(void)> branch_true = [&]() //IF { //Update chunk length and address capture.m_chunk_length = (capture.m_chunk_length >= num_threads ? (capture. m_chunk_length / num_threads) : 1); //divide chunk evenly capture.m_address = begin; //point to this chunk chunk_address_and_length = capture; }; std::function<void(void)> branch_false = [&]() //ELSE { done = true; //notify parent and sister threads of completion }; std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if} if_else[capture.m_chunk_length > 1](); //if-else } else { std::this_thread::sleep_for(std::chrono::nanoseconds(5000)); } } return true;; }; //Create thread pool for lock-based search static ThreadPool thread_pool(num_threads); //Notification of completion of lock-based search std::condition_variable finished; // lock - based lambda for each thread.It simply tests whether array[pos] <= search_val <= array[pos + chuhnk_length] // and iff true, spawns t threads to narrow down the search, iteratively arriving at the insertion point. std::function<bool(unsigned*, unsigned)> lock_based = [&](unsigned* tmp, unsigned chunk_length) // { //Keep all threads on hold until signalled to begin together (for timings). 
while (!go); //Check if search-val between chunk.begin() and chunk.end() if (*tmp <= val && val <= *(tmp + chunk_length - 1)) { //Print out iterations (adds significant overhead) if (printout) { std::cout << "Parallel find (pass " << passes << "): Closest match " << *tmp << "<=" << val << "<=" << (chunk_length > 1 ? *(tmp + chunk_length - 1) : *tmp == val ? val : *(tmp + 1)) << ", chunk length=" << chunk_length << std::endl; } //Update parent variables for printouts ret_val = tmp; ++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num threads)
  • 11. 11C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp //Spawn new tasks to process this chunk //Following peculiar construct is to avoid branch misprediction by using array of fn ptrs to replace if-else //need VTune to test out whether it saves us any mispredictions. std::function<void(void)> branch_true = [&]() //IF { chunk_length = (chunk_length >= num_threads ? (chunk_length / num_threads) : 1); //divide chunk evenly for (unsigned index = 0; index < num_threads; ++index) { thread_pool.push(lock_based, tmp + index*chunk_length, chunk_length); } }; std::function<void(void)> branch_false = [&]() //ELSE { finished.notify_one(); //chunk length is 1, so we are finished dividing-and- conquering }; std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if} if_else[chunk_length>1](); //if-else } return true; }; std::cout << std::endl << "Parallel vs Binary Search......" << std::endl << std::endl; //Obtain position of element (to verify parallel search finds correct position). auto pos = std::find(my_array.begin(), my_array.end(), val); //Ordinary binary search for timing comparison std::cout << std::endl << "========================================================================= =====" << std::endl; std::cout << "Running binary search, please wait a few minutes ..." 
<< std::endl; std::cout << "==============================================================================" << std ::endl; unsigned factor = 10000; //number of times to run search auto start_binary = std::chrono::high_resolution_clock::now(); //start timer VTUNE(__itt_resume();) for (unsigned i = 0; i < factor; ++i) std::binary_search(my_array.begin(), my_array.end(), val); // binary search VTUNE(__itt_pause();) auto end_binary = std::chrono::high_resolution_clock::now(); //stop timer //print out results of binary search using std::chrono::duration_cast; using std::chrono::nanoseconds; std::cout << "clock resolution is: " << static_cast<double>(std::chrono::high_resolution_clock:: period::num) << " ns" << std::endl; std::cout << "std::find: val=" << val << ", element=" << (pos != my_array.end() ? *pos : -1) << ", index=" << pos - my_array.begin() << ", found==" << std::boolalpha << (pos != my_array.end()) << ", time=" << duration_cast <nanoseconds>(end_binary - start_binary).count() << "ns" << std::endl; //Parallel searches //SIMD search if (is_simd) { std::cout << std::endl << "===================================================================== =========" << std::endl; std::cout << "Running simd parallel search, please wait a few minutes ..." << std::endl; std::cout << "==============================================================================" << std::endl; //Kick off the search auto start = std::chrono::high_resolution_clock::now(); //start timer VTUNE(__itt_resume();) VTUNE_TASK("Parallel Search", "simd_search()", for (unsigned i = 0; i<factor; i++) simd_search(); ) //Wait for result and then get the intertion point and number of paasses
  • 12. 12C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp VTUNE(__itt_pause();) //get execution time auto end = std::chrono::high_resolution_clock::now(); //stop timer auto parallel_time = std::chrono::duration_cast<nanoseconds>(end - start).count(); //print results double complexity = (passes + 1) / (std::log(size) / std::log(2)); auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count(); std::cout << "Simd results:" << std::endl; std::cout << "Size of array=" << size / 1000000 << " million elements" << std::endl; std::cout << "Search repeated " << factor << " times" << std::endl; std::cout << "number of threads=" << running_threads << std::endl; std::cout << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] << "]"; std::cout << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - & my_array[0] << "]" << std::endl; std::cout << "O(n_parallel)/O(n_binary)=" << complexity << std::endl; std::cout << "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time << "ns"; std::cout << " = "; std::cout << parallel_time / binary_time << std::endl; std::cout << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std:: endl; } //Lock-free multithreaded search if (is_lock_free) { std::cout << std::endl << "===================================================================== =========" << std::endl; std::cout << "Running lock-free parallel search, please wait a few minutes ..." 
<< std::endl; std::cout << "==============================================================================" << std::endl; double parallel_time = 0; for (unsigned i = 0; i<factor; ++i) { //reset passes counter, chunk struct, done flag passes = 0; chunk_address_and_length = DoubleWord{ &my_array[0], size / num_threads }; done = false; //Kick off the search //auto start = std::chrono::high_resolution_clock::now(); //start timer std::vector<std::future<bool>> futures; go = false; futures.push_back(std::move(std::async(std::launch::deferred, lock_free, 0))); for (unsigned chunk_pos = 1; chunk_pos < num_threads; ++chunk_pos) { futures.push_back(std::move(std::async(std::launch::async, lock_free, chunk_pos))); } //Wait for result and then get the intertion point and number of paasses auto start = std::chrono::high_resolution_clock::now(); //start timer VTUNE(__itt_resume();) VTUNE_TASK("Parallel Search", "lock_free()", go = true; futures[0].get(); ) VTUNE(__itt_pause();) //get execution time auto end = std::chrono::high_resolution_clock::now(); //stop timer parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count(); } //print results double complexity = (passes + 1) / (std::log(size) / std::log(2)); auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count(); std::cout << "Lock free results:" << std::endl << "Size of array=" << size / 1000000 << " million elements" << std::endl << "Search repeated " << factor << " times" << std::endl
  • 13. 13C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp << "number of threads=" << running_threads << std::endl << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] < < "]" << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - & my_array[0] << "]" << std::endl << "O(n_parallel)/O(n_binary)=" << complexity << std::endl << "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time << "ns" << " = " << parallel_time / binary_time << std::endl << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl; } if (is_lock_based) { std::cout << std::endl << "===================================================================== =========" << std::endl; std::cout << "Running lock-based parallel search, please wait a few minutes ..." << std::endl; std::cout << "==============================================================================" << std::endl; double parallel_time = 0; for (unsigned i = 0; i<factor; ++i) { //reset passes counter, chunk struct, done flag passes = 0; //Kick off the search //auto start = std::chrono::high_resolution_clock::now(); //start timer go = false; auto f = thread_pool.push(lock_based, &my_array[0], size); //Wait for result and then get the intertion point and number of paasses { //wait for completion std::mutex dummy; std::unique_lock<std::mutex> lock(dummy); auto start = std::chrono::high_resolution_clock::now(); //start timer VTUNE(__itt_resume();) VTUNE_TASK("Parallel Search", "lock_based()", go = true; finished.wait(lock); ) VTUNE(__itt_pause();) //get execution time auto end = std::chrono::high_resolution_clock::now(); //stop timer parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count(); thread_pool.cancel_tasks(); std::this_thread::sleep_for(std::chrono::milliseconds(10)); } } //kill thread pool thread_pool.shutdown(false); //print results double 
complexity = (passes + 1) / (std::log(size) / std::log(2)); auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count(); std::cout << "Lock based results:" << std::endl << "Size of array=" << size / 1000000 << " million elements" << std::endl << "Search repeated " << factor << " times" << std::endl << "number of threads=" << running_threads << std::endl << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] < < "]" << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - & my_array[0] << "]" << std::endl << "O(n_parallel)/O(n_binary)=" << complexity << std::endl << "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time << "ns" << " = "
  • 14. 14C:my_docsvirtual_machinesvmware_playershared_foldergeneralmultithreading_and_simd_linux.cpp << parallel_time / binary_time << std::endl << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl; } } catch (...) { std::cout << "ParallelSearch exception" << std::endl; } }; int main(void) { //SimpleTest simple_test; VTUNE(__itt_pause();) ParallelSearch parallel_search; char c; std::cout << "Press any key to exit" << std::endl; std::cin >> c; //keep console alive }