// Cpp11 multithreading and_simd_linux_code
//
// C:\my_docs\virtual_machines\vmware_player\shared_folder\general\multithreading_and_simd_linux.cpp
/*
* multi.cpp
*
* Created on: 13 Mar 2014
* Author: Russell John Childs.
*/
//=======================================================================================================
// COPYRIGHT NOTICE
// This code sample and ideas embodied remain the property of Russell John Childs, PhD and have been
// distributed as a representative example of my use of C++11 features.
//======================================================================================================================
//====================================================
// File contains
// (1) Implementation of lock-based thread pool.
// (2) Implementation of lock-free "thread pool".
// (3) Implementation of SIMD, autovectorisation (N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor)
// Implementation of parallel search (using lock-based thread pool, lock-free event-based pool, SIMD parallel for):
// 1) Split sorted array into <num_threads> equal chunks
// 2) Assign each chunk to a thread.
// 3) Thread returns true iff chunk.begin() <= search_val <= chunk.end()
// 4) Replace array with chunk that returned true and return to step 1
// Complexity:
// t - number of threads ( > 1 )
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
// Compiling this code sample (Linux Mint - g++ 4.8)
//
// Compiler options:
// g++ -O3 -fopenmp -mavx -m64 -g -Wall -c -fmessage-length=0 -fno-omit-frame-pointer
// --fast-math -ftree-vectorizer-verbose=3 -std=c++11 -I/opt/intel/vtune_amplifier_xe_2013/include/
// multithreading.cpp
//
// Linker options:
// g++ -fopenmp -L/opt/intel/vtune_amplifier_xe_2013/lib64/ -o "multithreading" $(OBJS) $(USER_OBJS) $(LIBS) -lpthread -latomic -littnotify -ldl
//
//==============================================================
#include <thread>
#include <future>
#include <condition_variable>
#include <atomic>
#include <functional>
#include <deque>
#include <vector>
#include <set>
#include <iostream>
#include <string>
#include <sstream>
#include <cmath>
#include <algorithm>
#include <omp.h>
#include <immintrin.h>
//#include <cilk/cilk.h>
//Keep the following #define if you do not have the Intel VTune Amplifier XE 2013 performance profiler;
//comment it out to compile the VTune instrumentation in.
#define INTEL_NO_ITTNOTIFY_API
//Include the VTune API header iff INTEL_NO_ITTNOTIFY_API is not #defined
#ifndef INTEL_NO_ITTNOTIFY_API
#include "ittnotify.h"
#endif
//VTUNE(STATEMENT): expands to STATEMENT iff INTEL_NO_ITTNOTIFY_API is not #defined, otherwise to nothing.
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE(STATEMENT)
#else
#define VTUNE(STATEMENT) STATEMENT
#endif
//VTUNE_TASK(DOMAIN, FUNC, STATEMENTS): always executes STATEMENTS. Iff INTEL_NO_ITTNOTIFY_API is not
//#defined, the statements are additionally bracketed by a VTune task named FUNC inside domain DOMAIN.
//N.B. STATEMENTS must not contain a comma outside parentheses (it would be split into macro arguments).
//N.B. In the VTune build STATEMENTS run inside their own { } scope, so do not declare variables in
//     STATEMENTS that are needed after the macro.
#ifdef INTEL_NO_ITTNOTIFY_API
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) STATEMENTS
#else
#define VTUNE_TASK(DOMAIN, FUNC, STATEMENTS) \
{ \
    auto domain = __itt_domain_create(DOMAIN); \
    __itt_task_begin(domain, __itt_null, __itt_null, __itt_string_handle_create(FUNC)); \
    STATEMENTS \
    __itt_task_end(domain); \
}
#endif
// =================================================================
// Class wrapper for std::packaged_task to make different signatures, e.g. int(void), float(int,int), ...
// storable in STL container for thread pool.
// ===
// N.B. simpler mechanism would be std::vector<std::function<void(void)>>; v[i] = std::packaged_task<Type(Type)>(type),
// since packaged_task has void operator()(void). However, there is a problem: std::function
// requires command object to be copyable and packaged_task has move-only semantics.
//==================================================================
//============================
//Primary template
//============================
template< typename Out = void, typename In = void >
struct MyPackagedTask
{
    //Generic fallback. Only the two specializations below are used by the thread pool.
    virtual ~MyPackagedTask(void)
    {
    }
};
//============================
//Explicit specialization, acts as type-erased base class
// MyPackagedTask<>& poly = *new MyPackagedTask<MyType(OtherType)>;
// poly(); --> calls MyPackagedTask<MyType(OtherType)>::op()
//============================
template<>
struct MyPackagedTask<>
{
    //Virtual destructor: derived tasks are destroyed through a pointer to this base.
    virtual ~MyPackagedTask(void)
    {
    }
    //Runs the task. The base implementation is a no-op.
    virtual void operator()(void)
    {
    }
};
std::mutex last_return_mutex;
//============================
//Specialization for function signature
// MyPackagedTask<MyType(OtherType)>
// Binds the arguments at construction so that the stored task can be invoked with no arguments.
// Move-only, because std::packaged_task is move-only.
//============================
template< typename Out, typename... In >
struct MyPackagedTask< Out(In...) > : public MyPackagedTask<>
{
    //func - callable to run, in - arguments copied into the bound task
    MyPackagedTask(std::function<Out(In...)> func, In... in) :
        m_task(std::bind(func, in...))
    {
    }
    ~MyPackagedTask(void) override
    {
    }
    //Transfers the packaged task (and therefore its shared state / future) from other
    MyPackagedTask(MyPackagedTask&& other) noexcept :
        m_task(std::move(other.m_task))
    {
    }
    //Runs the bound call; result or exception is delivered through the future
    void operator()(void) override
    {
        m_task();
    }
    //Returns the future of the wrapped std::packaged_task (may only be retrieved once)
    std::future<Out> get_future(void)
    {
        return m_task.get_future();
    }
private:
    std::packaged_task<Out(void)> m_task;
};
//======================================================================
// Simple thread pool class
// Places tasks onto common queue
// Allocates fixed number of threads which pop tasks.
// TODO: Load balancing, cache ping-pong (RFOs), VTune optimisation.
//====================================================================
class ThreadPool
{
public:
ThreadPool(unsigned max_num_threads = 1U << 31) :
m_done(false), //notice to threads to shut down
m_print_shutdown_msg(true), //print or not print shutdown msg
m_max_num_threads(max_num_threads), //maximum num threads allowed in pool
m_num_threads(0), //num threads allocated by the pool
m_processing(0), //tasks still running
m_cancel(false)
{
}
~ThreadPool(void)
{
//Shut down threads iff user has not alread called shutdown()
if (!m_done)
{
shutdown();
}
}
//=================
// Push task onto pool
//================
template< typename Out, typename... In >
std::future<Out> push(std::function<Out(In...)> func, In... in)
{
//Create task, store future
MyPackagedTask<Out(In...)> task(func, in...);
std::future<Out> ret_val = task.get_future();
//lock task queue, push the new task onto the queue, notify threads waiting on empty queue, release
lock
if (m_cancel == false)
{
{
std::unique_ptr<MyPackagedTask<>> ptr(new MyPackagedTask<Out(In...)>(std::move(task))); //
Base*=&Derived for poly call
std::lock_guard<std::mutex> lock(m_tasks); //lock queue
m_pool.push_back(std::move(ptr)); //push task
} //release lock
m_condition_variable.notify_all(); //notify waiting threads
//spawn a thread(async will prevent oversubscription) and store thread future(to check for
thread termination at pool shutdown)
if ((++m_num_threads <= m_max_num_threads))
{
std::unique_lock<std::mutex> lock(m_threads);
m_thread_list.push_back(std::async(std::launch::async, &ThreadPool::run_tasks, this));
}

}
//return packaged_task future so that caller can wait for result
return ret_val;
}
//=================
// get number of threads allocated
//================
unsigned get_num_threads(void)
{
return m_thread_list.size();
}
//=================
// Cancel all tasks but keep threads alivre (for reuse by next set of tasks during iteration). Not yest
tested.
//================
void cancel_tasks(void)
{
m_cancel = true;
while (m_processing != 0);
{
std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
m_pool.clear();
}
m_cancel = false;
}
//=================
// Kill all threads and print out shutdown message (iff msg==true)
//================
void shutdown(bool msg = true)
{
m_print_shutdown_msg = msg;
{
if (m_print_shutdown_msg)
{
std::unique_lock<std::mutex> lock(m_shutdown);
std::cout << std::endl << "=================================================================
" << std::endl
<< "Shutting down threads: ";
}
}
cancel_tasks();
//Notify all threads of thread pool termination
m_done = true;
m_condition_variable.notify_all();
//Loop over all threads and wait for them to terminate
{
for (auto& elem : m_thread_list)
{
while (!elem.valid());
elem.get();
}
}
//Clear thread queue
{
m_thread_list.clear();
}
//Print out shutdown message
if (m_print_shutdown_msg)

{
std::cout << std::endl << "=================================================================" <<
std::endl;
}
}
private:
//=================
// Pop and run tasks in threads.
//================
void run_tasks(void)
{
//To avoid branch misprediction, use array to store branch code instead of if-else
std::unique_ptr<MyPackagedTask<>> func;
std::function<void(void)> branch_true = [&]{ func = std::move(m_pool.front()); m_pool.pop_front();
};
std::function<void(void)> branch_false = [&]{ func = std::unique_ptr<MyPackagedTask<>>(new
MyPackagedTask<>); };//NOP
std::function<void(void)> switch_func[2]{ branch_false, branch_true};
while (!m_done)
{
// Only wait if there are still tasks to be processed
{
bool empty; //Status of task queue
std::unique_lock<std::mutex> lock(m_tasks); //lock task queue
m_condition_variable.wait_for(lock, std::chrono::nanoseconds(100), [&]{ return !(empty =
m_pool.empty()) || m_done; }); //wakeup if queue empty or shutdown
switch_func[!empty && !m_done](); //only run non-NOP if queue not empty and not shutdown.
}
++m_processing;
(*func)();
--m_processing;
}
//Print out shutdown msg
if (m_done & m_print_shutdown_msg)
{
std::cout << std::this_thread::get_id() << " ";
}
}
std::atomic<bool> m_done;
std::atomic<bool> m_print_shutdown_msg;
std::atomic<unsigned> m_max_num_threads;
std::atomic<unsigned> m_num_threads;
std::atomic<unsigned> m_processing;
std::atomic<bool> m_cancel;
std::deque< std::unique_ptr<MyPackagedTask<>> > m_pool;
std::vector< std::future<void> > m_thread_list;
std::mutex m_threads;
std::mutex m_tasks;
std::mutex m_shutdown;
std::condition_variable m_condition_variable;
};
//=====================================
// Simple test class
// Creates a few tasks, pushes them onto thread pool, gets results
//==================================================================
struct SimpleTest
{
    //Runs the whole test from the constructor: pushes two tasks onto a ThreadPool, waits for both
    //results and prints them together with the elapsed time.
    //N.B. The handler of a constructor function-try-block rethrows when it ends, so the caller
    //still sees the exception after "SimpleTest exception" has been printed.
    SimpleTest(void) try
    {
        std::cout << std::endl << "Simple Test......" << std::endl << std::endl;
        //create a task
        std::thread::id f1_id;
        std::function< int(int, int) > f1 = [&](int i, int j)
        {
            f1_id = std::this_thread::get_id();
            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
            return i*j;
        };
        //create another task
        std::thread::id f2_id;
        std::function< std::string(void) > f2 = [&](void)
        {
            f2_id = std::this_thread::get_id();
            return std::string("return value of f2");
        };
        //create another task
        std::string f3_str;
        std::function< void(void) > f3 = [&](void)
        {
            f3_str = "f3 called";
        };
        //Create thread pool. Declared after the variables captured by the tasks, so that the pool
        //(and its worker threads) is destroyed before them.
        ThreadPool thread_pool;
        //push tasks
        auto start = std::chrono::high_resolution_clock::now(); //start timer
        std::future<int> fut_1 = thread_pool.push(f1, 10, 20);
        std::future<std::string> fut_2 = thread_pool.push(f2);
        int fut_1_res = fut_1.get();
        std::string fut_2_res = fut_2.get();
        auto end = std::chrono::high_resolution_clock::now(); //stop timer
        //std::future<void> fut_3 = thread_pool.push(f3); //TODO - fix compilation error.
        // std::cout << typeid(decltype(thread_pool.push(f3))).name() << std::endl; // gives std::future<void>
        //std::future<void> test_fut; //compiles
        //std::future<void> test_fut1 = std::move(test_fut); //compiles
        //thread_pool.push(f3); // doesn't compile
        // std::function< int(int) > f4 = [&](int i){ return ++i; }; //compiles
        // thread_pool.push(f4, 2); //compiles
        // std::function< void(int) > f4 = [&](int i){ ++i; }; //compiles
        // thread_pool.push(f4, 2); //doesn't compile
        //print num of threads running, thread id for tasks, result sent back by tasks
        std::cout << "num threads=" << thread_pool.get_num_threads() << std::endl;
        std::cout << "f1 thread id=" << f1_id << std::endl;
        std::cout << "f1's result: " << fut_1_res << std::endl;
        std::cout << "f2 thread id=" << f2_id << std::endl;
        std::cout << "f2's result: " << fut_2_res << std::endl;
        //std::cout << "f3 thread id=" << f3_id << std::endl;
        //std::cout << "f3's result: " << f3_str << std::endl;
        std::cout << "thread_pool time = "
            << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() << "ns" << std::endl;
        //cleanup threads
        //thread_pool.shutdown(); test dtor
    }
    catch (...)
    {
        std::cout << "SimpleTest exception" << std::endl;
    }
};
//==============================================================
// Parallel vs binary search test class
// t - number of threads ( > 1)
// Binary = O(log_2(n/t)) = O(log_2(n)-log_2(t))
// Parallel = O(log_t(n))
//
// Binary = std::binary_search (single threaded for comparison); std::find is used only to verify the position.

// Parallel: (1) Split array into equal chunks, push them onto thread pool
// (2) Chunk containing search-val returns true. N.B. predicate simply returns begin() <= val <= end().
// (3) Chunk returning true replaces array and step(1) repeated.
// N.B. Parallel search gets insertion point of nearest match rather than vector.end() if no match
// Not optimised for RFOs (cache ping-pong), load balancing, VTune or early match (binary is quicker if early match found).
// Benchmarks show high overhead of thread pool.
//===============================================================
struct ParallelSearch
{
//Choose which to run
bool is_lock_free = false; //run lock free lambda (if enabled, should set variable "factor", below, to
100 since the search is time consuming)
bool is_lock_based = false; //run lock-based lambda (if enabled, should set variable "factor", below,
to 100 since the search is time consuming)
bool is_simd = true; //run simd-based lambda
//Choose number of threads
//unsigned num_threads = 2; //To get thread overhead (parallel/binary = 1 for no overhead)
//const unsigned num_threads = std::thread::hardware_concurrency()/2; //undersubscription,
should run slower than optimal
const unsigned num_threads = std::thread::hardware_concurrency(); //Should be optimal
choice
//const unsigned num_threads = 2*std::thread::hardware_concurrency(); //moderate
oversubsrciption, should run slower tha optimal
//const unsigned num_threads = 4*std::thread::hardware_concurrency(); //heavy
//const unsigned num_threads = 128 * std::thread::hardware_concurrency(); //massive
ParallelSearch(void) try
{
std::atomic<bool> done(false); //flag used in lcok-free search to notify of completion
//Create large, sorted array on heap to avoid seg fault.
const unsigned size = 2 << 24;
std::vector<unsigned> my_array(size);
for (auto& elem : my_array)
{
static unsigned i = 0;
elem = 2 * i; //even numbers
++i;
}
//double-word atomic containing the address of a matching chunk and the new new chunk length (size,
size/t, size/t^2 ...)
struct DoubleWord
{
unsigned* m_address;
unsigned m_chunk_length;
};
std::atomic<DoubleWord> chunk_address_and_length(DoubleWord{ &my_array[0], size / num_threads });
//val seacrched for (TODO: binary search faster than parallel search if binary finds early match.
Need to terminate parallel search earlier)
bool even = true;
unsigned val = my_array[((size >> 1) + ((size >> 1) - 1)*rand() / RAND_MAX)] + (even ? 0 : 1); //
even/odd number --> found/not found
//Variables for found position, passes taken and whether to printout progress(incurs overhead)
unsigned* ret_val = &my_array[0];
int passes = 0; //int required by g++ autovectorize
bool printout = false;
//SIMD lambda (Proved to be quite difficult getting g++ to autovectorise)
//(N.B. This was compiled for Intel i7-3720QM Ivy Bridge processor)
// 1. Split array into t chunks
// 2. Allocate chunks to t SIMD lanes
// 3. Each lane checks chunk.begin() <= search-val <= chunk.end()
// 4. The SIMD lane getting a match set array = chunk
// 5. Steps 1 to 4 repeated until chunk is 1 element long.

std::function<bool(void)> simd_search = [&]()
{
//Alignment (SSE - 16 byte SIMD register, AVX 32-byte SIMD register)
const unsigned alignment = 16; //g++ bug with 32-byte ((http://gcc.gnu.org/bugzilla/show_bug.cgi
?id=56787)
//Pre-calculate chunk size (size/8, size/64, size/8^3 ... 1 element(s))
alignas(alignment) int chunk_length[9]{size >> 3, size >> 6, size >> 9, size >> 12, size >> 15,
size >> 18, size >> 21, size >> 24, 1};
//Pre-calculate lower index for lower <= val << upper. N.B This is converted to lower[n]/8,
lower[n]/64 ...
alignas(alignment) int lower_index[8]{0, size, 2 * size, 3 * size, 4 * size, 5 * size, 6 * size,
7 * size};
//Pre-caclulate num of SIMD lanes to allocate to for loop to be vectorised
alignas(alignment) int limits[9]{8, 8, 8, 8, 8, 8, 8, 8, 2};
//Running tally of start of chunk to be searched
alignas(alignment) int offset = 0;
alignas(alignment) int tmp_offset = 0;
//Loop until chunk length is 1 element
for (passes = 0; passes<9; ++passes)
{
//Following lambda is a test to see if hotspots marked "LINE X" and "LINE Y", below, are
due to memory stalls.
//It turns out prefetch does eliminate hotspots X, Y, but adds overhead of its own, so this
search algorithm is unavoidably
//memory-bound unless something along the lines of a heap-ordered array (i.e array is laid
out as a breadth-first n-ary tree) is
//used to convert random access to linear access without need for scatter-gather.
//#pragma omp parallel for //Adds too much overhead
[&]() //Sadly, won't vectorise due to function call
{
unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0],
alignment); //Requirement for autovectorise.
for (int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes to prefetch data
they will use
{
int tmp = pos*chunk_length[passes]; //Get lower index
for chunk interval
__builtin_prefetch(&my_array[0] + offset + tmp); //See if it
removes hotspot from "LINE X", below
__builtin_prefetch(&my_array[0] + offset + tmp + chunk_length[passes] - 1); //See if
it removes hotspot from "LINE Y", below
}
}();
//Fork: Assign each chunk to an SIMD lane
//N.B. Use lmabda to force vectorisation of loop. Without it, loop is unrolled but SLP not
vectorised. This does autovectorise under g++ 4.8
//N.B. Code has been borken down into painfully simple steps to help autovectoriser and
pinpoint which operations are causing trouble
[&]()
{
unsigned *__restrict__ tmp_ret_val = (unsigned*)__builtin_assume_aligned(&my_array[0],
alignment); //Requirement for autovectorise.
alignas(alignment) int chunk = chunk_length[passes];
for (alignas(alignment) int pos = 0; pos<limits[passes]; ++pos) //Loop over SIMD lanes
{
//Find matching chunk by adding 0 to offset for no-match and chunk address for a
match
alignas(alignment) int tmp = pos*chunk; //Lower index of chunk range without offset
//int tmp=lower_index[pos]>>3; //g++ bug (http://gcc.gnu.org/bugzilla/show_bug.cgi?
id=56787). Can't use 32-byte AVX.
alignas(alignment) int lower_ind = offset + tmp; //Lower index of chunk range
alignas(alignment) int upper_ind = lower_ind + chunk - 1; //Upper index of chunk
range
unsigned lower_val = tmp_ret_val[lower_ind]; //LINE X - Hotspot (eliminated by above

prefetch)
unsigned upper_val = tmp_ret_val[upper_ind]; //LINE Y - Hotspot (eliminated by above
prefecth)
alignas(alignment) bool test_lower = lower_val <= val; //Lower
alignas(alignment) bool test_upper = val <= upper_val; //and upper limit check
alignas(alignment) bool test = test_lower && test_upper; // is search-val inside
chunk for this SIMD lane?
tmp_offset += test*tmp; //Horrible construct to get it to autovec.
It masks out SIMD lanes that don't contain search val.
//Following fails because it is "not suitable for gather" (whatever that means)
//offset += ((tmp_ret_val[offset+tmp] <= val) & (val <= tmp_ret_val[offset+tmp+
chunk_length[passes]-1]))*tmp;
//Following fails because of "control flow" (Can't see why g++ doesn't autovec it,
control flow can be reaplced with masked op)
//if((tmp_ret_val[offset+tmp] <= val) && (val <= tmp_ret_val[offset+tmp+chunk_length
[passes]-1])) tmp_offset = tmp;
}
}();
//Join: end of SIMD
//Update chunk start address index
offset = tmp_offset;
/*std::cout << "offset=" << offset << std::endl;
std::cout << "passes=" << passes;
std::cout << ", val=" << val;
std::cout << ", range=[" << array[offset] << "," << array[offset+1];
std::cout<< ", chunk length=" << chunk_length[passes] << std::endl;
*/
}
//Update final index of search-val
ret_val = &my_array[0] + offset;
return true;
};
//Lock-free lambda for each thead
//Operation:
//1. The array is split into t (num of threads) chunks
//2. Each thread examines its chunk
//3. If a match is found in a chunk, the thread changes the array to be that chunk.
//4. The process repeated from step 1.
//t threads continous monitor the array and process their chunk of the array. Since the array
pointer is
// atomic, when one thread sees a matching chunk and changes the array to be that chunk, this is
picked up
// by all threads. No synchronisation is needed.
// arg chunk_pos - section of chunk to search (0 - [begin, begin+chunk_length/t], 1 - [begin+
chunk_length/t, begin+2*chunk_length/t], ..)
std::atomic<unsigned> running_threads(0);
std::atomic<bool> go(false);
std::function<bool(unsigned)> lock_free = [&](unsigned chunk_pos)
{
//Increment running thread count
++running_threads;
//Keep all threads on hold until signalled to begin together (for timings).
while (!go);
//Keep searching until a thread notifies completion.
while (!done)
{
//capture chunk address and length
DoubleWord capture = chunk_address_and_length;
//Check if search-val between chunk.begin() and chunk.end()
unsigned *begin = capture.m_address + chunk_pos*capture.m_chunk_length;
unsigned *end = begin + capture.m_chunk_length - 1;
unsigned test1 = *begin, test2 = *end;
if (*begin <= val && val <= *end)

{
//Print out iterations (adds significant overhead)
static std::mutex printout_mutex;
if (printout)
{
std::unique_lock<std::mutex> lock(printout_mutex);
std::cout << "Parallel find (pass " << passes << "): Closest match "
<< *begin << "<=" << val << "<=" << *end
<< ", chunk length=" << capture.m_chunk_length << std::endl;
}
//Update parent variables for printouts
ret_val = begin;
++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num
threads)
std::function<void(void)> branch_true = [&]() //IF
{
//Update chunk length and address
capture.m_chunk_length = (capture.m_chunk_length >= num_threads ? (capture.
m_chunk_length / num_threads) : 1); //divide chunk evenly
capture.m_address = begin; //point to this chunk
chunk_address_and_length = capture;
};
std::function<void(void)> branch_false = [&]() //ELSE
{
done = true; //notify parent and sister threads of completion
};
std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if}
if_else[capture.m_chunk_length > 1](); //if-else
}
else
{
std::this_thread::sleep_for(std::chrono::nanoseconds(5000));
}
}
return true;;
};
//Create thread pool for lock-based search
static ThreadPool thread_pool(num_threads);
//Notification of completion of lock-based search
std::condition_variable finished;
// lock - based lambda for each thread.It simply tests whether array[pos] <= search_val <= array[pos
+ chuhnk_length]
// and iff true, spawns t threads to narrow down the search, iteratively arriving at the insertion
point.
std::function<bool(unsigned*, unsigned)> lock_based = [&](unsigned* tmp, unsigned chunk_length) //
{
//Keep all threads on hold until signalled to begin together (for timings).
while (!go);
//Check if search-val between chunk.begin() and chunk.end()
if (*tmp <= val && val <= *(tmp + chunk_length - 1))
{
if (printout)
{
std::cout << "Parallel find (pass " << passes << "): Closest match " << *tmp << "<=" <<
val
<< "<=" << (chunk_length > 1 ? *(tmp + chunk_length - 1) : *tmp == val ? val : *(tmp
+ 1))
<< ", chunk length=" << chunk_length << std::endl;
}
//Update parent variables for printouts
ret_val = tmp;
++passes; //keep count of passes (0 - size, 1 - size/t, 2 - size/t^2 ... with t - num
threads)

//Spawn new tasks to process this chunk
//Following peculiar construct is to avoid branch misprediction by using array of fn ptrs to
replace if-else
//need VTune to test out whether it saves us any mispredictions.
std::function<void(void)> branch_true = [&]() //IF
{
chunk_length = (chunk_length >= num_threads ? (chunk_length / num_threads) : 1);
//divide chunk evenly
for (unsigned index = 0; index < num_threads; ++index)
{
thread_pool.push(lock_based, tmp + index*chunk_length, chunk_length);
}
};
std::function<void(void)> branch_false = [&]() //ELSE
{
finished.notify_one(); //chunk length is 1, so we are finished dividing-and-
conquering
};
std::function<void(void)> if_else[2]{branch_false, branch_true}; //{else, if}
if_else[chunk_length>1](); //if-else
}
return true;
};
std::cout << std::endl << "Parallel vs Binary Search......" << std::endl << std::endl;
//Obtain position of element (to verify parallel search finds correct position).
auto pos = std::find(my_array.begin(), my_array.end(), val);
//Ordinary binary search for timing comparison
std::cout << std::endl << "=========================================================================
=====" << std::endl;
std::cout << "Running binary search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" << std
::endl;
unsigned factor = 10000; //number of times to run search
auto start_binary = std::chrono::high_resolution_clock::now(); //start timer
VTUNE(__itt_resume();)
for (unsigned i = 0; i < factor; ++i) std::binary_search(my_array.begin(), my_array.end(), val); //
binary search
VTUNE(__itt_pause();)
auto end_binary = std::chrono::high_resolution_clock::now(); //stop timer
//print out results of binary search
using std::chrono::duration_cast;
using std::chrono::nanoseconds;
std::cout << "clock resolution is: " << static_cast<double>(std::chrono::high_resolution_clock::
period::num) << " ns" << std::endl;
std::cout << "std::find: val=" << val << ", element=" << (pos != my_array.end() ? *pos : -1) << ",
index=" << pos - my_array.begin()
<< ", found==" << std::boolalpha << (pos != my_array.end()) << ", time=" << duration_cast
<nanoseconds>(end_binary - start_binary).count() << "ns" << std::endl;
//Parallel searches
//SIMD search
if (is_simd)
{
std::cout << std::endl << "=====================================================================
=========" << std::endl;
std::cout << "Running simd parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" <<
std::endl;
//Kick off the search
VTUNE_TASK("Parallel Search", "simd_search()",
for (unsigned i = 0; i<factor; i++) simd_search();
)
//Wait for result and then get the intertion point and number of paasses

//get execution time
auto parallel_time = std::chrono::duration_cast<nanoseconds>(end - start).count();
//print results
double complexity = (passes + 1) / (std::log(size) / std::log(2));
auto binary_time = duration_cast<nanoseconds>(end_binary - start_binary).count();
std::cout << "Simd results:" << std::endl;
std::cout << "Size of array=" << size / 1000000 << " million elements" << std::endl;
std::cout << "Search repeated " << factor << " times" << std::endl;
std::cout << "number of threads=" << running_threads << std::endl;
std::cout << "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," <<
ret_val[1] << "]";
std::cout << ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl;
std::cout << "O(n_parallel)/O(n_binary)=" << complexity << std::endl;
std::cout << "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time
<< "ns";
std::cout << " = ";
std::cout << parallel_time / binary_time << std::endl;
std::cout << "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::
endl;
}
//Lock-free multithreaded search
if (is_lock_free)
{
std::cout << std::endl << "=====================================================================
=========" << std::endl;
std::cout << "Running lock-free parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" <<
std::endl;
double parallel_time = 0;
for (unsigned i = 0; i<factor; ++i)
{
//reset passes counter, chunk struct, done flag
passes = 0;
chunk_address_and_length = DoubleWord{ &my_array[0], size / num_threads };
done = false;
//auto start = std::chrono::high_resolution_clock::now(); //start timer
std::vector<std::future<bool>> futures;
go = false;
futures.push_back(std::move(std::async(std::launch::deferred, lock_free, 0)));
for (unsigned chunk_pos = 1; chunk_pos < num_threads; ++chunk_pos)
{
futures.push_back(std::move(std::async(std::launch::async, lock_free, chunk_pos)));
}
VTUNE_TASK("Parallel Search", "lock_free()",
go = true; futures[0].get();
)
parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
}
//print results
std::cout << "Lock free results:" << std::endl
<< "Size of array=" << size / 1000000 << " million elements" << std::endl
<< "Search repeated " << factor << " times" << std::endl

<< "number of threads=" << running_threads << std::endl
<< "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] <
< "]"
<< ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl
<< "O(n_parallel)/O(n_binary)=" << complexity << std::endl
<< "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time <<
"ns"
<< " = "
<< parallel_time / binary_time << std::endl
<< "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
}
if (is_lock_based)
{
std::cout << std::endl << "=====================================================================
=========" << std::endl;
std::cout << "Running lock-based parallel search, please wait a few minutes ..." << std::endl;
std::cout << "==============================================================================" <<
std::endl;
double parallel_time = 0;
for (unsigned i = 0; i<factor; ++i)
{
//reset passes counter, chunk struct, done flag
passes = 0;
//auto start = std::chrono::high_resolution_clock::now(); //start timer
go = false;
auto f = thread_pool.push(lock_based, &my_array[0], size);
{
//wait for completion
std::mutex dummy;
std::unique_lock<std::mutex> lock(dummy);
VTUNE_TASK("Parallel Search", "lock_based()",
go = true;
finished.wait(lock);
)
parallel_time += std::chrono::duration_cast<nanoseconds>(end - start).count();
thread_pool.cancel_tasks();
}
}
//kill thread pool
thread_pool.shutdown(false);
//print results
std::cout << "Lock based results:" << std::endl
<< "Size of array=" << size / 1000000 << " million elements" << std::endl
<< "Search repeated " << factor << " times" << std::endl
<< "number of threads=" << running_threads << std::endl
<< "val=" << val << ", [element_lower,element_upper]=[" << ret_val[0] << "," << ret_val[1] <
< "]"
<< ", [index_lower,index_upper]=[" << ret_val - &my_array[0] << "," << ret_val + 1 - &
my_array[0] << "]" << std::endl
<< "O(n_parallel)/O(n_binary)=" << complexity << std::endl
<< "Time(parallel) / Time(binary) ) = " << parallel_time << "ns" << "/" << binary_time <<
"ns"
<< " = "

<< parallel_time / binary_time << std::endl
<< "threading overhead =" << parallel_time - complexity*binary_time << "ns" << std::endl;
}
}
catch (...)
{
std::cout << "ParallelSearch exception" << std::endl;
}
};
//Entry point: runs the parallel search benchmark, then keeps the console open until the user responds.
//Returns 0 on success, 1 if the benchmark threw.
int main(void)
{
    try
    {
        //SimpleTest simple_test;
        ParallelSearch parallel_search;
    }
    catch (...)
    {
        //The constructors' function-try-blocks print their own message and rethrow;
        //catch here so that the program exits cleanly instead of via std::terminate.
        std::cout << "Unhandled exception, exiting" << std::endl;
        return 1;
    }
    std::cout << "Press any key to exit" << std::endl;
    std::cin.get(); //keep console alive; unlike operator>> this also accepts a bare Enter
    return 0;
}

// End of listing: Cpp11 multithreading and_simd_linux_code (trailing web-page navigation text removed).