SlideShare a Scribd company logo
GPU
Programming
on CPUs
Using C++AMP
Miller Lee
Outline
1. Introduction to C++AMP
2. Introduction to Tiling
3. tile_static
4. barrier.wait and solutions
a. C++11 thread
b. setjmp/longjmp
c. ucontext
2
(Homogeneous coordinates)
(0, 0) (0, 1) (0, 2) (0, 3)
(1, 0) (1, 1) (1, 2) (1, 3)
(2, 0) (2, 1) (2, 2) (2, 3)
(3, 0) (3, 1) (3, 2) (3, 3)
X
0
1
2
3
Matrix A b
=
0
1
2
3
result
Computing example
● Simple matrix multiplication
3
C++ Version
1. int A[4][4];
2. int b[4];
3. int result[4];
4. for (int i = 0; i < 4; i++) {
5. result[i] = 0;
6. for (int j = 0; j < 4; j++)
7. result[i] += A[i][j] * b[j];
8. } 4
C++AMP Version
1. array_view<float, 2> A(4, 4);
2. array_view<float, 1> b(4);
3. array_view<float, 1> result(4);
4. extent<1> ext(4);
5. parallel_for_each(ext, [&](index<1> idx) restrict(amp)
6. {
7. result[idx[0]] = 0;
8. for (int i = 0; i < 4; i++)
9. result[idx[0]] += A(idx[0], i) * b(i);
10. });
5
memory access
0 1 2 3
P0 P1 P2 P3
global memory
b
100t
Total access time = 400t 6
shared memory
0 1 2 3
shared memory
10t
100t
Total access time = 130t
b
7
1. array_view<float, 2> A(4, 4);
2. array_view<float, 1> b(4);
3. array_view<float, 1> result(4);
4. extent<1> ext(4);
5. parallel_for_each(ext.tile<4>(), [&](tiled_index<4> tidx)
restrict(amp)
6. {
7. int local = tidx.local[0];
8. int global = tidx.global[0];
9. tile_static int buf[4];
10. buf[local] = b[global];
11. tidx.barrier.wait();
12. result[global] = 0;
13. for (int i = 0; i < 4; i++)
14. result[global] += A[global][i] * buf[i];
15. }); 8
barrier
9
Architecture
source: NVIDIA TESLA: A UNIFIED GRAPHICS AND COMPUTING ARCHITECTURE
shared memory
accessible to all SPs
10
Goal
● Implement all the C++AMP functions on the CPU
instead of the GPU without any compiler
modification.
11
tile_static
● The limitation of C++ syntax leads to the
following choices
○ const, volatile
○ __attribute__(...)
○ static
● Choose static
○ static memory can be shared among all the threads
○ side effect: At most one thread group can be
executed at the same time.
#define tile_static static
12
Barrier.wait
● Threads in the same thread group will
wait at the point where “wait” is called.
● Program can
a. perform real barrier action
b. jump out of current execution context
13
● True threading
○ C++11 thread
● Fake threading(Coroutines)
○ setjmp/longjmp
○ makecontext/getcontext/swapcontext/setcontext
Approaches
14
C++11 thread
● launch hundreds of threads at a time.
● implement my own barrier by using the C++11
mutex library.
→ extremely slow.
→ The data on static memory will be corrupted
15
setjmp/longjmp
● int setjmp(jmp_buf env)
○ setjmp() saves the stack context/environment in env
for later use by longjmp.
○ The stack context will be invalidated if the function
which called setjmp() returns.
● void longjmp(jmp_buf env, int val);
○ longjmp() restores the environment saved by the last
call of setjmp.
16
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf;
4. void wait(void) {
5. printf("wait\n"); // prints
6. longjmp(buf,1);
7. }
8. void first(void) {
9. wait();
10. printf("first\n"); // does not print
11. }
12. int main() {
13. if (!setjmp(buf))
14. first(); // when executed, setjmp returns 0
15. else // when longjmp jumps back, setjmp returns 1
16. printf("main\n"); // prints
17. return 0;
18. }
17
Pseudo code (1)
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
18
Pseudo code (2)
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
void entry()
{
while(!finish)
for(t : tasks)
run(t)
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
19
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. } 20
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
buf
21
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
ret address
buf
b
22
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
buf
b
23
1. #include <stdio.h>
2. #include <setjmp.h>
3. jmp_buf buf, b;
4. void wait(void) {
5. printf("wait\n");
6. if (setjmp(b) == 0)
7. longjmp(buf,1);
8. }
9. void first(void) {
10. wait();
11. }
12. int main() {
13. if (!setjmp(buf) )
14. first();
15. else {
16. printf("main\n");
17. longjmp(b, 10);
18. }
19. return 0;
20. }
Cannot return
???
???
???
buf
b
24
Problems
● Cannot return
○ return address in the stack is destroyed
● Cannot use too many static variables
○ will lose spilled registers
→ can be solved by using “alloca”
http://www.codemud.net/~thinker/GinGin_CGI.py/show_id_doc/489
25
ucontext.h
● ucontext_t
● getcontext
● makecontext
● swapcontext
● setcontext
26
ucontext_t
typedef struct ucontext {
struct ucontext *uc_link;
sigset_t uc_sigmask;
stack_t uc_stack;
mcontext_t uc_mcontext;
...
} ucontext_t;
● uc_link
○ points to the context that will be resumed when the current context
terminates
● uc_stack
○ the stack used by this context
● uc_mcontext
○ machine-specific representation of the saved context, that includes the
calling thread's machine registers
27
Functions
● int getcontext(ucontext_t *ucp);
○ initializes the structure pointed at by ucp.
● int setcontext(const ucontext_t *ucp);
○ restores the user context pointed at by ucp
● int swapcontext(ucontext_t *oucp, const
ucontext_t *ucp);
○ saves the current context in the structure pointed to
by oucp, and then activates the context pointed to by
ucp.
28
makecontext
● void makecontext(ucontext_t *ucp, void
(*func)(), int argc, ...);
○ glibc(x86_64) saves the arguments to registers
instead of pushing them on stack as AMD64 ABI
said
○ The size of the arguments that passed to
makecontext should be no less than sizeof(register)
29
1. #include <stdio.h>
2. #include <ucontext.h>
3. static ucontext_t ctx[2];
4. static void f1 (void) {
5. puts("start f1");
6. swapcontext(&ctx[1], &ctx[0]);
7. puts("finish f1");
8. }
9. int main (void)
10. {
11. char st1[8192];
12. getcontext(&ctx[1]);
13. ctx[1].uc_stack.ss_sp = st1;
14. ctx[1].uc_stack.ss_size = sizeof st1;
15. ctx[1].uc_link = &ctx[0];
16. makecontext(&ctx[1], f1, 0);
17. swapcontext(&ctx[0], &ctx[1]);
18. swapcontext(&ctx[0], &ctx[1]);
19. return 0;
20. } 30
1. #include <stdio.h>
2. #include <ucontext.h>
3. static ucontext_t ctx[3];
4. static void f1 (void) {
5. puts("start f1");
6. swapcontext(&ctx[1], &ctx
[0]);
7. puts("finish f1");
8. }
9. static void f2 (void)
10. {
11. puts("start f2");
12. swapcontext(&ctx[2], &ctx
[1]);
13. puts("finish f2");
14. }
1. int main (void)
2. {
3. char st1[8192], st2[8192];
4. getcontext(&ctx[1]);
5. ctx[1].uc_stack.ss_sp = st1;
6. ctx[1].uc_stack.ss_size = sizeof
st1;
7. ctx[1].uc_link = &ctx[0];
8. makecontext(&ctx[1], f1, 0);
9.
10. getcontext(&ctx[2]);
11. ctx[2].uc_stack.ss_sp = st2;
12. ctx[2].uc_stack.ss_size = sizeof
st2;
13. ctx[2].uc_link = &ctx[1];
14. makecontext(&ctx[2], f2, 0);
15. swapcontext(&ctx[0], &ctx[2]);
16. swapcontext(&ctx[0], &ctx[2]);
17. return 0;
18. }
31
Fake threading (yield)
void entry()
{
setup(fun, 2);
while(!finish)
switch_to();
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
32
void entry()
{
setup(fun, 2);
while(!finish)
switch_to();
}
void fun()
{
…
wait();
...
}
void fun()
{
…
wait();
...
}
Problems
1. How to pass a lambda?
○ makecontext(&ctx,
(void (*)(void))&Kernel::operator(), …);
2. How to pass non-int arguments?
○ What if sizeof(Type) > sizeof(int)
○ How about complex structure and class
33
Pass lambda
1. Use a wrapper function!!
template <typename Ker, typename Arg>
void fun(Ker k, Arg arg)
{
k(arg);
}
template <typename Ker, typename Arg>
void makectx(Ker k, Arg arg)
{
makecontext(&ctx, (void (*)(void))fun<Ker, Arg>, 2, k, arg);
}
34
Pass non-int arguments
2. Pass pointer instead!!
template <typename Ker, typename Arg>
void fun(Ker *k, Arg *arg)
{
(*k)(*arg);
}
template <typename Ker, typename Arg>
void makectx(Ker k, Arg arg)
{
makecontext(&ctx, (void (*)(void))fun<Ker, Arg>, 2, &k, &arg);
}
35
Additional
● Use a counter so that we can spawn
coroutines dynamically
● Can it be multithreaded? Yes
36
true threading
barrier
There are 12 threads in one thread group
37
one thread
barrier
38
multithreading
barrier
Hardware Core = 4
39
barrier
struct bar_t {
unsigned const count;
std::atomic<unsigned> spaces;
std::atomic<unsigned> generation;
bar_t(unsigned count_) :
count(count_), spaces(count_), generation(0)
{}
void wait() noexcept {
unsigned const my_generation = generation;
if (!--spaces) {
spaces = count;
++generation;
} else {
while(generation == my_generation);
}
}
}; source: C++ Concurrency in Action: Practical Multithreading
40
Summary
● It works fine on AMP right now
● The importance of low level knowledge
41
42

More Related Content

PDF
C++ amp on linux
PPTX
ISCA Final Presentaiton - Compilations
PDF
C++ How I learned to stop worrying and love metaprogramming
PPT
Intro2 Cuda Moayad
PDF
Tiramisu をちょっと、味見してみました。
PDF
Joel Falcou, Boost.SIMD
PDF
TVM VTA (TSIM)
PDF
Vc4c development of opencl compiler for videocore4
C++ amp on linux
ISCA Final Presentaiton - Compilations
C++ How I learned to stop worrying and love metaprogramming
Intro2 Cuda Moayad
Tiramisu をちょっと、味見してみました。
Joel Falcou, Boost.SIMD
TVM VTA (TSIM)
Vc4c development of opencl compiler for videocore4

What's hot (20)

PPTX
Story of static code analyzer development
DOCX
Histogram dan Segmentasi 2
PDF
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
PDF
OpenGL SC 2.0 Quick Reference
PDF
Vulkan 1.1 Reference Guide
PDF
Powered by Python - PyCon Germany 2016
PDF
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
PDF
Dafunctor
PDF
Facebook Glow Compiler のソースコードをグダグダ語る会
PDF
Kirk Shoop, Reactive programming in C++
PDF
HKG15-207: Advanced Toolchain Usage Part 3
PDF
Global Interpreter Lock: Episode I - Break the Seal
PDF
Cluj.py Meetup: Extending Python in C
PDF
深入淺出C語言
PDF
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
PDF
Open CL For Speedup Workshop
PDF
Interpreter, Compiler, JIT from scratch
PDF
Windbg랑 친해지기
PDF
Cluj Big Data Meetup - Big Data in Practice
PDF
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)
Story of static code analyzer development
Histogram dan Segmentasi 2
개발 과정 최적화 하기 내부툴로 더욱 강력한 개발하기 Stephen kennedy _(11시40분_103호)
OpenGL SC 2.0 Quick Reference
Vulkan 1.1 Reference Guide
Powered by Python - PyCon Germany 2016
4Developers 2018: Evolution of C++ Class Design (Mariusz Łapiński)
Dafunctor
Facebook Glow Compiler のソースコードをグダグダ語る会
Kirk Shoop, Reactive programming in C++
HKG15-207: Advanced Toolchain Usage Part 3
Global Interpreter Lock: Episode I - Break the Seal
Cluj.py Meetup: Extending Python in C
深入淺出C語言
Google Edge TPUで TensorFlow Liteを使った時に 何をやっているのかを妄想してみる 2 「エッジAIモダン計測制御の世界」オ...
Open CL For Speedup Workshop
Interpreter, Compiler, JIT from scratch
Windbg랑 친해지기
Cluj Big Data Meetup - Big Data in Practice
Pythonによるカスタム可能な高位設計技術 (Design Solution Forum 2016@新横浜)
Ad

Similar to GPU Programming on CPU - Using C++AMP (20)

PDF
C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...
PPT
Microkernel Development
PDF
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
PPT
Whats new in_csharp4
PDF
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
PDF
rrxv6 Build a Riscv xv6 Kernel in Rust.pdf
PDF
CUDA Deep Dive
PDF
Giorgio zoppi cpp11concurrency
DOCX
2.1 ### uVision Project, (C) Keil Software .docx
PPTX
2011.02.18 marco parenzan - modelli di programmazione per le gpu
PDF
Embedded systemsproject_2020
PDF
Rust LDN 24 7 19 Oxidising the Command Line
PDF
Software transactional memory. pure functional approach
PDF
20140531 serebryany lecture02_find_scary_cpp_bugs
PDF
40d5984d819aaa72e55aa10376b73bde_MIT6_087IAP10_lec12.pdf
PDF
Tema3_Introduction_to_CUDA_C.pdf
PDF
TLPI - 6 Process
PPTX
Embedded JavaScript
PPTX
Roll your own toy unix clone os
C++ CoreHard Autumn 2018. Concurrency and Parallelism in C++17 and C++20/23 -...
Microkernel Development
ExperiencesSharingOnEmbeddedSystemDevelopment_20160321
Whats new in_csharp4
Runtime Code Generation and Data Management for Heterogeneous Computing in Java
rrxv6 Build a Riscv xv6 Kernel in Rust.pdf
CUDA Deep Dive
Giorgio zoppi cpp11concurrency
2.1 ### uVision Project, (C) Keil Software .docx
2011.02.18 marco parenzan - modelli di programmazione per le gpu
Embedded systemsproject_2020
Rust LDN 24 7 19 Oxidising the Command Line
Software transactional memory. pure functional approach
20140531 serebryany lecture02_find_scary_cpp_bugs
40d5984d819aaa72e55aa10376b73bde_MIT6_087IAP10_lec12.pdf
Tema3_Introduction_to_CUDA_C.pdf
TLPI - 6 Process
Embedded JavaScript
Roll your own toy unix clone os
Ad

Recently uploaded (20)

PPTX
Patient Appointment Booking in Odoo with online payment
PDF
Product Update: Alluxio AI 3.7 Now with Sub-Millisecond Latency
DOCX
Greta — No-Code AI for Building Full-Stack Web & Mobile Apps
PPTX
Custom Software Development Services.pptx.pptx
PDF
STL Containers in C++ : Sequence Container : Vector
PPTX
Cybersecurity: Protecting the Digital World
PDF
Top 10 Software Development Trends to Watch in 2025 🚀.pdf
PDF
DNT Brochure 2025 – ISV Solutions @ D365
PPTX
Why Generative AI is the Future of Content, Code & Creativity?
PDF
Topaz Photo AI Crack New Download (Latest 2025)
PDF
EaseUS PDF Editor Pro 6.2.0.2 Crack with License Key 2025
PDF
Complete Guide to Website Development in Malaysia for SMEs
PPTX
GSA Content Generator Crack (2025 Latest)
PDF
DuckDuckGo Private Browser Premium APK for Android Crack Latest 2025
PPTX
WiFi Honeypot Detecscfddssdffsedfseztor.pptx
PDF
Autodesk AutoCAD Crack Free Download 2025
PPTX
Advanced SystemCare Ultimate Crack + Portable (2025)
PDF
wealthsignaloriginal-com-DS-text-... (1).pdf
PDF
AI-Powered Threat Modeling: The Future of Cybersecurity by Arun Kumar Elengov...
PPTX
Monitoring Stack: Grafana, Loki & Promtail
Patient Appointment Booking in Odoo with online payment
Product Update: Alluxio AI 3.7 Now with Sub-Millisecond Latency
Greta — No-Code AI for Building Full-Stack Web & Mobile Apps
Custom Software Development Services.pptx.pptx
STL Containers in C++ : Sequence Container : Vector
Cybersecurity: Protecting the Digital World
Top 10 Software Development Trends to Watch in 2025 🚀.pdf
DNT Brochure 2025 – ISV Solutions @ D365
Why Generative AI is the Future of Content, Code & Creativity?
Topaz Photo AI Crack New Download (Latest 2025)
EaseUS PDF Editor Pro 6.2.0.2 Crack with License Key 2025
Complete Guide to Website Development in Malaysia for SMEs
GSA Content Generator Crack (2025 Latest)
DuckDuckGo Private Browser Premium APK for Android Crack Latest 2025
WiFi Honeypot Detecscfddssdffsedfseztor.pptx
Autodesk AutoCAD Crack Free Download 2025
Advanced SystemCare Ultimate Crack + Portable (2025)
wealthsignaloriginal-com-DS-text-... (1).pdf
AI-Powered Threat Modeling: The Future of Cybersecurity by Arun Kumar Elengov...
Monitoring Stack: Grafana, Loki & Promtail

GPU Programming on CPU - Using C++AMP

  • 2. Outline 1. Introduction to C++AMP 2. Introduction to Tiling 3. tile_static 4. barrier.wait and solutions a. C++11 thread b. setjmp/longjmp c. ucontext 2
  • 3. (Homogeneous coordinates) (0, 0) (0, 1) (0, 2) (0, 3) (1, 0) (1, 1) (1, 2) (1, 3) (2, 0) (2, 1) (2, 2) (2, 3) (3, 0) (3, 1) (3, 2) (3, 3) X 0 1 2 3 Matrix A b = 0 1 2 3 result Computing example ● Simple matrix multiplication 3
  • 4. C++ Version 1. int A[4][4]; 2. int b[4]; 3. int result[4]; 4. for (int i = 0; i < 4; i++) { 5. result[i] = 0; 6. for (int j = 0; j < 4; j++) 7. result[i] += A[i][j] * b[j]; 8. } 4
  • 5. C++AMP Version 1. array_view<float, 2> A(4, 4); 2. array_view<float, 1> b(4); 3. array_view<float, 1> result(4); 4. extent<1> ext(4); 5. parallel_for_each(ext, [&](index<1> idx) restrict(amp) 6. { 7. result[idx[0]] = 0; 8. for (int i = 0; i < 4; i++) 9. result[idx[0]] += A(idx[0], i) * b(i); 10. }); 5
  • 6. memory access 0 1 2 3 P0 P1 P2 P3 global memory b 100t Total access time = 400t 6
  • 7. shared memory 0 1 2 3 shared memory 10t 100t Total access time = 130t b 7
  • 8. 1. array_view<float, 2> A(4, 4); 2. array_view<float, 1> b(4); 3. array_view<float, 1> result(4); 4. extent<1> ext(4); 5. parallel_for_each(ext.tile<4>(), [&](tiled_index<4> tidx) restrict(amp) 6. { 7. int local = tidx.local[0]; 8. int global = tidx.global[0]; 9. tile_static int buf[4]; 10. buf[local] = b[global]; 11. tidx.barrier.wait(); 12. result[idx[0]] = 0; 13. for (int i = 0; i < 4; i++) 14. result[idx[0]] += A[idx[0]][i] * buf[i]; 15. }); 8
  • 10. Architecture source: NVIDIA TESLA: A UNIFIED GRAPHICS AND COMPUTING ARCHITECTURE shared memory accessible to all SPs 10
  • 11. Goal ● Implement all the C++AMP function on CPU instead of GPU without any compiler modification. 11
  • 12. tile_static ● The limitation of C++ syntax leads to the following choices ○ const, volatile ○ __attribute__(...) ○ static ● Choose static ○ static memory can be shared among all the threads ○ side effect: At most one thread group can be executed at the same time. #define tile_static static 12
  • 13. Barrier.wait ● Threads in the same thread group will be waited at the point where “wait” is called. ● Program can a. perform real barrier action b. jump out of current execution context 13
  • 14. ● True threading ○ C++11 thread ● Fake threading(Coroutines) ○ setjmp/longjmp ○ makecontext/getcontext/swapcontext/setcontext Approaches 14
  • 15. C++11 thread ● launch hundreds of threads at a time. ● implement my own barrier by using C++11 mutex library. → extremely slow. → The data on static memory will be corrupted 15
  • 16. setjmp/longjmp ● int setjmp(jmp_buf env) ○ setjmp() saves the stack context/environment in env for later use by longjmp. ○ The stack context will be invalidated if the function which called setjmp() returns. ● void longjmp(jmp_buf env, int val); ○ longjmp() restores the environment saved by the last call of setjmp. 16
  • 17. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf; 4. void wait(void) { 5. printf("waitn"); // prints 6. longjmp(buf,1); 7. } 8. void first(void) { 9. wait(); 10. printf("firstn"); // does not print 11. } 12. int main() { 13. if (!setjmp(buf)) 14. first(); // when executed, setjmp returns 0 15. else // when longjmp jumps back, setjmp returns 1 16. printf("mainn"); // prints 17. return 0; 18. } 17
  • 18. Pseudo code (1) void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } 18
  • 19. Pseudo code (2) void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } void entry() { while(!finish) for(t : tasks) run(t) } void fun() { … wait(); ... } void fun() { … wait(); ... } 19
  • 20. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } 20
  • 21. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } buf 21
  • 22. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } ret address buf b 22
  • 23. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } buf b 23
  • 24. 1. #include <stdio.h> 2. #include <setjmp.h> 3. jmp_buf buf, b; 4. void wait(void) { 5. printf("waitn"); 6. if (setjmp(b) == 0) 7. longjmp(buf,1); 8. } 9. void first(void) { 10. wait(); 11. } 12. int main() { 13. if (!setjmp(buf) ) 14. first(); 15. else { 16. printf("mainn"); 17. longjmp(b, 10); 18. } 19. return 0; 20. } Cannot return ??? ??? ??? buf b 24
  • 25. Problems ● Cannot return ○ return address in the stack is destroyed ● Cannot use too many static variables ○ will lost spilled registers → can be solved by using “alloca” http://www.codemud.net/~thinker/GinGin_CGI. py/show_id_doc/489 25
  • 26. ucontext.h ● ucontext_t ● getcontext ● makecontext ● swapcontext ● setcontext 26
  • 27. ucontext_t typedef struct ucontext { struct ucontext *uc_link; sigset_t uc_sigmask; stack_t uc_stack; mcontext_t uc_mcontext; ... } ucontext_t; ● uc_link ○ points to the context that will be resumed when the current context terminates ● uc_stack ○ the stack used by this context ● uc_mcontext ○ machine-specific representation of the saved context, that includes the calling thread's machine registers 27
  • 28. Functions ● int getcontext(ucontext_t *ucp); ○ initializes the structure pointed at by ucp. ● int setcontext(const ucontext_t *ucp); ○ restores the user context pointed at by ucp ● int swapcontext(ucontext_t *oucp, const ucontext_t *ucp); ○ saves the current context in the structure pointed to by oucp, and then activates the context pointed to by ucp. 28
  • 29. makecontext ● void makecontext(ucontext_t *ucp, void (*func)(), int argc, ...); ○ glibc(x86_64) saves the arguments to registers instead of pushing them on stack as AMD64 ABI said ○ The size of the arguments that passed to makecontext should be no less than sizeof(register) 29
  • 30. 1. #include <stdio.h> 2. #include <ucontext.h> 3. static ucontext_t ctx[2]; 4. static void f1 (void) { 5. puts("start f1"); 6. swapcontext(&ctx[1], &ctx[0]); 7. puts("finish f1"); 8. } 9. int main (void) 10. { 11. char st1[8192]; 12. getcontext(&ctx[1]); 13. ctx[1].uc_stack.ss_sp = st1; 14. ctx[1].uc_stack.ss_size = sizeof st1; 15. ctx[1].uc_link = &ctx[0]; 16. makecontext(&ctx[1], f1, 0); 17. swapcontext(&ctx[0], &ctx[1]); 18. swapcontext(&ctx[0], &ctx[1]); 19. return 0; 20. } 30
  • 31. 1. #include <stdio.h> 2. #include <ucontext.h> 3. static ucontext_t ctx[3]; 4. static void f1 (void) { 5. puts("start f1"); 6. swapcontext(&ctx[1], &ctx [0]); 7. puts("finish f1"); 8. } 9. static void f2 (void) 10. { 11. puts("start f2"); 12. swapcontext(&ctx[2], &ctx [1]); 13. puts("finish f2"); 14. } 1. int main (void) 2. { 3. char st1[8192], st2[8192]; 4. getcontext(&ctx[1]); 5. ctx[1].uc_stack.ss_sp = st1; 6. ctx[1].uc_stack.ss_size = sizeof st1; 7. ctx[1].uc_link = &ctx[0]; 8. makecontext(&ctx[1], f1, 0); 9. 10. getcontext(&ctx[2]); 11. ctx[2].uc_stack.ss_sp = st2; 12. ctx[2].uc_stack.ss_size = sizeof st2; 13. ctx[2].uc_link = &ctx[1]; 14. makecontext(&ctx[2], f2, 0); 15. swapcontext(&ctx[0], &ctx[2]); 16. swapcontext(&ctx[0], &ctx[2]); 17. return 0; 18. } 31
  • 32. Fake threading (yield) void entry() { setup(fun, 2); while(!finish) switch_to(); } void fun() { … wait(); ... } void fun() { … wait(); ... } 32 void entry() { setup(fun, 2); while(!finish) switch_to(); } void fun() { … wait(); ... } void fun() { … wait(); ... }
  • 33. Problems 1. How to pass a lambda? ○ makecontext(&ctx, (void (*)(void))&Kernel::operator(), …); 2. How to pass non-int arguments? ○ What if sizeof(Type) > sizeof(int) ○ How about complex structure and class 33
  • 34. Pass lambda 1. Use a wrapper function!! template <typename Ker, typename Arg> void fun(Ker k, Arg arg) { k(arg); } template <typename Ker, typename Arg> void makectx(Ker k, Arg arg) { makecontext(&ctx, (void (*)(void))fun<ker, Arg>, 2, k, arg); } 34
  • 35. Pass non-int arguments 2. Pass pointer instead!! template <typename Ker, typename Arg> void fun(Ker *k, Arg *arg) { (*k)(*arg); } template <typename Ker, typename Arg> void makectx(Ker k, Arg arg) { makecontext(&ctx, (void (*)(void))fun<ker, Arg>, 2, &k, &arg); } 35
  • 36. Additional ● Use a counter so that we can spawn coroutines dynamically ● Can it be multithreaded? Yes 36
  • 37. true threading barrier There are 12 threads in one thread group 37
  • 40. barrier struct bar_t { unsigned const count; std::atomic<unsigned> spaces; std::atomic<unsigned> generation; bar_t(unsigned count_) : count(count_), spaces(count_), generation(0) {} void wait() noexcept { unsigned const my_generation = generation; if (!--spaces) { spaces = count; ++generation; } else { while(generation == my_generation); } } }; source: C++ Concurrency in Action: Practical Multithreading 40
  • 41. Summary ● It works fine on AMP right now ● The importance of low level knowledge 41
  • 42. 42