Racing To Win: Using Race Conditions to Build Correct and Concurrent Software

Racing To Win
Using Race Conditions to Build
Correct & Concurrent Software
Nathan Taylor | nathan.dijkstracula.net | @dijkstracula

Hi, I’m Nathan.
( @dijkstracula )

Cache node
Process A
stack
heap
text
Process B
stack
heap
text
Process C
stack
heap
text

A Persistent, Shared-
State Memory Allocator
Cache node
Process A
stack
heap
text
Process B
stack
heap
text
Process C
stack
heap
text
uSlab

Slab allocation
Object Object Object Object Object Object Object Object

Object Object Object Object Object Object Object Object

s_alloc();
s_alloc();Object
Object
Object
Object Object Object Object Object
s_alloc();

Object
Object
Object
Object Object Object Object Object

s_free(

);
s_free(

);
s_free(

);
Object Object Object Object Object Object Object Object

Allocation Protocol
• A request to allocate is followed by a response
containing an object
• A request to free is followed by a response after the
supplied object has been released 
 
• Allocation requests must not respond with an already-
allocated object
• A free request must not release an already-unallocated
object

An Execution History
void foo() { 
obj *a = s_alloc(); 
s_free(a); 
… 
}

Time
void foo() { 
obj *a = s_alloc(); 
s_free(a); 
… 
}
A(allocate request)
B(allocate response)
A(free request)
B(free response)

Time
A(allocate request)
B(allocate request)
A(allocate response)

Time
A(allocate request)
B(allocate request)
“X happened before Y” =>
“Y may observe X to have occurred”

A(allocate request)
B(allocate request)
Time

A(allocate request)
B(allocate request)
A protocol violation!
Time

Time A(allocate response)
A(allocate request)
B(allocate request)

http://cs.brown.edu/~mph/HerlihyW90/p463-herlihy.pdf

A Sequential History
Time
A(allocate request)
A(free request)
A(free response)
B(allocate request)

A Sequential History
Time
A(allocate request)
{ }
A(free request)
A(free response)
{ }
B(allocate request)
{ }

obj
*allocate(slab
*s)
{ 
 

obj
*a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

s-‐>head
=
a-‐>next; 
 

return
a; 
}

void
free(slab
*s,
obj
*o)
{ 

o-‐>next
=
s-‐>head; 

s-‐>head
=
o; 
}

obj
*allocate(slab
*s)
{ 

lock(&allocator_lock); 

obj
*a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

s-‐>head
=
a-‐>next; 

unlock(&allocator_lock); 

return
a; 
}

void
free(slab
*s,
obj
*o)
{ 

lock(&allocator_lock);

o-‐>next
=
s-‐>head; 

s-‐>head
=
o; 

unlock(&allocator_lock);

}

Was the State Locked?
Yes
Done
No
Atomic

Fetch Old Lock State
Set State Locked
Was old State Locked?
Yes
Done
No
Atomic

Fetch Old Lock State
Set State Locked
Was old State Locked?
Yes
Done
No
Atomic
Test And Set Lock

Test And Set Unlock
Set State Unlocked
Atomic

typedef
spinlock
int; 
#define
LOCKED
1 
#define
UNLOCKED
0 
 
void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 
}

void
unlock(spinlock
*m)
{ 

atomic_store(m,
UNLOCKED); 
}
Many code examples
derived from Concurrency Kit
http://concurrencykit.org

void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 
}
A(TAS request)
A(TAS response)
{ }

A(TAS request)
A(TAS response)
{ }
TAS is embedded in Lock

A(TAS request)
A(TAS response)
{ }
A(lock request)
A(lock response)
Time
TAS is embedded in Lock

A(TAS request)
A(TAS response)
{ }
A(lock request)
A(lock response)
Time
TAS & Store can’t be
reordered

A(TAS request)
A(TAS response)
{ }
A(lock request)
A(lock response)
Time
B(unlock request)
B(unlock response)
B(Store request)
B(Store response)
{ }
TAS & Store can’t be
reordered

All execution histories
All sequentially-consistent
execution histories
⊇

execution histories
All ???able execution
histories
⊇
⊇

execution histories
All linearizable execution
histories
⊇
⊇

A(TAS request)
A(TAS response)
{ }
A(lock request)
A(lock response)
Time
Others can be reordered
B(unlock request)
B(unlock response)
B(Store request)
B(Store response)
{ }

void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 
}

void
unlock(spinlock
*m)
{ 

atomic_store(m,
UNLOCKED); 
}

http://dl.acm.org/citation.cfm?id=69624.357207

Spinlock performance
millions of lock
acquisitions/sec
15
30
45
60
75
90
Threads
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
87.351
Test and Set

millions of lock
acquisitions/sec
15
30
45
60
75
90
Threads
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Platonic ideal of a spinlock

millions of lock
acquisitions/sec
15
30
45
60
75
90
Threads
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
87.351
4.343
Test and Set

millions of lock
acquisitions/sec
15
30
45
60
75
90
Threads
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Test and Set

acquisitions/sec
1E+01
1E+02
1E+03
1E+04
1E+05
1E+06
1E+07
1E+08
Threads
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Test and Set

typedef
spinlock
int; 
#define
LOCKED
1 
#define
UNLOCKED
0 
 
void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 
}

typedef
spinlock
int; 
#define
LOCKED
1 
#define
UNLOCKED
0 
 
void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)
{ 

while
(atomic_store(m)
==
LOCKED)

snooze(); 

} 
}

typedef
spinlock
int; 
#define
LOCKED
1 
#define
UNLOCKED
0 
 
void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)
{ 

while
(atomic_store(m)
==
LOCKED)

snooze(); 

} 
}

Test-and-Test-and-Set

Locked alloc/free (10s)
10
100
1,000
10,000
100,000
1,000,000
10,000,000
100,000,000
Threads
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Test and Set T&T&S

10
100
1,000
10,000
100,000
1,000,000
10,000,000
100,000,000
Threads
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Test and Set T&T&S

typedef
spinlock
int; 
#define
LOCKED
1 
#define
UNLOCKED
0 
 
void
lock(spinlock
*m)
{

unsigned
long
backoff,
exp
=
0;
 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)
{

for
(i
=
0;
i
<
backoff;
i++)

snooze();

backoff
=
(1ULL
<<
exp++);

} 
}

typedef
spinlock
int; 
#define
LOCKED
1 
#define
UNLOCKED
0 
 
void
lock(spinlock
*m)
{

unsigned
long
backoff,
exp
=
0;
 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)
{

for
(i
=
0;
i
<
backoff;
i++)

snooze();

backoff
=
(1ULL
<<
exp++);

} 
}

TAS + backoff

10,000,000
20,000,000
30,000,000
40,000,000
50,000,000
60,000,000
Threads
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
Test and Set T&T&S TAS + EB

void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 
}
void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 
}
spinlock
global_lock
= UNLOCKED

void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 
}
void
lock(spinlock
*m)
{ 

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 
}
spinlock
global_lock
= LOCKED

A function is lock-free if at all times
at least one thread is
guaranteed to be making
progress [in the function].
(Herlihy & Shavit)

//
TODO:
make
this
safe
and
scalable 
obj
*allocate(slab
*s)
{ 

obj
*a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

s-‐>head
=
a-‐>next; 

return
a; 
}

//
TODO:
make
this
safe
and
scalable

void
free(slab
*s,
obj
*o)
{

o-‐>next
=
s-‐>head; 

s-‐>head
=
o;

}

Compare-And-Swap
Cmpr and *
Old value
Destination
Address

Compare-And-Swap
≠
Return false
Old value
Destination
Address
Cmpr and *

Compare-And-Swap
Old value New value
≠
Return false
=
Destination
Address
Copy to *
Return true
Cmpr and *

Compare-And-Swap
Old value New value
≠
Return false
=
Destination
Address
Return true
Atomic
Copy to *Cmpr and *

Atomic i
=
i+1;
void
atomic_inc(int
*ptr)
{ 

int
i,
i_plus_one; 

do
{

i
=
*ptr;

i_plus_one
=
i
+
1; 

}
while
(!cas(i,
i_plus_one,
ptr));

 
}

void
atomic_inc(int
*ptr)
{ 

int
i,
i_plus_one; 

do
{

i
=
*ptr;

i_plus_one
=
i
+
1; 

}
while
(!cas(i,
i_plus_one,
ptr));

 
}
Atomic i
=
i+1;

void
atomic_inc_mod_32(int
*ptr)
{ 

int
i,
new_i; 

do
{

i
=
*ptr;

new_i
=
i
+
1;

new_i
=
new_i
%
32; 

}
while
(!cas(i,
new_i,
ptr)); 
}
Atomic i
=
(i+1)
%
32;

TAS using CAS
void
tas_loop(spinlock
*m)
{ 

do
{

; 

}
while
(!cas(UNLOCKED,
LOCKED,
m));

}

Read/Modify/Write
void
*ptr)
{ 

int
i,
new_i; 

do
{

i
=
*ptr;

/*
Read
*/

new_i
=
fancy_function();

/*
Modify
*/ 

}
while
(!cas(i,
new_i,
ptr));
/*
Write
*/

 
}

Read/Modify/Write
void
*ptr)
{ 

int
i,
new_i; 

do
{

i
=
*ptr;

/*
Read
*/

new_i
=
fancy_function();

/*
Modify
*/ 

}
while
(!cas(i,
new_i,
ptr));
/*
Write
*/

/*
(or
retry)
*/ 
}

obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head
)); 

return
a; 
}
slab head
A B …

obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head
)); 

return
a; 
}
A B …slab head

B …
slab head
A
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head
)); 

return
a; 
}

obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(

,

,

)); 

return

a; 
}
slab head
a
a b
Cmpr and *
&s->head
A B …
b
a

slab head
Cmpr and
Z
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(

,

,

)); 

return

a; 
}
a
a b &s->head
b
a

slab head
Z A B
Cmpr and
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(

,

,

)); 

return

a; 
}
a
a b &s->head
b
a

slab head
B …
Cmpr and
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(

,

,

)); 

return

a; 
}
a
a b &s->head
b
a

void
free(slab
*s,
obj
*o)
{

do
{

obj
*t
=
s-‐>head;

o-‐>next
=
t;

}
while
(!cas(t,
o,
&s-‐>head));

}
B …slab head

void
free(slab
*s,
obj
*o)
{

do
{

obj
*t
=
s-‐>head;

o-‐>next
=
t;

}
while
(!cas(t,
o,
&s-‐>head));

}
slab head
A B …

obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}
A B Cslab head

A B C
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}
slab head

A B C
some_object
=
allocate(&shared_slab);
slab head
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}

B C
A
slab head
some_object
=
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}

B C
another_obj
=
A
slab head
some_object
=
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}

C
A
B
slab head
another_obj
=
some_object
=
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}

B
C
A
slab head
another_obj
=
free(&shared_slab,
some_object);
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}

B
A Cslab head
another_obj
=
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}
free(&shared_slab,
some_object);

free(&shared_slab,
some_object);
B
B Cslab head
A
another_obj
=
obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}

The ABA Problem
“A reference about to be modified by a CAS
changes from a to b and back to a again. As a
result, the CAS succeeds even though its effect on
the data structure has changed and no longer has
the desired effect.” —Herlihy & Shavit, p. 235

obj
*allocate(slab
*s)
{ 

obj
*a,
*b; 

do
{ 

a
=
s-‐>head; 

if
(a
==
NULL)
return
NULL; 

b
=
a-‐>next; 

}
while
(!cas(a,
b,
&s-‐>head)); 

return
a; 
}
A B …slab head
166

obj
*allocate(slab
*s)
{ 

slab
orig,
update; 

do
{ 

orig.gen
=
s.gen; 

orig.head
=
s.head; 

if
(!orig.head)
return
NULL;

update.gen
=
orig.gen
+
1; 

update.head
=
orig.head-‐>next; 

}
while
(!dcas(&orig,
&update,
s)); 

return
orig.head; 
}
A B …slab head
166

free(slab
*s,
obj
*o)
{

do
{

obj
*t
=
s-‐>head;

o-‐>next
=
t;

}
while
(!cas(t,
o,
&s-‐>head));

}

slab head
A B …
obj
*o
=
obj
*o
=

slab head
B …
obj
*o
=
obj
*o
=
A
A

slab head
B …
obj
*o
=
obj
*o
=
A
A
Memory barriers


obj
*a
=
s-‐>head; 
…


obj
*a
=
s-‐>head; 
…

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 

obj
*a
=
s-‐>head; 
…

while
(atomic_tas(m,
LOCKED)
==
LOCKED)

snooze(); 

obj
*a
=
s-‐>head; 
…

LDREX
R5,
[m]

;
TAS:
fetch.
.
.

STREXEQ
R5,
LOCKED,
[m]
;
TAS:
.
.
.
and
set

CMPEQ
R5,
#0

;
Did
we
succeed?

LDR
R0,
[R1,
4]

;
a
=
s-‐>head

BEQ
lock_done

;
Yes:
we
are
all
done 

BL
snooze

;
No:
Call
snooze()… 

B
lock_loop

;

…then
loop
again 
lock_done:

B
LR

;
return
;;;;
IN
lock()

lock_loop:

;;;;
IN
allocate()

LDREX
R5,
[m]

;
TAS:
fetch.
.
.

STREXEQ
R5,
LOCKED,
[m]
;
TAS:
.
.
.
and
set

CMPEQ
R5,
#0

;
Did
we
succeed?

LDR
R0,
[R1,
4]

;
a
=
s-‐>head

BEQ
lock_done

;
Yes:
we
are
all
done 

BL
snooze

;
No:
Call
snooze()… 

B
lock_loop

;

…then
loop
again 
lock_done:

B
LR

;
return
;;;;
IN
allocate()

;;;;
IN
lock()

lock_loop:

LDR
R0,
[R1,
4]

;
a
=
s-‐>head

BEQ
lock_done

;
Yes:
we
are
all
done 

BL
snooze

;
No:
Call
snooze()… 

B
lock_loop

;

…then
loop
again 
lock_done:

B
LR

;
return
;;;;
IN
allocate()

LDREX
R5,
[m]

;
TAS:
fetch.
.
.

STREXEQ
R5,
LOCKED,
[m]
;
TAS:
.
.
.
and
set

CMPEQ
R5,
#0

;
Did
we
succeed?
;;;;
IN
lock()

lock_loop:

obj
*a
=
s-‐>head;

…

obj
*a
=
s-‐>head; 

…


<
-‐
-‐
-‐
-‐
-‐
-‐
-‐
-‐
-‐
-‐> 

obj
*a
=
s-‐>head; 
…


<
-‐
-‐
-‐
-‐
-‐
-‐
-‐
-‐
-‐
-‐> 

obj
*a
=
s-‐>head; 
…

LDREX
R5,
[m]

;
TAS:
fetch.
.
.

STREXEQ
R5,
LOCKED,
[m]
;
TAS:
.
.
.
and
set

CMPEQ
R5,
#0

;
Did
we
succeed?

BEQ
lock_done

;
Yes:
we
are
all
done 

BL
snooze

;
No:
Call
snooze()… 

B
lock_loop

;

…then
loop
again 
lock_done:

DMB

;
Ensure
all
previous
reads

;
have
been
completed

B
LR

;
return
;;;;
IN
unlock()

MOV
R0,
UNLOCKED

DMB

;
Ensure
all
previous
reads
have

;
been
completed

STR
R0,
LR
;;;;
IN
lock()

lock_loop:

nathan~$
cat
/proc/cpuinfo
|
grep
"physical.*0"
|
wc
-l

16

nathan~$
cat
/proc/cpuinfo
|
grep
"model
name"
|
uniq

model
name
:
Intel(R)
Xeon(R)
CPU
E5-2690
0
@
2.90GHz
Allocator performance

Millions of Alloc/free 
pairs/sec
10
20
30
40
50
60
Threads
1
20.568 22.392
50.529 51.234 52.721
T&S T&S-EB T&T&S CAS
pthread_mutex
Allocator Throughput

Millions of Alloc/free 
pairs/sec
10
20
30
40
50
60
Threads
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
TAS T&T&S TAS + EB
Concurrent Allocator pthread
Allocator Throughput

Allocator latency
Threads
CPUCycles

https://github.com/fastly/uslab

The lyf so short,
the CAS so longe to lerne
• Cache coherency and NUMA architecture
• Transactional memory

“lock-free programming is
hard; let’s go ride bikes”?

• high-level performance necessitates an
understanding of low-level performance

• your computer is a distributed system

• your computer is a distributed system
• (optional third answer: it’s real neato)

Come see us at the booth!
Nathan Taylor | nathan.dijkstracula.net | @dijkstracula
Thanks
credits, code, and additional material at
https://github.com/dijkstracula/Surge2015/

Racing To Win: Using Race Conditions to Build Correct and Concurrent Software

More Related Content

What's hot (20)

Similar to Racing To Win: Using Race Conditions to Build Correct and Concurrent Software (20)

More from Fastly (20)

Recently uploaded (20)

Racing To Win: Using Race Conditions to Build Correct and Concurrent Software