Commit 48e6990

Revert to pre-ticketing mpmcq pop algo
The original testing for the performance impact of the ticketing algo change was done at Sendence. At the time, we didn't see any impact. However, we weren't looking in the right place. We have since found that, under some workloads, the ticketing algo can result in very long pauses (up to a second) to get a ticket. Further, we regularly see pauses waiting for a ticket measured in the hundreds of microseconds.

This commit reverts to the prior double-word CAS algo, which we found has far better performance characteristics in the "bad" scenarios we uncovered and the same or slightly better characteristics under normal circumstances.

The reversion includes the work from a later commit to support GCC 4.7. It does not include the VALGRIND awareness that was added later; I'm not sure where that should be put back, so I left it out.

There was a change in heap that went along with the original ticketing algo change. It is not included in this commit: we at Sendence didn't switch it, so we are unsure whether it should be switched or what the impact would be.
1 parent 3dfab96 commit 48e6990
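
For context, the removed pop took a ticket and then spun until its turn came up. Below is a minimal sketch of that gate, extracted from the pattern in the deleted code; the names ticket_gate_t, gate_enter, and gate_leave are illustrative, not the ponyrt API. Any consumer that stalls between entering and leaving the gate blocks every later ticket holder, which matches the pauses described above.

#include <stdatomic.h>
#include <stddef.h>

typedef struct ticket_gate_t
{
  atomic_size_t ticket;       // next ticket to hand out
  atomic_size_t waiting_for;  // ticket currently allowed through
} ticket_gate_t;

// Take a ticket, then spin until it comes up. FIFO and fair, but a
// descheduled holder of the current ticket stalls all later tickets.
static size_t gate_enter(ticket_gate_t* g)
{
  size_t my_ticket = atomic_fetch_add_explicit(&g->ticket, 1,
    memory_order_relaxed);

  while(my_ticket != atomic_load_explicit(&g->waiting_for,
    memory_order_acquire))
    ; // spin; the runtime used ponyint_cpu_relax() here

  return my_ticket;
}

// Let the next ticket holder through.
static void gate_leave(ticket_gate_t* g, size_t my_ticket)
{
  atomic_store_explicit(&g->waiting_for, my_ticket + 1,
    memory_order_release);
}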

3 files changed (+39 -84 lines)

src/libponyrt/sched/mpmcq.c (+31 -75)
@@ -4,10 +4,6 @@
 #include "../mem/pool.h"
 #include "../sched/cpu.h"
 
-#ifdef USE_VALGRIND
-#include <valgrind/helgrind.h>
-#endif
-
 typedef struct mpmcq_node_t mpmcq_node_t;
 
 struct mpmcq_node_t
@@ -22,19 +18,21 @@ void ponyint_mpmcq_init(mpmcq_t* q)
   atomic_store_explicit(&node->data, NULL, memory_order_relaxed);
   atomic_store_explicit(&node->next, NULL, memory_order_relaxed);
 
+  mpmcq_dwcas_t tail;
+  tail.node = node;
+
   atomic_store_explicit(&q->head, node, memory_order_relaxed);
-  atomic_store_explicit(&q->tail, node, memory_order_relaxed);
-  atomic_store_explicit(&q->ticket, 0, memory_order_relaxed);
-  atomic_store_explicit(&q->waiting_for, 0, memory_order_relaxed);
+  atomic_store_explicit(&q->tail, tail, memory_order_relaxed);
 }
 
 void ponyint_mpmcq_destroy(mpmcq_t* q)
 {
-  mpmcq_node_t* tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
+  mpmcq_dwcas_t tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
 
-  POOL_FREE(mpmcq_node_t, tail);
-  atomic_store_explicit(&q->head, NULL, memory_order_relaxed);
-  atomic_store_explicit(&q->tail, NULL, memory_order_relaxed);
+  POOL_FREE(mpmcq_node_t, tail.node);
+  tail.node = NULL;
+  q->head = NULL;
+  atomic_store_explicit(&q->tail, tail, memory_order_relaxed);
 }
 
 void ponyint_mpmcq_push(mpmcq_t* q, void* data)
@@ -45,9 +43,6 @@ void ponyint_mpmcq_push(mpmcq_t* q, void* data)
 
   mpmcq_node_t* prev = atomic_exchange_explicit(&q->head, node,
     memory_order_relaxed);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_BEFORE(&prev->next);
-#endif
   atomic_store_explicit(&prev->next, node, memory_order_release);
 }
 
@@ -60,85 +55,46 @@ void ponyint_mpmcq_push_single(mpmcq_t* q, void* data)
   // If we have a single producer, the swap of the head need not be atomic RMW.
   mpmcq_node_t* prev = atomic_load_explicit(&q->head, memory_order_relaxed);
   atomic_store_explicit(&q->head, node, memory_order_relaxed);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_BEFORE(&prev->next);
-#endif
   atomic_store_explicit(&prev->next, node, memory_order_release);
 }
 
 void* ponyint_mpmcq_pop(mpmcq_t* q)
 {
-  size_t my_ticket = atomic_fetch_add_explicit(&q->ticket, 1,
-    memory_order_relaxed);
+  mpmcq_dwcas_t cmp, xchg;
+  mpmcq_node_t* next;
 
-  while(my_ticket != atomic_load_explicit(&q->waiting_for,
-    memory_order_relaxed))
-    ponyint_cpu_relax();
+  cmp = atomic_load_explicit(&q->tail, memory_order_acquire);
 
-  atomic_thread_fence(memory_order_acquire);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_AFTER(&q->waiting_for);
-#endif
-
-  mpmcq_node_t* tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
-  // Get the next node rather than the tail. The tail is either a stub or has
-  // already been consumed.
-  mpmcq_node_t* next = atomic_load_explicit(&tail->next, memory_order_relaxed);
-
-  // Bailout if we have no next node.
-  if(next == NULL)
+  do
   {
-    atomic_store_explicit(&q->waiting_for, my_ticket + 1, memory_order_relaxed);
-    return NULL;
-  }
-
-  atomic_store_explicit(&q->tail, next, memory_order_relaxed);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_BEFORE(&q->waiting_for);
-#endif
-  atomic_store_explicit(&q->waiting_for, my_ticket + 1, memory_order_release);
-
-  // Synchronise-with the push.
-  atomic_thread_fence(memory_order_acquire);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_AFTER(next);
-#endif
-
+    // Get the next node rather than the tail. The tail is either a stub or has
+    // already been consumed.
+    next = atomic_load_explicit(&cmp.node->next, memory_order_acquire);
+
+    // Bailout if we have no next node.
+    if(next == NULL)
+      return NULL;
+
+    // Make the next node the tail, incrementing the aba counter. If this
+    // fails, cmp becomes the new tail and we retry the loop.
+    xchg.aba = cmp.aba + 1;
+    xchg.node = next;
+  } while(!atomic_compare_exchange_weak_explicit(&q->tail, &cmp, xchg,
+    memory_order_acq_rel, memory_order_acquire));
+
   // We'll return the data pointer from the next node.
-  void* data = atomic_load_explicit(&next->data, memory_order_relaxed);
+  void* data = atomic_load_explicit(&next->data, memory_order_acquire);
 
   // Since we will be freeing the old tail, we need to be sure no other
   // consumer is still reading the old tail. To do this, we set the data
   // pointer of our new tail to NULL, and we wait until the data pointer of
   // the old tail is NULL.
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_BEFORE(&next->data);
-#endif
   atomic_store_explicit(&next->data, NULL, memory_order_release);
 
-  while(atomic_load_explicit(&tail->data, memory_order_relaxed) != NULL)
+  while(atomic_load_explicit(&cmp.node->data, memory_order_acquire) != NULL)
     ponyint_cpu_relax();
 
-  atomic_thread_fence(memory_order_acquire);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_AFTER(&tail->data);
-  ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(tail);
-#endif
-
   // Free the old tail. The new tail is the next node.
-  POOL_FREE(mpmcq_node_t, tail);
+  POOL_FREE(mpmcq_node_t, cmp.node);
   return data;
 }
-
-void* ponyint_mpmcq_pop_bailout_immediate(mpmcq_t* q)
-{
-  mpmcq_node_t* head = atomic_load_explicit(&q->head, memory_order_relaxed);
-  mpmcq_node_t* tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
-
-  // If we believe the queue is empty, bailout immediately without taking a
-  // ticket to avoid unnecessary contention.
-  if(head == tail)
-    return NULL;
-
-  return ponyint_mpmcq_pop(q);
-}
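
The subtlest part of the restored pop is the comment about freeing the old tail: another consumer may still be reading the node we are about to free. The sketch below isolates that handshake with toy types (toy_node_t and claim_and_free are illustrative, not the ponyrt API): each consumer clears the data slot of its new tail to announce it is done, and spins until the previous consumer has made the same announcement for the old tail before reclaiming it.

#include <stdatomic.h>
#include <stdlib.h>

typedef struct toy_node_t
{
  _Atomic(void*) data;
  _Atomic(struct toy_node_t*) next;
} toy_node_t;

// Called by the consumer whose CAS just made new_tail the tail.
// old_tail is the node it now owns and wants to free.
static void* claim_and_free(toy_node_t* old_tail, toy_node_t* new_tail)
{
  void* data = atomic_load_explicit(&new_tail->data, memory_order_acquire);

  // Announce that nobody needs new_tail's payload any more...
  atomic_store_explicit(&new_tail->data, NULL, memory_order_release);

  // ...and wait for the consumer ahead of us to make the same
  // announcement for old_tail before reclaiming its memory.
  while(atomic_load_explicit(&old_tail->data, memory_order_acquire) != NULL)
    ; // spin; ponyrt uses ponyint_cpu_relax() here and POOL_FREE below

  free(old_tail);
  return data;
}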

src/libponyrt/sched/mpmcq.h (+7 -5)
@@ -9,13 +9,17 @@ PONY_EXTERN_C_BEGIN
 
 typedef struct mpmcq_node_t mpmcq_node_t;
 
+typedef struct mpmcq_dwcas_t
+{
+  uintptr_t aba;
+  mpmcq_node_t* node;
+} mpmcq_dwcas_t;
+
 __pony_spec_align__(
   typedef struct mpmcq_t
   {
     PONY_ATOMIC(mpmcq_node_t*) head;
-    PONY_ATOMIC(mpmcq_node_t*) tail;
-    PONY_ATOMIC(size_t) ticket;
-    PONY_ATOMIC(size_t) waiting_for;
+    PONY_ATOMIC(mpmcq_dwcas_t) tail;
   } mpmcq_t, 64
 );
 
@@ -29,8 +33,6 @@ void ponyint_mpmcq_push_single(mpmcq_t* q, void* data);
 
 void* ponyint_mpmcq_pop(mpmcq_t* q);
 
-void* ponyint_mpmcq_pop_bailout_immediate(mpmcq_t* q);
-
 PONY_EXTERN_C_END
 
 #endif
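
A note on mpmcq_dwcas_t: packing an aba counter next to the node pointer means the tail CAS compares and swaps both words at once, so a node that is freed and later reallocated at the same address no longer matches a stale expected value (the classic ABA hazard). Below is a minimal, self-contained sketch of that idea; the type and variable names are illustrative, and on x86-64 a lock-free 16-byte CAS typically requires cmpxchg16b (e.g. compiling with -mcx16, possibly linking -latomic).

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef struct toy_node_t toy_node_t;

typedef struct toy_dwcas_t
{
  uintptr_t aba;    // bumped on every successful swap
  toy_node_t* node; // the pointer being protected
} toy_dwcas_t;

int main(void)
{
  _Atomic toy_dwcas_t tail;
  toy_dwcas_t init = {0, NULL};
  atomic_store_explicit(&tail, init, memory_order_relaxed);

  // The CAS compares both fields: a recycled pointer with a stale
  // counter no longer matches, which defeats the ABA problem.
  toy_dwcas_t cmp = init;
  toy_dwcas_t xchg = {cmp.aba + 1, NULL};

  _Bool ok = atomic_compare_exchange_strong_explicit(&tail, &cmp, xchg,
    memory_order_acq_rel, memory_order_acquire);

  printf("cas succeeded: %d, lock-free: %d\n",
    (int)ok, (int)atomic_is_lock_free(&tail));
  return 0;
}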

src/libponyrt/sched/scheduler.c (+1 -4)
@@ -54,10 +54,7 @@ static void push(scheduler_t* sched, pony_actor_t* actor)
  */
 static pony_actor_t* pop_global(scheduler_t* sched)
 {
-  // The global queue is empty most of the time. We use pop_bailout_immediate
-  // to avoid unnecessary synchronisation in that common case.
-  pony_actor_t* actor =
-    (pony_actor_t*)ponyint_mpmcq_pop_bailout_immediate(&inject);
+  pony_actor_t* actor = (pony_actor_t*)ponyint_mpmcq_pop(&inject);
 
   if(actor != NULL)
     return actor;
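
Why the bailout wrapper could be dropped entirely: with the double-word CAS pop, detecting an empty queue costs only two atomic loads before any read-modify-write is attempted, whereas the ticketing pop began with an unconditional fetch-add that contended with every other consumer even when there was nothing to pop. A sketch of that cheap empty path, with toy types standing in for the ponyrt ones:

#include <stdatomic.h>
#include <stddef.h>

typedef struct toy_node_t
{
  _Atomic(struct toy_node_t*) next;
} toy_node_t;

// With the DWCAS algorithm, an empty pop bails out here, after two
// plain loads and no atomic RMW, so no separate fast path is needed.
static int toy_queue_is_empty(_Atomic(toy_node_t*)* tail)
{
  toy_node_t* t = atomic_load_explicit(tail, memory_order_acquire);
  return atomic_load_explicit(&t->next, memory_order_acquire) == NULL;
}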
