Commit 48e6990

Revert to pre-ticketing mpmcq pop algo
The original testing for the performance impact of the ticketing algo change was done at Sendence. At the time, we didn't see any impact. However, we weren't looking in the right place. We have since found that, under some workloads, the ticketing algo can result in very long pauses (up to a second) to get a ticket. Further, we regularly see pauses waiting for a ticket measured in the hundreds of microseconds.

This commit reverts to the prior double-word CAS algo, which we found has far better performance characteristics in the "bad" scenarios we uncovered and the same or slightly better characteristics under normal circumstances.

The reversion includes the work from a later commit to support GCC 4.7. It does not include the VALGRIND awareness that was added later; I'm not sure where that should be put back, so I left it out.

There was a change in heap that went along with the original ticketing algo change. It is not included in this commit: we at Sendence didn't switch it, so we are unsure whether it should be switched or what the impact would be.
1 parent 3dfab96 commit 48e6990
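
For context, the removed pop took a ticket and then spun until its turn came up. Below is a minimal sketch of that gate, extracted from the pattern in the deleted code; the names ticket_gate_t, gate_enter, and gate_leave are illustrative, not the ponyrt API. Any consumer that stalls between entering and leaving the gate blocks every later ticket holder, which matches the pauses described above.

#include <stdatomic.h>
#include <stddef.h>

typedef struct ticket_gate_t
{
  atomic_size_t ticket;       // next ticket to hand out
  atomic_size_t waiting_for;  // ticket currently allowed through
} ticket_gate_t;

// Take a ticket, then spin until it comes up. FIFO and fair, but a
// descheduled holder of the current ticket stalls all later tickets.
static size_t gate_enter(ticket_gate_t* g)
{
  size_t my_ticket = atomic_fetch_add_explicit(&g->ticket, 1,
    memory_order_relaxed);

  while(my_ticket != atomic_load_explicit(&g->waiting_for,
    memory_order_acquire))
    ; // spin; the runtime used ponyint_cpu_relax() here

  return my_ticket;
}

// Let the next ticket holder through.
static void gate_leave(ticket_gate_t* g, size_t my_ticket)
{
  atomic_store_explicit(&g->waiting_for, my_ticket + 1,
    memory_order_release);
}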

3 files changed (+39 -84 lines)

src/libponyrt/sched/mpmcq.c (+31 -75)
@@ -4,10 +4,6 @@
 #include "../mem/pool.h"
 #include "../sched/cpu.h"
 
-#ifdef USE_VALGRIND
-#include <valgrind/helgrind.h>
-#endif
-
 typedef struct mpmcq_node_t mpmcq_node_t;
 
 struct mpmcq_node_t
@@ -22,19 +18,21 @@ void ponyint_mpmcq_init(mpmcq_t* q)
   atomic_store_explicit(&node->data, NULL, memory_order_relaxed);
   atomic_store_explicit(&node->next, NULL, memory_order_relaxed);
 
+  mpmcq_dwcas_t tail;
+  tail.node = node;
+
   atomic_store_explicit(&q->head, node, memory_order_relaxed);
-  atomic_store_explicit(&q->tail, node, memory_order_relaxed);
-  atomic_store_explicit(&q->ticket, 0, memory_order_relaxed);
-  atomic_store_explicit(&q->waiting_for, 0, memory_order_relaxed);
+  atomic_store_explicit(&q->tail, tail, memory_order_relaxed);
 }
 
 void ponyint_mpmcq_destroy(mpmcq_t* q)
 {
-  mpmcq_node_t* tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
+  mpmcq_dwcas_t tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
 
-  POOL_FREE(mpmcq_node_t, tail);
-  atomic_store_explicit(&q->head, NULL, memory_order_relaxed);
-  atomic_store_explicit(&q->tail, NULL, memory_order_relaxed);
+  POOL_FREE(mpmcq_node_t, tail.node);
+  tail.node = NULL;
+  q->head = NULL;
+  atomic_store_explicit(&q->tail, tail, memory_order_relaxed);
 }
 
 void ponyint_mpmcq_push(mpmcq_t* q, void* data)
@@ -45,9 +43,6 @@ void ponyint_mpmcq_push(mpmcq_t* q, void* data)
 
   mpmcq_node_t* prev = atomic_exchange_explicit(&q->head, node,
     memory_order_relaxed);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_BEFORE(&prev->next);
-#endif
   atomic_store_explicit(&prev->next, node, memory_order_release);
 }
 
@@ -60,85 +55,46 @@ void ponyint_mpmcq_push_single(mpmcq_t* q, void* data)
   // If we have a single producer, the swap of the head need not be atomic RMW.
   mpmcq_node_t* prev = atomic_load_explicit(&q->head, memory_order_relaxed);
   atomic_store_explicit(&q->head, node, memory_order_relaxed);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_BEFORE(&prev->next);
-#endif
   atomic_store_explicit(&prev->next, node, memory_order_release);
 }
 
 void* ponyint_mpmcq_pop(mpmcq_t* q)
 {
-  size_t my_ticket = atomic_fetch_add_explicit(&q->ticket, 1,
-    memory_order_relaxed);
+  mpmcq_dwcas_t cmp, xchg;
+  mpmcq_node_t* next;
 
-  while(my_ticket != atomic_load_explicit(&q->waiting_for,
-    memory_order_relaxed))
-    ponyint_cpu_relax();
+  cmp = atomic_load_explicit(&q->tail, memory_order_acquire);
 
-  atomic_thread_fence(memory_order_acquire);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_AFTER(&q->waiting_for);
-#endif
-
-  mpmcq_node_t* tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
-  // Get the next node rather than the tail. The tail is either a stub or has
-  // already been consumed.
-  mpmcq_node_t* next = atomic_load_explicit(&tail->next, memory_order_relaxed);
-
-  // Bailout if we have no next node.
-  if(next == NULL)
+  do
   {
-    atomic_store_explicit(&q->waiting_for, my_ticket + 1, memory_order_relaxed);
-    return NULL;
-  }
-
-  atomic_store_explicit(&q->tail, next, memory_order_relaxed);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_BEFORE(&q->waiting_for);
-#endif
-  atomic_store_explicit(&q->waiting_for, my_ticket + 1, memory_order_release);
-
-  // Synchronise-with the push.
-  atomic_thread_fence(memory_order_acquire);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_AFTER(next);
-#endif
-
+    // Get the next node rather than the tail. The tail is either a stub or has
+    // already been consumed.
+    next = atomic_load_explicit(&cmp.node->next, memory_order_acquire);
+
+    // Bailout if we have no next node.
+    if(next == NULL)
+      return NULL;
+
+    // Make the next node the tail, incrementing the aba counter. If this
+    // fails, cmp becomes the new tail and we retry the loop.
+    xchg.aba = cmp.aba + 1;
+    xchg.node = next;
+  } while(!atomic_compare_exchange_weak_explicit(&q->tail, &cmp, xchg,
+    memory_order_acq_rel, memory_order_acquire));
+
   // We'll return the data pointer from the next node.
-  void* data = atomic_load_explicit(&next->data, memory_order_relaxed);
+  void* data = atomic_load_explicit(&next->data, memory_order_acquire);
 
   // Since we will be freeing the old tail, we need to be sure no other
   // consumer is still reading the old tail. To do this, we set the data
   // pointer of our new tail to NULL, and we wait until the data pointer of
   // the old tail is NULL.
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_BEFORE(&next->data);
-#endif
   atomic_store_explicit(&next->data, NULL, memory_order_release);
 
-  while(atomic_load_explicit(&tail->data, memory_order_relaxed) != NULL)
+  while(atomic_load_explicit(&cmp.node->data, memory_order_acquire) != NULL)
     ponyint_cpu_relax();
 
-  atomic_thread_fence(memory_order_acquire);
-#ifdef USE_VALGRIND
-  ANNOTATE_HAPPENS_AFTER(&tail->data);
-  ANNOTATE_HAPPENS_BEFORE_FORGET_ALL(tail);
-#endif
-
   // Free the old tail. The new tail is the next node.
-  POOL_FREE(mpmcq_node_t, tail);
+  POOL_FREE(mpmcq_node_t, cmp.node);
   return data;
 }
-
-void* ponyint_mpmcq_pop_bailout_immediate(mpmcq_t* q)
-{
-  mpmcq_node_t* head = atomic_load_explicit(&q->head, memory_order_relaxed);
-  mpmcq_node_t* tail = atomic_load_explicit(&q->tail, memory_order_relaxed);
-
-  // If we believe the queue is empty, bailout immediately without taking a
-  // ticket to avoid unnecessary contention.
-  if(head == tail)
-    return NULL;
-
-  return ponyint_mpmcq_pop(q);
-}
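
The subtlest part of the restored pop is the comment about freeing the old tail: another consumer may still be reading the node we are about to free. The sketch below isolates that handshake with toy types (toy_node_t and claim_and_free are illustrative, not the ponyrt API): each consumer clears the data slot of its new tail to announce it is done, and spins until the previous consumer has made the same announcement for the old tail before reclaiming it.

#include <stdatomic.h>
#include <stdlib.h>

typedef struct toy_node_t
{
  _Atomic(void*) data;
  _Atomic(struct toy_node_t*) next;
} toy_node_t;

// Called by the consumer whose CAS just made new_tail the tail.
// old_tail is the node it now owns and wants to free.
static void* claim_and_free(toy_node_t* old_tail, toy_node_t* new_tail)
{
  void* data = atomic_load_explicit(&new_tail->data, memory_order_acquire);

  // Announce that nobody needs new_tail's payload any more...
  atomic_store_explicit(&new_tail->data, NULL, memory_order_release);

  // ...and wait for the consumer ahead of us to make the same
  // announcement for old_tail before reclaiming its memory.
  while(atomic_load_explicit(&old_tail->data, memory_order_acquire) != NULL)
    ; // spin; ponyrt uses ponyint_cpu_relax() here and POOL_FREE below

  free(old_tail);
  return data;
}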

src/libponyrt/sched/mpmcq.h (+7 -5)
@@ -9,13 +9,17 @@ PONY_EXTERN_C_BEGIN
 
 typedef struct mpmcq_node_t mpmcq_node_t;
 
+typedef struct mpmcq_dwcas_t
+{
+  uintptr_t aba;
+  mpmcq_node_t* node;
+} mpmcq_dwcas_t;
+
 __pony_spec_align__(
   typedef struct mpmcq_t
   {
     PONY_ATOMIC(mpmcq_node_t*) head;
-    PONY_ATOMIC(mpmcq_node_t*) tail;
-    PONY_ATOMIC(size_t) ticket;
-    PONY_ATOMIC(size_t) waiting_for;
+    PONY_ATOMIC(mpmcq_dwcas_t) tail;
   } mpmcq_t, 64
 );
 
@@ -29,8 +33,6 @@ void ponyint_mpmcq_push_single(mpmcq_t* q, void* data);
 
 void* ponyint_mpmcq_pop(mpmcq_t* q);
 
-void* ponyint_mpmcq_pop_bailout_immediate(mpmcq_t* q);
-
 PONY_EXTERN_C_END
 
 #endif
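
A note on mpmcq_dwcas_t: packing an aba counter next to the node pointer means the tail CAS compares and swaps both words at once, so a node that is freed and later reallocated at the same address no longer matches a stale expected value (the classic ABA hazard). Below is a minimal, self-contained sketch of that idea; the type and variable names are illustrative, and on x86-64 a lock-free 16-byte CAS typically requires cmpxchg16b (e.g. compiling with -mcx16, possibly linking -latomic).

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef struct toy_node_t toy_node_t;

typedef struct toy_dwcas_t
{
  uintptr_t aba;    // bumped on every successful swap
  toy_node_t* node; // the pointer being protected
} toy_dwcas_t;

int main(void)
{
  _Atomic toy_dwcas_t tail;
  toy_dwcas_t init = {0, NULL};
  atomic_store_explicit(&tail, init, memory_order_relaxed);

  // The CAS compares both fields: a recycled pointer with a stale
  // counter no longer matches, which defeats the ABA problem.
  toy_dwcas_t cmp = init;
  toy_dwcas_t xchg = {cmp.aba + 1, NULL};

  _Bool ok = atomic_compare_exchange_strong_explicit(&tail, &cmp, xchg,
    memory_order_acq_rel, memory_order_acquire);

  printf("cas succeeded: %d, lock-free: %d\n",
    (int)ok, (int)atomic_is_lock_free(&tail));
  return 0;
}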

src/libponyrt/sched/scheduler.c (+1 -4)
@@ -54,10 +54,7 @@ static void push(scheduler_t* sched, pony_actor_t* actor)
  */
 static pony_actor_t* pop_global(scheduler_t* sched)
 {
-  // The global queue is empty most of the time. We use pop_bailout_immediate
-  // to avoid unnecessary synchronisation in that common case.
-  pony_actor_t* actor =
-    (pony_actor_t*)ponyint_mpmcq_pop_bailout_immediate(&inject);
+  pony_actor_t* actor = (pony_actor_t*)ponyint_mpmcq_pop(&inject);
 
   if(actor != NULL)
     return actor;
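
Why the bailout wrapper could be dropped entirely: with the double-word CAS pop, detecting an empty queue costs only two atomic loads before any read-modify-write is attempted, whereas the ticketing pop began with an unconditional fetch-add that contended with every other consumer even when there was nothing to pop. A sketch of that cheap empty path, with toy types standing in for the ponyrt ones:

#include <stdatomic.h>
#include <stddef.h>

typedef struct toy_node_t
{
  _Atomic(struct toy_node_t*) next;
} toy_node_t;

// With the DWCAS algorithm, an empty pop bails out here, after two
// plain loads and no atomic RMW, so no separate fast path is needed.
static int toy_queue_is_empty(_Atomic(toy_node_t*)* tail)
{
  toy_node_t* t = atomic_load_explicit(tail, memory_order_acquire);
  return atomic_load_explicit(&t->next, memory_order_acquire) == NULL;
}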
