the previous spin limit of 10000 was utterly unreasonable. empirically, it could consume up to 200000 cycles, whereas a failed futex wait (EAGAIN) typically takes 1000 cycles or less, and even a true wait/wake round seems much less expensive. the new counts (100 for general wait, 200 in barrier) were simply chosen to be in the range of what's reasonable without having adverse effects on casual micro-benchmark tests I have been running. they may still be too high, from a standpoint of not wasting cpu cycles, but at least they're a lot better than before. rigorous testing across different archs and cpu models should be performed at some point to determine whether further adjustments should be made.
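To put these numbers in context, here is a minimal sketch of the bounded spin-then-sleep pattern the message describes, assuming Linux futexes; spin_then_wait and SPIN_LIMIT are illustrative names, not musl internals:

/* Illustrative sketch, not musl code: spin briefly, then sleep on a futex. */
#include <stdatomic.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

#define SPIN_LIMIT 100 /* the new general wait count; the barrier uses 200 */

static void spin_then_wait(atomic_int *addr, int val)
{
	/* Spin in the hope the value changes without entering the kernel;
	 * the old limit of 10000 could burn ~200000 cycles here. */
	for (int i = 0; i < SPIN_LIMIT; i++)
		if (atomic_load(addr) != val) return;

	/* Fall back to sleeping; even a futex wait that fails immediately
	 * with EAGAIN costs only ~1000 cycles. */
	while (atomic_load(addr) == val)
		syscall(SYS_futex, addr, FUTEX_WAIT, val, 0, 0, 0);
}

The tradeoff is purely empirical: spinning wins only when the wake typically arrives within the spin window, and the bounded count caps the cycles wasted when it does not.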
115 lines · 2.9 KiB · C
#include "pthread_impl.h"
|
|
|
|
void __vm_lock_impl(int);
|
|
void __vm_unlock_impl(void);
|
|
|
|
static int pshared_barrier_wait(pthread_barrier_t *b)
{
	int limit = (b->_b_limit & INT_MAX) + 1;
	int ret = 0;
	int v, w;

	if (limit==1) return PTHREAD_BARRIER_SERIAL_THREAD;

	while ((v=a_cas(&b->_b_lock, 0, limit)))
		__wait(&b->_b_lock, &b->_b_waiters, v, 0);

	/* Wait for <limit> threads to get to the barrier */
	if (++b->_b_count == limit) {
		a_store(&b->_b_count, 0);
		ret = PTHREAD_BARRIER_SERIAL_THREAD;
		if (b->_b_waiters2) __wake(&b->_b_count, -1, 0);
	} else {
		a_store(&b->_b_lock, 0);
		if (b->_b_waiters) __wake(&b->_b_lock, 1, 0);
		while ((v=b->_b_count)>0)
			__wait(&b->_b_count, &b->_b_waiters2, v, 0);
	}

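	/* The vm lock blocks this process from unmapping the memory that
	 * holds the barrier until every thread has released its lock, so
	 * a process-shared barrier can be unmapped right after a wait. */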
	__vm_lock_impl(+1);

	/* Ensure all threads have a vm lock before proceeding */
	if (a_fetch_add(&b->_b_count, -1)==1-limit) {
		a_store(&b->_b_count, 0);
		if (b->_b_waiters2) __wake(&b->_b_count, -1, 0);
	} else {
		while ((v=b->_b_count))
			__wait(&b->_b_count, &b->_b_waiters2, v, 0);
	}

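	/* _b_lock holds the count of threads still inside the barrier, with
	 * INT_MIN set once a destroyer is waiting; v==INT_MIN+1 therefore
	 * marks the last thread out while destruction is pending. */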
	/* Perform a recursive unlock suitable for self-sync'd destruction */
	do {
		v = b->_b_lock;
		w = b->_b_waiters;
	} while (a_cas(&b->_b_lock, v, v==INT_MIN+1 ? 0 : v-1) != v);

	/* Wake a thread waiting to reuse or destroy the barrier */
	if (v==INT_MIN+1 || (v==1 && w))
		__wake(&b->_b_lock, 1, 0);

	__vm_unlock_impl();

	return ret;
}

struct instance
{
	int count;
	int last;
	int waiters;
	int finished;
};

int pthread_barrier_wait(pthread_barrier_t *b)
{
	int limit = b->_b_limit;
	struct instance *inst;

	/* Trivial case: count was set at 1 */
	if (!limit) return PTHREAD_BARRIER_SERIAL_THREAD;

	/* Process-shared barriers require a separate, inefficient wait */
	if (limit < 0) return pshared_barrier_wait(b);

	/* Otherwise we need a lock on the barrier object */
	while (a_swap(&b->_b_lock, 1))
		__wait(&b->_b_lock, &b->_b_waiters, 1, 1);
	inst = b->_b_inst;

	/* First thread to enter the barrier becomes the "instance owner" */
	if (!inst) {
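		/* The instance is allocated on the owner's stack, so the owner
		 * must be the last thread to leave: it spins, then sleeps,
		 * until the last exiter bumps inst->finished past 1. */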
		struct instance new_inst = { 0 };
		int spins = 200;
		b->_b_inst = inst = &new_inst;
		a_store(&b->_b_lock, 0);
		if (b->_b_waiters) __wake(&b->_b_lock, 1, 1);
		while (spins-- && !inst->finished)
			a_spin();
		a_inc(&inst->finished);
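		/* FUTEX_WAIT|128 requests a process-private wait; kernels that
		 * lack private futexes return ENOSYS, so retry without it. */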
		while (inst->finished == 1)
			__syscall(SYS_futex,&inst->finished,FUTEX_WAIT|128,1,0) != -ENOSYS
			|| __syscall(SYS_futex,&inst->finished,FUTEX_WAIT,1,0);
		return PTHREAD_BARRIER_SERIAL_THREAD;
	}

	/* Last thread to enter the barrier wakes all non-instance-owners */
	if (++inst->count == limit) {
		b->_b_inst = 0;
		a_store(&b->_b_lock, 0);
		if (b->_b_waiters) __wake(&b->_b_lock, 1, 1);
		a_store(&inst->last, 1);
		if (inst->waiters)
			__wake(&inst->last, -1, 1);
	} else {
		a_store(&b->_b_lock, 0);
		if (b->_b_waiters) __wake(&b->_b_lock, 1, 1);
		__wait(&inst->last, &inst->waiters, 0, 1);
	}

	/* Last thread to exit the barrier wakes the instance owner */
	if (a_fetch_add(&inst->count,-1)==1 && a_fetch_add(&inst->finished,1))
		__wake(&inst->finished, 1, 1);

	return 0;
}
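
For reference, a minimal (hypothetical) caller of the interface implemented above; per POSIX, exactly one waiter in each round receives PTHREAD_BARRIER_SERIAL_THREAD, matching the return paths in this file:

#include <pthread.h>
#include <stdio.h>

#define NTHREADS 4

static pthread_barrier_t barrier;

static void *worker(void *arg)
{
	(void)arg;
	/* ... per-thread work before the rendezvous ... */
	if (pthread_barrier_wait(&barrier) == PTHREAD_BARRIER_SERIAL_THREAD)
		puts("serial thread: all threads arrived");
	return 0;
}

int main(void)
{
	pthread_t t[NTHREADS];
	pthread_barrier_init(&barrier, 0, NTHREADS);
	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], 0, worker, 0);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(t[i], 0);
	pthread_barrier_destroy(&barrier);
	return 0;
}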