mirror of
https://github.com/phabrics/Run-Sun3-SunOS-4.1.1.git
synced 2026-04-29 19:12:58 -04:00
/* automatically generated by memory-auto.sh, do not edit! */

/*
 * Copyright (c) 2005, 2006 Matt Fredette
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Matt Fredette.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/* includes: */
#include <tme/memory.h>

_TME_RCSID("$Id: memory-auto.sh,v 1.2 2010/02/15 15:16:28 fredette Exp $");

/* undefine the macro version of tme_memory_bus_read16: */
#undef tme_memory_bus_read16

/* the bus 16-bit read slow function: */
tme_uint16_t
tme_memory_bus_read16(_tme_const tme_shared tme_uint16_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  unsigned int size_skip;
  unsigned int size_done;
  tme_uint16_t x;
#ifdef TME_HAVE_INT64_T
  _tme_const tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
#endif /* TME_HAVE_INT64_T */
  _tme_const tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  _tme_const tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  _tme_const tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  if (host_boundary == sizeof(tme_uint64_t)) {

    /* prepare to read the first 64-bit part of the memory: */
    parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
    size_done = 0;

    /* read the first 64-bit part of the memory: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint16_t) (part64 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((part64 << size_skip) >> ((64 - 16) + 0));
    }
    size_done = 64 - size_skip;

    /* read at most one remaining 64-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 64-bit part of the memory: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint16_t) (part64 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((part64 << 0) >> ((64 - 16) + size_done));
      }
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t)) {

    /* prepare to read the first 32-bit part of the memory: */
    parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
    size_done = 0;

    /* read the first 32-bit part of the memory: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint16_t) (part32 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((part32 << size_skip) >> ((32 - 16) + 0));
    }
    size_done = 32 - size_skip;

    /* read at most one remaining 32-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 32-bit part of the memory: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint16_t) (part32 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((part32 << 0) >> ((32 - 16) + size_done));
      }
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* prepare to read the first 16-bit part of the memory: */
    parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
    size_done = 0;

    /* read the first 16-bit part of the memory: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint16_t) (part16 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((((tme_uint16_t) part16) << ((16 - 16) + size_skip)) >> 0);
    }
    size_done = 16 - size_skip;

    /* read at most one remaining 16-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 16-bit part of the memory: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint16_t) (part16 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((((tme_uint16_t) part16) << ((16 - 16) + 0)) >> size_done);
      }
    }
  }

  else {

    /* prepare to read the first 8-bit part of the memory: */
    parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
    size_done = 0;

    /* read the first 8-bit part of the memory: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint16_t) (part8 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((((tme_uint16_t) part8) << ((16 - 8) + size_skip)) >> 0);
    }
    size_done = 8 - size_skip;

    /* read at most one remaining 8-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 8-bit part of the memory: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint16_t) (part8 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((((tme_uint16_t) part8) << ((16 - 8) + 0)) >> size_done);
      }
    }
  }

  /* return the value read: */
  return (x);
}

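/* Illustrative usage (not part of the generated code): a minimal sketch of
   calling the slow 16-bit bus read above.  The buffer, lock and byte offset
   are hypothetical names; the only requirement taken from the code above is
   that bus_boundary is nonzero and no larger than the host boundary:

     tme_shared tme_uint8_t *regs;     - hypothetical shared device memory
     tme_rwlock_t regs_rwlock;         - hypothetical lock covering regs
     tme_uint16_t value;

     value = tme_memory_bus_read16((_tme_const tme_shared tme_uint16_t *) (regs + 3),
                                   &regs_rwlock,
                                   sizeof(tme_uint8_t),
                                   sizeof(tme_uint16_t));
*/
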
/* undefine the macro version of tme_memory_bus_write16: */
#undef tme_memory_bus_write16

/* the bus 16-bit write slow function: */
void
tme_memory_bus_write16(tme_shared tme_uint16_t *mem, tme_uint16_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  unsigned int size_skip;
  unsigned int size_done;
#ifdef TME_HAVE_INT64_T
  tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
  tme_uint64_t part64_cmp;
#endif /* TME_HAVE_INT64_T */
  tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  tme_uint32_t part32_cmp;
  tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  tme_uint16_t part16_cmp;
  tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;
  tme_uint8_t part8_cmp;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  if (host_boundary == sizeof(tme_uint64_t)) {

    /* prepare to write the first 64-bit part of the memory: */
    parts64 = (tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
    size_done = 0;

    /* write the first 64-bit part of the memory: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
    do {
      part64_cmp = part64;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
        part64 |= (((tme_uint64_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((64 - 16) + 0)) >> size_skip);
        part64 |= ((((tme_uint64_t) x) << (64 - 16)) >> size_skip);
      }

      /* loop until we can atomically update this part: */
      part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
    } while (part64 != part64_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (64 - size_skip);
    }
    else {
      x <<= (64 - size_skip);
    }
    size_done = 64 - size_skip;

    /* write at most one remaining 64-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 64-bit part of the memory: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
      do {
        part64_cmp = part64;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
          part64 |= (((tme_uint64_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((64 - 16) + size_done)) >> 0);
          part64 |= ((((tme_uint64_t) x) << (64 - 16)) >> 0);
        }

        /* loop until we can atomically update this part: */
        part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
      } while (part64 != part64_cmp);
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t)) {

    /* prepare to write the first 32-bit part of the memory: */
    parts32 = (tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
    size_done = 0;

    /* write the first 32-bit part of the memory: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
    do {
      part32_cmp = part32;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
        part32 |= (((tme_uint32_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part32 &= ~((((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((32 - 16) + 0)) >> size_skip);
        part32 |= ((((tme_uint32_t) x) << (32 - 16)) >> size_skip);
      }

      /* loop until we can atomically update this part: */
      part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
    } while (part32 != part32_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (32 - size_skip);
    }
    else {
      x <<= (32 - size_skip);
    }
    size_done = 32 - size_skip;

    /* write at most one remaining 32-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 32-bit part of the memory: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
      do {
        part32_cmp = part32;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
          part32 |= (((tme_uint32_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part32 &= ~((((tme_uint32_t) _tme_memory_type_mask(tme_uint16_t, + 0)) << ((32 - 16) + size_done)) >> 0);
          part32 |= ((((tme_uint32_t) x) << (32 - 16)) >> 0);
        }

        /* loop until we can atomically update this part: */
        part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
      } while (part32 != part32_cmp);
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* prepare to write the first 16-bit part of the memory: */
    parts16 = (tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
    size_done = 0;

    /* write the first 16-bit part of the memory: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
    do {
      part16_cmp = part16;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
        part16 |= (((tme_uint16_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
        part16 |= (x >> ((16 - 16) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
    } while (part16 != part16_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (16 - size_skip);
    }
    else {
      x <<= (16 - size_skip);
    }
    size_done = 16 - size_skip;

    /* write at most one remaining 16-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 16-bit part of the memory: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
      do {
        part16_cmp = part16;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
          part16 |= (((tme_uint16_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
          part16 |= (x >> ((16 - 16) + 0));
        }

        /* loop until we can atomically update this part: */
        part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
      } while (part16 != part16_cmp);
    }
  }

  else {

    /* prepare to write the first 8-bit part of the memory: */
    parts8 = (tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
    size_done = 0;

    /* write the first 8-bit part of the memory: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
    do {
      part8_cmp = part8;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint16_t, << 0)) << size_skip));
        part8 |= (((tme_uint8_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
        part8 |= (x >> ((16 - 8) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
    } while (part8 != part8_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (8 - size_skip);
    }
    else {
      x <<= (8 - size_skip);
    }
    size_done = 8 - size_skip;

    /* write at most one remaining 8-bit part of the memory: */
    if (__tme_predict_false(size_done < 16)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (16 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 8-bit part of the memory: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
      do {
        part8_cmp = part8;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint16_t, << size_done)) << 0));
          part8 |= (((tme_uint8_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
          part8 |= (x >> ((16 - 8) + 0));
        }

        /* loop until we can atomically update this part: */
        part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
      } while (part8 != part8_cmp);
    }
  }
}

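/* Illustrative usage (not part of the generated code): the write-side
   counterpart of the sketch above, under the same hypothetical setup,
   storing a possibly-unaligned 16-bit value through the slow path:

     tme_memory_bus_write16((tme_shared tme_uint16_t *) (regs + 3),
                            value,
                            &regs_rwlock,
                            sizeof(tme_uint8_t),
                            sizeof(tme_uint16_t));
*/
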
/* undefine the macro version of tme_memory_bus_read32: */
#undef tme_memory_bus_read32

/* the bus 32-bit read slow function: */
tme_uint32_t
tme_memory_bus_read32(_tme_const tme_shared tme_uint32_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  unsigned int size_skip;
  unsigned int size_done;
  tme_uint32_t x;
#ifdef TME_HAVE_INT64_T
  _tme_const tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
#endif /* TME_HAVE_INT64_T */
  _tme_const tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  _tme_const tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  _tme_const tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  if (host_boundary == sizeof(tme_uint64_t)) {

    /* prepare to read the first 64-bit part of the memory: */
    parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
    size_done = 0;

    /* read the first 64-bit part of the memory: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint32_t) (part64 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((part64 << size_skip) >> ((64 - 32) + 0));
    }
    size_done = 64 - size_skip;

    /* read at most one remaining 64-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 64-bit part of the memory: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint32_t) (part64 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((part64 << 0) >> ((64 - 32) + size_done));
      }
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t)) {

    /* prepare to read the first 32-bit part of the memory: */
    parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
    size_done = 0;

    /* read the first 32-bit part of the memory: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint32_t) (part32 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((((tme_uint32_t) part32) << ((32 - 32) + size_skip)) >> 0);
    }
    size_done = 32 - size_skip;

    /* read at most one remaining 32-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 32-bit part of the memory: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint32_t) (part32 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((((tme_uint32_t) part32) << ((32 - 32) + 0)) >> size_done);
      }
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* prepare to read the first 16-bit part of the memory: */
    parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
    size_done = 0;

    /* read the first 16-bit part of the memory: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint32_t) (part16 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((((tme_uint32_t) part16) << ((32 - 16) + size_skip)) >> 0);
    }
    size_done = 16 - size_skip;

    /* read any remaining 16-bit parts of the memory: */
    for (; size_done < 32; size_done += 16) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 16-bit part of the memory: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint32_t) (part16 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((((tme_uint32_t) part16) << ((32 - 16) + 0)) >> size_done);
      }
    }
  }

  else {

    /* prepare to read the first 8-bit part of the memory: */
    parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
    size_done = 0;

    /* read the first 8-bit part of the memory: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint32_t) (part8 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((((tme_uint32_t) part8) << ((32 - 8) + size_skip)) >> 0);
    }
    size_done = 8 - size_skip;

    /* read any remaining 8-bit parts of the memory: */
    for (; size_done < 32; size_done += 8) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 8-bit part of the memory: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint32_t) (part8 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((((tme_uint32_t) part8) << ((32 - 8) + 0)) >> size_done);
      }
    }
  }

  /* return the value read: */
  return (x);
}

/* undefine the macro version of tme_memory_bus_write32: */
#undef tme_memory_bus_write32

/* the bus 32-bit write slow function: */
void
tme_memory_bus_write32(tme_shared tme_uint32_t *mem, tme_uint32_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  unsigned int size_skip;
  unsigned int size_done;
#ifdef TME_HAVE_INT64_T
  tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
  tme_uint64_t part64_cmp;
#endif /* TME_HAVE_INT64_T */
  tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  tme_uint32_t part32_cmp;
  tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  tme_uint16_t part16_cmp;
  tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;
  tme_uint8_t part8_cmp;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  if (host_boundary == sizeof(tme_uint64_t)) {

    /* prepare to write the first 64-bit part of the memory: */
    parts64 = (tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
    size_done = 0;

    /* write the first 64-bit part of the memory: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
    do {
      part64_cmp = part64;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
        part64 |= (((tme_uint64_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, + 0)) << ((64 - 32) + 0)) >> size_skip);
        part64 |= ((((tme_uint64_t) x) << (64 - 32)) >> size_skip);
      }

      /* loop until we can atomically update this part: */
      part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
    } while (part64 != part64_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (64 - size_skip);
    }
    else {
      x <<= (64 - size_skip);
    }
    size_done = 64 - size_skip;

    /* write at most one remaining 64-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 64-bit part of the memory: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
      do {
        part64_cmp = part64;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
          part64 |= (((tme_uint64_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part64 &= ~((((tme_uint64_t) _tme_memory_type_mask(tme_uint32_t, + 0)) << ((64 - 32) + size_done)) >> 0);
          part64 |= ((((tme_uint64_t) x) << (64 - 32)) >> 0);
        }

        /* loop until we can atomically update this part: */
        part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
      } while (part64 != part64_cmp);
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t)) {

    /* prepare to write the first 32-bit part of the memory: */
    parts32 = (tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
    size_done = 0;

    /* write the first 32-bit part of the memory: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
    do {
      part32_cmp = part32;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
        part32 |= (((tme_uint32_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << 0) >> size_skip);
        part32 |= (x >> ((32 - 32) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
    } while (part32 != part32_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (32 - size_skip);
    }
    else {
      x <<= (32 - size_skip);
    }
    size_done = 32 - size_skip;

    /* write at most one remaining 32-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 32-bit part of the memory: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
      do {
        part32_cmp = part32;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
          part32 |= (((tme_uint32_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << size_done) >> 0);
          part32 |= (x >> ((32 - 32) + 0));
        }

        /* loop until we can atomically update this part: */
        part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
      } while (part32 != part32_cmp);
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* prepare to write the first 16-bit part of the memory: */
    parts16 = (tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
    size_done = 0;

    /* write the first 16-bit part of the memory: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
    do {
      part16_cmp = part16;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
        part16 |= (((tme_uint16_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
        part16 |= (x >> ((32 - 16) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
    } while (part16 != part16_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (16 - size_skip);
    }
    else {
      x <<= (16 - size_skip);
    }
    size_done = 16 - size_skip;

    /* try to write one full 16-bit part of memory: */
    if (__tme_predict_true(size_done <= (32 - 16))) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 16-bit part of memory: */
      part16 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (32 - 16)));
      parts16++;
      tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
      size_done += 16;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 16;
      }
      else {
        x <<= 16;
      }
    }

    /* write at most one remaining 16-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 16-bit part of the memory: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
      do {
        part16_cmp = part16;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
          part16 |= (((tme_uint16_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
          part16 |= (x >> ((32 - 16) + 0));
        }

        /* loop until we can atomically update this part: */
        part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
      } while (part16 != part16_cmp);
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= (16 - 0);
      }
      else {
        x <<= (16 - 0);
      }
    }
  }

  else {

    /* prepare to write the first 8-bit part of the memory: */
    parts8 = (tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
    size_done = 0;

    /* write the first 8-bit part of the memory: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
    do {
      part8_cmp = part8;

      /* on a little-endian host, we clear with zeroes
         shifted up past the skip data, and then we
         insert the data shifted up past the skip data: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint32_t, << 0)) << size_skip));
        part8 |= (((tme_uint8_t) x) << size_skip);
      }

      /* on a big-endian host, we clear with zeroes
         shifted down past the skip data, and then we
         insert the data shifted down past the skip data: */
      else {
        part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
        part8 |= (x >> ((32 - 8) + size_skip));
      }

      /* loop until we can atomically update this part: */
      part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
    } while (part8 != part8_cmp);
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x >>= (8 - size_skip);
    }
    else {
      x <<= (8 - size_skip);
    }
    size_done = 8 - size_skip;

    /* write as many full 8-bit parts of the memory as we can: */
    for (; size_done <= (32 - 8); ) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write a full 8-bit part of memory: */
      part8 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (32 - 8)));
      parts8++;
      tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
      size_done += 8;
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= 8;
      }
      else {
        x <<= 8;
      }
    }

    /* write at most one remaining 8-bit part of the memory: */
    if (__tme_predict_false(size_done < 32)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (32 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);

      /* write the next 8-bit part of the memory: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
      do {
        part8_cmp = part8;

        /* on a little-endian host, we clear with zeroes
           shifted up past the skip data, and then we
           insert the data shifted up past the skip data: */
        if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
          part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint32_t, << size_done)) << 0));
          part8 |= (((tme_uint8_t) x) << 0);
        }

        /* on a big-endian host, we clear with zeroes
           shifted down past the skip data, and then we
           insert the data shifted down past the skip data: */
        else {
          part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
          part8 |= (x >> ((32 - 8) + 0));
        }

        /* loop until we can atomically update this part: */
        part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
      } while (part8 != part8_cmp);
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x >>= (8 - 0);
      }
      else {
        x <<= (8 - 0);
      }
    }
  }
}

#ifdef TME_HAVE_INT64_T

/* undefine the macro version of tme_memory_bus_read64: */
#undef tme_memory_bus_read64

/* the bus 64-bit read slow function: */
tme_uint64_t
tme_memory_bus_read64(_tme_const tme_shared tme_uint64_t *mem, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
{
  const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
  unsigned int size_skip;
  unsigned int size_done;
  tme_uint64_t x;
#ifdef TME_HAVE_INT64_T
  _tme_const tme_shared tme_uint64_t *parts64;
  tme_uint64_t part64;
#endif /* TME_HAVE_INT64_T */
  _tme_const tme_shared tme_uint32_t *parts32;
  tme_uint32_t part32;
  _tme_const tme_shared tme_uint16_t *parts16;
  tme_uint16_t part16;
  _tme_const tme_shared tme_uint8_t *parts8;
  tme_uint8_t part8;

  assert (bus_boundary != 0 && bus_boundary <= host_boundary);

#ifdef TME_HAVE_INT64_T

  if (host_boundary == sizeof(tme_uint64_t)) {

    /* prepare to read the first 64-bit part of the memory: */
    parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
    size_done = 0;

    /* read the first 64-bit part of the memory: */
    part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint64_t) (part64 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((((tme_uint64_t) part64) << ((64 - 64) + size_skip)) >> 0);
    }
    size_done = 64 - size_skip;

    /* read at most one remaining 64-bit part of the memory: */
    if (__tme_predict_false(size_done < 64)) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 64-bit part of the memory: */
      parts64++;
      part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint64_t) (part64 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((((tme_uint64_t) part64) << ((64 - 64) + 0)) >> size_done);
      }
    }
  }

  else

#endif /* TME_HAVE_INT64_T */

  if (host_boundary == sizeof(tme_uint32_t)) {

    /* prepare to read the first 32-bit part of the memory: */
    parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
    size_done = 0;

    /* read the first 32-bit part of the memory: */
    part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint64_t) (part32 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((((tme_uint64_t) part32) << ((64 - 32) + size_skip)) >> 0);
    }
    size_done = 32 - size_skip;

    /* read any remaining 32-bit parts of the memory: */
    for (; size_done < 64; size_done += 32) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 32-bit part of the memory: */
      parts32++;
      part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint64_t) (part32 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((((tme_uint64_t) part32) << ((64 - 32) + 0)) >> size_done);
      }
    }
  }

  else if (host_boundary == sizeof(tme_uint16_t)) {

    /* prepare to read the first 16-bit part of the memory: */
    parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
    size_done = 0;

    /* read the first 16-bit part of the memory: */
    part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint64_t) (part16 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((((tme_uint64_t) part16) << ((64 - 16) + size_skip)) >> 0);
    }
    size_done = 16 - size_skip;

    /* read any remaining 16-bit parts of the memory: */
    for (; size_done < 64; size_done += 16) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 16-bit part of the memory: */
      parts16++;
      part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint64_t) (part16 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((((tme_uint64_t) part16) << ((64 - 16) + 0)) >> size_done);
      }
    }
  }

  else {

    /* prepare to read the first 8-bit part of the memory: */
    parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
    size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
    size_done = 0;

    /* read the first 8-bit part of the memory: */
    part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));

    /* on a little-endian host, we shift off the skip
       data on the right, and shift the remaining data
       up into position in the result: */
    if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
      x = (((tme_uint64_t) (part8 >> size_skip)) << 0);
    }

    /* on a big-endian host, we shift off the skip data
       on the left, and shift the remaining data down
       into position in the result: */
    else {
      x = ((((tme_uint64_t) part8) << ((64 - 8) + size_skip)) >> 0);
    }
    size_done = 8 - size_skip;

    /* read any remaining 8-bit parts of the memory: */
    for (; size_done < 64; size_done += 8) {

      /* make a boundary: */
      tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_READ_BEFORE_READ);

      /* read the next 8-bit part of the memory: */
      parts8++;
      part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));

      /* on a little-endian host, we shift off the skip
         data on the right, and shift the remaining data
         up into position in the result: */
      if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
        x |= (((tme_uint64_t) (part8 >> 0)) << size_done);
      }

      /* on a big-endian host, we shift off the skip data
         on the left, and shift the remaining data down
         into position in the result: */
      else {
        x |= ((((tme_uint64_t) part8) << ((64 - 8) + 0)) >> size_done);
      }
    }
  }

  /* return the value read: */
  return (x);
}

/* undefine the macro version of tme_memory_bus_write64: */
|
|
#undef tme_memory_bus_write64
|
|
|
|
/* the bus 64-bit write slow function: */
|
|
void
|
|
tme_memory_bus_write64(tme_shared tme_uint64_t *mem, tme_uint64_t x, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
|
|
{
|
|
const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
|
|
unsigned int size_skip;
|
|
unsigned int size_done;
|
|
#ifdef TME_HAVE_INT64_T
|
|
tme_shared tme_uint64_t *parts64;
|
|
tme_uint64_t part64;
|
|
tme_uint64_t part64_cmp;
|
|
#endif /* TME_HAVE_INT64_T */
|
|
tme_shared tme_uint32_t *parts32;
|
|
tme_uint32_t part32;
|
|
tme_uint32_t part32_cmp;
|
|
tme_shared tme_uint16_t *parts16;
|
|
tme_uint16_t part16;
|
|
tme_uint16_t part16_cmp;
|
|
tme_shared tme_uint8_t *parts8;
|
|
tme_uint8_t part8;
|
|
tme_uint8_t part8_cmp;
|
|
|
|
assert (bus_boundary != 0 && bus_boundary <= host_boundary);
|
|
|
|
#ifdef TME_HAVE_INT64_T
|
|
|
|
if (host_boundary == sizeof(tme_uint64_t)) {
|
|
|
|
/* prepare to write the first 64-bit part of the memory: */
|
|
parts64 = (tme_shared tme_uint64_t *) (((unsigned long) mem) & (((unsigned long) 0) - (64 / 8)));
|
|
size_skip = (((unsigned int) (unsigned long) mem) % (64 / 8)) * 8;
|
|
size_done = 0;
|
|
|
|
/* write the first 64-bit part of the memory: */
|
|
part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
|
|
do {
|
|
part64_cmp = part64;
|
|
|
|
/* on a little-endian host, we clear with zeroes
|
|
shifted up past the skip data, and then we
|
|
insert the data shifted up past the skip data: */
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
|
|
part64 |= (((tme_uint64_t) x) << size_skip);
|
|
}
|
|
|
|
/* on a big-endian host, we clear with zeroes
|
|
shifted down past the skip data, and then we
|
|
insert the data shifted down past the skip data: */
|
|
else {
|
|
part64 &= ~(_tme_memory_type_mask(tme_uint64_t, << 0) >> size_skip);
|
|
part64 |= (x >> ((64 - 64) + size_skip));
|
|
}
|
|
|
|
/* loop until we can atomically update this part: */
|
|
part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
|
|
} while (part64 != part64_cmp);
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= (64 - size_skip);
|
|
}
|
|
else {
|
|
x <<= (64 - size_skip);
|
|
}
|
|
size_done = 64 - size_skip;
|
|
|
|
/* write at most one remaining 64-bit part of the memory: */
|
|
if (__tme_predict_false(size_done < 64)) {
|
|
|
|
/* make a boundary: */
|
|
tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
|
|
|
|
/* write the next 64-bit part of the memory: */
|
|
parts64++;
|
|
part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
|
|
do {
|
|
part64_cmp = part64;
|
|
|
|
/* on a little-endian host, we clear with zeroes
|
|
shifted up past the skip data, and then we
|
|
insert the data shifted up past the skip data: */
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part64 &= (_tme_memory_type_mask(tme_uint64_t, + 0) ^ (((tme_uint64_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
|
|
part64 |= (((tme_uint64_t) x) << 0);
|
|
}
|
|
|
|
/* on a big-endian host, we clear with zeroes
|
|
shifted down past the skip data, and then we
|
|
insert the data shifted down past the skip data: */
|
|
else {
|
|
part64 &= ~(_tme_memory_type_mask(tme_uint64_t, << size_done) >> 0);
|
|
part64 |= (x >> ((64 - 64) + 0));
|
|
}
|
|
|
|
/* loop until we can atomically update this part: */
|
|
part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
|
|
} while (part64 != part64_cmp);
|
|
}
|
|
}
|
|
|
|
else
|
|
|
|
#endif /* TME_HAVE_INT64_T */
|
|
|
|
if (host_boundary == sizeof(tme_uint32_t)) {
|
|
|
|
/* prepare to write the first 32-bit part of the memory: */
|
|
parts32 = (tme_shared tme_uint32_t *) (((unsigned long) mem) & (((unsigned long) 0) - (32 / 8)));
|
|
size_skip = (((unsigned int) (unsigned long) mem) % (32 / 8)) * 8;
|
|
size_done = 0;
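/* with 32-bit host parts a misaligned 64-bit value spans at most
   three words: a partial first word, at most one full word, and a
   partial last word, which is why the full-word step below is a
   single test rather than a loop: */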
/* write the first 32-bit part of the memory: */
|
|
part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
|
|
do {
|
|
part32_cmp = part32;
|
|
|
|
/* on a little-endian host, we clear with zeroes
|
|
shifted up past the skip data, and then we
|
|
insert the data shifted up past the skip data: */
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
|
|
part32 |= (((tme_uint32_t) x) << size_skip);
|
|
}
|
|
|
|
/* on a big-endian host, we clear with zeroes
|
|
shifted down past the skip data, and then we
|
|
insert the data shifted down past the skip data: */
|
|
else {
|
|
part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << 0) >> size_skip);
|
|
part32 |= (x >> ((64 - 32) + size_skip));
|
|
}
|
|
|
|
/* loop until we can atomically update this part: */
|
|
part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
|
|
} while (part32 != part32_cmp);
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= (32 - size_skip);
|
|
}
|
|
else {
|
|
x <<= (32 - size_skip);
|
|
}
|
|
size_done = 32 - size_skip;
|
|
|
|
/* try to write one full 32-bit part of memory: */
|
|
if (__tme_predict_true(size_done <= (64 - 32))) {
|
|
|
|
/* make a memory barrier: */
|
|
tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
|
|
|
|
/* write a full 32-bit part of memory: */
|
|
part32 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 32)));
|
|
parts32++;
|
|
tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
|
|
size_done += 32;
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= 32;
|
|
}
|
|
else {
|
|
x <<= 32;
|
|
}
|
|
}
|
|
|
|
/* write at most one remaining 32-bit part of the memory: */
|
|
if (__tme_predict_false(size_done < 64)) {
|
|
|
|
/* make a memory barrier: */
|
|
tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
|
|
|
|
/* write the next 32-bit part of the memory: */
|
|
parts32++;
|
|
part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
|
|
do {
|
|
part32_cmp = part32;
|
|
|
|
/* on a little-endian host, we clear with zeroes
|
|
shifted up past the skip data, and then we
|
|
insert the data shifted up past the skip data: */
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part32 &= (_tme_memory_type_mask(tme_uint32_t, + 0) ^ (((tme_uint32_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
|
|
part32 |= (((tme_uint32_t) x) << 0);
|
|
}
|
|
|
|
/* on a big-endian host, we clear with zeroes
|
|
shifted down past the skip data, and then we
|
|
insert the data shifted down past the skip data: */
|
|
else {
|
|
part32 &= ~(_tme_memory_type_mask(tme_uint32_t, << size_done) >> 0);
|
|
part32 |= (x >> ((64 - 32) + 0));
|
|
}
|
|
|
|
/* loop until we can atomically update this part: */
|
|
part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
|
|
} while (part32 != part32_cmp);
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= (32 - 0);
|
|
}
|
|
else {
|
|
x <<= (32 - 0);
|
|
}
|
|
}
|
|
}
|
|
|
|
else if (host_boundary == sizeof(tme_uint16_t)) {
|
|
|
|
/* prepare to write the first 16-bit part of the memory: */
|
|
parts16 = (tme_shared tme_uint16_t *) (((unsigned long) mem) & (((unsigned long) 0) - (16 / 8)));
|
|
size_skip = (((unsigned int) (unsigned long) mem) % (16 / 8)) * 8;
|
|
size_done = 0;
|
|
|
|
/* write the first 16-bit part of the memory: */
|
|
part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
|
|
do {
|
|
part16_cmp = part16;
|
|
|
|
/* on a little-endian host, we clear with zeroes
|
|
shifted up past the skip data, and then we
|
|
insert the data shifted up past the skip data: */
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
|
|
part16 |= (((tme_uint16_t) x) << size_skip);
|
|
}
|
|
|
|
/* on a big-endian host, we clear with zeroes
|
|
shifted down past the skip data, and then we
|
|
insert the data shifted down past the skip data: */
|
|
else {
|
|
part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << 0) >> size_skip);
|
|
part16 |= (x >> ((64 - 16) + size_skip));
|
|
}
|
|
|
|
/* loop until we can atomically update this part: */
|
|
part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
|
|
} while (part16 != part16_cmp);
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= (16 - size_skip);
|
|
}
|
|
else {
|
|
x <<= (16 - size_skip);
|
|
}
|
|
size_done = 16 - size_skip;
|
|
|
|
/* write as many full 16-bit parts of the memory as we can: */
|
|
for (; size_done <= (64 - 16); ) {
|
|
|
|
/* make a memory barrier: */
|
|
tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
|
|
|
|
/* write a full 16-bit part of memory: */
|
|
part16 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 16)));
|
|
parts16++;
|
|
tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
|
|
size_done += 16;
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= 16;
|
|
}
|
|
else {
|
|
x <<= 16;
|
|
}
|
|
}
|
|
|
|
/* write at most one remaining 16-bit part of the memory: */
|
|
if (__tme_predict_false(size_done < 64)) {
|
|
|
|
/* make a memory barrier: */
|
|
tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
|
|
|
|
/* write the next 16-bit part of the memory: */
|
|
parts16++;
|
|
part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
|
|
do {
|
|
part16_cmp = part16;
|
|
|
|
/* on a little-endian host, we clear with zeroes
|
|
shifted up past the skip data, and then we
|
|
insert the data shifted up past the skip data: */
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part16 &= (_tme_memory_type_mask(tme_uint16_t, + 0) ^ (((tme_uint16_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
|
|
part16 |= (((tme_uint16_t) x) << 0);
|
|
}
|
|
|
|
/* on a big-endian host, we clear with zeroes
|
|
shifted down past the skip data, and then we
|
|
insert the data shifted down past the skip data: */
|
|
else {
|
|
part16 &= ~(_tme_memory_type_mask(tme_uint16_t, << size_done) >> 0);
|
|
part16 |= (x >> ((64 - 16) + 0));
|
|
}
|
|
|
|
/* loop until we can atomically update this part: */
|
|
part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
|
|
} while (part16 != part16_cmp);
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= (16 - 0);
|
|
}
|
|
else {
|
|
x <<= (16 - 0);
|
|
}
|
|
}
|
|
}
|
|
|
|
else {
|
|
|
|
/* prepare to write the first 8-bit part of the memory: */
|
|
parts8 = (tme_shared tme_uint8_t *) (((unsigned long) mem) & (((unsigned long) 0) - (8 / 8)));
|
|
size_skip = (((unsigned int) (unsigned long) mem) % (8 / 8)) * 8;
|
|
size_done = 0;
|
|
|
|
/* write the first 8-bit part of the memory: */
|
|
part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
|
|
do {
|
|
part8_cmp = part8;
|
|
|
|
/* on a little-endian host, we clear with zeroes
|
|
shifted up past the skip data, and then we
|
|
insert the data shifted up past the skip data: */
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint64_t, << 0)) << size_skip));
|
|
part8 |= (((tme_uint8_t) x) << size_skip);
|
|
}
|
|
|
|
/* on a big-endian host, we clear with zeroes
|
|
shifted down past the skip data, and then we
|
|
insert the data shifted down past the skip data: */
|
|
else {
|
|
part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << 0) >> size_skip);
|
|
part8 |= (x >> ((64 - 8) + size_skip));
|
|
}
|
|
|
|
/* loop until we can atomically update this part: */
|
|
part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
|
|
} while (part8 != part8_cmp);
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= (8 - size_skip);
|
|
}
|
|
else {
|
|
x <<= (8 - size_skip);
|
|
}
|
|
size_done = 8 - size_skip;
|
|
|
|
/* write as many full 8-bit parts of the memory as we can: */
|
|
for (; size_done <= (64 - 8); ) {
|
|
|
|
/* make a memory barrier: */
|
|
tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
|
|
|
|
/* write a full 8-bit part of memory: */
|
|
part8 = (x >> ((TME_ENDIAN_NATIVE == TME_ENDIAN_BIG) * (64 - 8)));
|
|
parts8++;
|
|
tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
|
|
size_done += 8;
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= 8;
|
|
}
|
|
else {
|
|
x <<= 8;
|
|
}
|
|
}
|
|
|
|
/* write at most one remaining 8-bit part of the memory: */
|
|
if (__tme_predict_false(size_done < 64)) {
|
|
|
|
/* make a memory barrier: */
|
|
tme_memory_barrier(mem, (64 / 8), TME_MEMORY_BARRIER_WRITE_BEFORE_WRITE);
|
|
|
|
/* write the next 8-bit part of the memory: */
|
|
parts8++;
|
|
part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
|
|
do {
|
|
part8_cmp = part8;
|
|
|
|
/* on a little-endian host, we clear with zeroes
|
|
shifted up past the skip data, and then we
|
|
insert the data shifted up past the skip data: */
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part8 &= (_tme_memory_type_mask(tme_uint8_t, + 0) ^ (((tme_uint8_t) _tme_memory_type_mask(tme_uint64_t, << size_done)) << 0));
|
|
part8 |= (((tme_uint8_t) x) << 0);
|
|
}
|
|
|
|
/* on a big-endian host, we clear with zeroes
|
|
shifted down past the skip data, and then we
|
|
insert the data shifted down past the skip data: */
|
|
else {
|
|
part8 &= ~(_tme_memory_type_mask(tme_uint8_t, << size_done) >> 0);
|
|
part8 |= (x >> ((64 - 8) + 0));
|
|
}
|
|
|
|
/* loop until we can atomically update this part: */
|
|
part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
|
|
} while (part8 != part8_cmp);
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
x >>= (8 - 0);
|
|
}
|
|
else {
|
|
x <<= (8 - 0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
#endif /* TME_HAVE_INT64_T */
|
|
|
|
/* undefine the macro version of tme_memory_bus_read_buffer: */
|
|
#undef tme_memory_bus_read_buffer
|
|
|
|
/* the bus read buffer function: */
|
|
void
|
|
tme_memory_bus_read_buffer(_tme_const tme_shared tme_uint8_t *mem, tme_uint8_t *buffer, unsigned long count, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
|
|
{
|
|
const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
|
|
_tme_const tme_uint8_t *part_buffer;
|
|
unsigned int count_done;
|
|
unsigned int count_misaligned;
|
|
unsigned int bits_misaligned;
|
|
#ifdef TME_HAVE_INT64_T
|
|
_tme_const tme_shared tme_uint64_t *parts64;
|
|
tme_uint64_t part64_buffer;
|
|
tme_uint64_t part64;
|
|
tme_uint64_t part64_next;
|
|
#endif /* TME_HAVE_INT64_T */
|
|
_tme_const tme_shared tme_uint32_t *parts32;
|
|
tme_uint32_t part32_buffer;
|
|
tme_uint32_t part32;
|
|
tme_uint32_t part32_next;
|
|
_tme_const tme_shared tme_uint16_t *parts16;
|
|
tme_uint16_t part16_buffer;
|
|
tme_uint16_t part16;
|
|
tme_uint16_t part16_next;
|
|
_tme_const tme_shared tme_uint8_t *parts8;
|
|
tme_uint8_t part8_buffer;
|
|
tme_uint8_t part8;
|
|
tme_uint8_t part8_next;
|
|
|
|
assert (count != 0);
|
|
assert (bus_boundary != 0);
|
|
|
|
/* if we are locking for all memory accesses, lock memory
|
|
around a memcpy: */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
tme_rwlock_rdlock(rwlock);
|
|
memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count));
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
|
|
/* otherwise, if the emulated bus boundary is greater than the
|
|
host's bus boundary, we are forced to stop all other threads
|
|
around a memcpy: */
|
|
else if (__tme_predict_false(bus_boundary == 0
|
|
|| bus_boundary > host_boundary)) {
|
|
tme_thread_suspend_others();
|
|
memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count) + (0 && align_min));
|
|
tme_thread_resume_others();
|
|
}
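/* the remaining cases rely on the host making atomic reads at its
   own bus boundary: the transfer is split into naturally aligned
   host-sized parts, and any partial first or last part is
   assembled byte-by-byte through a stack copy: */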
#ifdef TME_HAVE_INT64_T
|
|
|
|
else if (host_boundary == sizeof(tme_uint64_t)) {
|
|
|
|
/* make a 64-bit pointer to the memory: */
|
|
parts64 = (_tme_const tme_shared tme_uint64_t *) mem;
|
|
|
|
/* if this pointer is not 64-bit aligned: */
|
|
if (__tme_predict_false((((unsigned long) parts64) % sizeof(tme_uint64_t)) != 0)) {
|
|
|
|
/* get the misalignment from the previous 64-bit boundary: */
|
|
count_misaligned = ((unsigned long) parts64) % sizeof(tme_uint64_t);
|
|
|
|
/* truncate this pointer to the previous 64-bit boundary: */
|
|
parts64 = (_tme_const tme_shared tme_uint64_t *) (((unsigned long) parts64) & (((unsigned long) 0) - sizeof(tme_uint64_t)));
|
|
|
|
/* get the number of bytes to read in the first 64-bit memory part: */
|
|
count_done = sizeof(tme_uint64_t) - count_misaligned;
|
|
if (__tme_predict_false(count_done > count)) {
|
|
count_done = count;
|
|
}
|
|
|
|
/* read the first 64-bit memory part: */
|
|
part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
|
|
parts64++;
|
|
|
|
/* copy to the buffer the bytes to read in the first
|
|
64-bit memory part: */
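/* part64_buffer is a byte-for-byte image of the aligned memory
   word, so indexing it by count_misaligned yields the bytes at mem
   itself regardless of host byte order; with count_misaligned == 3,
   for example, at most five bytes are copied out of this part: */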
part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
|
|
/* if we have full 64-bit parts to read: */
|
|
if (__tme_predict_true(count >= sizeof(tme_uint64_t))) {
|
|
|
|
/* if the buffer is 64-bit aligned: */
|
|
if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint64_t)) == 0)) {
|
|
|
|
/* read full 64-bit parts without shifting: */
|
|
do {
|
|
part64 = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
|
|
tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
|
|
|
|
/* advance: */
|
|
parts64++;
|
|
buffer += sizeof(tme_uint64_t);
|
|
count -= sizeof(tme_uint64_t);
|
|
} while (count >= sizeof(tme_uint64_t));
|
|
}
|
|
|
|
/* otherwise, the buffer is not 64-bit aligned: */
|
|
else {
|
|
|
|
/* get the misalignment to the next 64-bit boundary: */
|
|
count_misaligned = (sizeof(tme_uint64_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint64_t);
|
|
|
|
/* read the next 64-bit memory part: */
|
|
part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
|
|
parts64++;
|
|
|
|
/* copy to the buffer until it is aligned: */
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
|
|
count_done = count_misaligned;
|
|
count -= count_misaligned;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* read full 64-bit words with shifting: */
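/* part64 carries the bytes of the previous memory word that have
   not yet been stored; each pass reads the next aligned word,
   shifts it to line up with the carry, stores one full 64-bit
   group to the now-aligned buffer, and keeps the leftover bytes
   as the new carry.  for example, with count_misaligned == 3 on a
   little-endian host, bits_misaligned is 24 and each store
   combines the high 40 bits of one memory word with the low 24
   bits of the next: */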
bits_misaligned = count_misaligned * 8;
|
|
part64
|
|
= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? (part64_buffer >> bits_misaligned)
|
|
: (part64_buffer << bits_misaligned));
|
|
for (; count >= sizeof(tme_uint64_t); ) {
|
|
part64_next = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part64 |= (part64_next << (64 - bits_misaligned));
|
|
tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
|
|
part64 = (part64_next >> bits_misaligned);
|
|
}
|
|
else {
|
|
part64 |= (part64_next >> (64 - bits_misaligned));
|
|
tme_memory_write64((tme_uint64_t *) buffer, part64, sizeof(tme_uint64_t));
|
|
part64 = (part64_next << bits_misaligned);
|
|
}
|
|
|
|
/* advance: */
|
|
parts64++;
|
|
buffer += sizeof(tme_uint64_t);
|
|
count -= sizeof(tme_uint64_t);
|
|
}
|
|
|
|
/* calculate how many more bytes there are to read in this
|
|
64-bit memory part: */
|
|
count_done = sizeof(tme_uint64_t) - count_misaligned;
|
|
part64_buffer = part64;
|
|
|
|
/* copy to the buffer the remaining bytes in this 64-bit part: */
|
|
if (count_done > count) {
|
|
count_done = count;
|
|
}
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
|
|
count -= count_done;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
}
|
|
|
|
/* if we still have bytes to read: */
|
|
if (__tme_predict_false(count > 0)) {
|
|
|
|
/* we must have less than a full 64-bit part to read: */
|
|
assert (count < sizeof(tme_uint64_t));
|
|
|
|
/* read the last 64-bit memory part: */
|
|
part64_buffer = tme_memory_atomic_read64(parts64, rwlock, sizeof(tme_uint64_t));
|
|
|
|
/* copy to the buffer the bytes to read in the last
64-bit memory part: */
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part64_buffer);
|
|
count_done = count;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
|
|
}
|
|
|
|
#endif /* TME_HAVE_INT64_T */
|
|
|
|
else if (host_boundary == sizeof(tme_uint32_t)) {
|
|
|
|
/* make a 32-bit pointer to the memory: */
|
|
parts32 = (_tme_const tme_shared tme_uint32_t *) mem;
|
|
|
|
/* if this pointer is not 32-bit aligned: */
|
|
if (__tme_predict_false((((unsigned long) parts32) % sizeof(tme_uint32_t)) != 0)) {
|
|
|
|
/* get the misalignment from the previous 32-bit boundary: */
|
|
count_misaligned = ((unsigned long) parts32) % sizeof(tme_uint32_t);
|
|
|
|
/* truncate this pointer to the previous 32-bit boundary: */
|
|
parts32 = (_tme_const tme_shared tme_uint32_t *) (((unsigned long) parts32) & (((unsigned long) 0) - sizeof(tme_uint32_t)));
|
|
|
|
/* get the number of bytes to read in the first 32-bit memory part: */
|
|
count_done = sizeof(tme_uint32_t) - count_misaligned;
|
|
if (__tme_predict_false(count_done > count)) {
|
|
count_done = count;
|
|
}
|
|
|
|
/* read the first 32-bit memory part: */
|
|
part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
|
|
parts32++;
|
|
|
|
/* copy to the buffer the bytes to read in the first
|
|
32-bit memory part: */
|
|
part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
|
|
/* if we have full 32-bit parts to read: */
|
|
if (__tme_predict_true(count >= sizeof(tme_uint32_t))) {
|
|
|
|
/* if the buffer is 32-bit aligned: */
|
|
if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint32_t)) == 0)) {
|
|
|
|
/* read full 32-bit parts without shifting: */
|
|
do {
|
|
part32 = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
|
|
tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
|
|
|
|
/* advance: */
|
|
parts32++;
|
|
buffer += sizeof(tme_uint32_t);
|
|
count -= sizeof(tme_uint32_t);
|
|
} while (count >= sizeof(tme_uint32_t));
|
|
}
|
|
|
|
/* otherwise, the buffer is not 32-bit aligned: */
|
|
else {
|
|
|
|
/* get the misalignment to the next 32-bit boundary: */
|
|
count_misaligned = (sizeof(tme_uint32_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint32_t);
|
|
|
|
/* read the next 32-bit memory part: */
|
|
part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
|
|
parts32++;
|
|
|
|
/* copy to the buffer until it is aligned: */
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
|
|
count_done = count_misaligned;
|
|
count -= count_misaligned;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* read full 32-bit words with shifting: */
|
|
bits_misaligned = count_misaligned * 8;
|
|
part32
|
|
= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? (part32_buffer >> bits_misaligned)
|
|
: (part32_buffer << bits_misaligned));
|
|
for (; count >= sizeof(tme_uint32_t); ) {
|
|
part32_next = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part32 |= (part32_next << (32 - bits_misaligned));
|
|
tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
|
|
part32 = (part32_next >> bits_misaligned);
|
|
}
|
|
else {
|
|
part32 |= (part32_next >> (32 - bits_misaligned));
|
|
tme_memory_write32((tme_uint32_t *) buffer, part32, sizeof(tme_uint32_t));
|
|
part32 = (part32_next << bits_misaligned);
|
|
}
|
|
|
|
/* advance: */
|
|
parts32++;
|
|
buffer += sizeof(tme_uint32_t);
|
|
count -= sizeof(tme_uint32_t);
|
|
}
|
|
|
|
/* calculate how many more bytes there are to read in this
|
|
32-bit memory part: */
|
|
count_done = sizeof(tme_uint32_t) - count_misaligned;
|
|
part32_buffer = part32;
|
|
|
|
/* copy to the buffer the remaining bytes in this 32-bit part: */
|
|
if (count_done > count) {
|
|
count_done = count;
|
|
}
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
|
|
count -= count_done;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
}
|
|
|
|
/* if we still have bytes to read: */
|
|
if (__tme_predict_false(count > 0)) {
|
|
|
|
/* we must have less than a full 32-bit part to read: */
|
|
assert (count < sizeof(tme_uint32_t));
|
|
|
|
/* read the last 32-bit memory part: */
|
|
part32_buffer = tme_memory_atomic_read32(parts32, rwlock, sizeof(tme_uint32_t));
|
|
|
|
/* copy to the buffer the bytes to read in the last
32-bit memory part: */
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part32_buffer);
|
|
count_done = count;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
|
|
}
|
|
|
|
else if (host_boundary == sizeof(tme_uint16_t)) {
|
|
|
|
/* make a 16-bit pointer to the memory: */
|
|
parts16 = (_tme_const tme_shared tme_uint16_t *) mem;
|
|
|
|
/* if this pointer is not 16-bit aligned: */
|
|
if (__tme_predict_false((((unsigned long) parts16) % sizeof(tme_uint16_t)) != 0)) {
|
|
|
|
/* get the misalignment from the previous 16-bit boundary: */
|
|
count_misaligned = ((unsigned long) parts16) % sizeof(tme_uint16_t);
|
|
|
|
/* truncate this pointer to the previous 16-bit boundary: */
|
|
parts16 = (_tme_const tme_shared tme_uint16_t *) (((unsigned long) parts16) & (((unsigned long) 0) - sizeof(tme_uint16_t)));
|
|
|
|
/* get the number of bytes to read in the first 16-bit memory part: */
|
|
count_done = sizeof(tme_uint16_t) - count_misaligned;
|
|
if (__tme_predict_false(count_done > count)) {
|
|
count_done = count;
|
|
}
|
|
|
|
/* read the first 16-bit memory part: */
|
|
part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
|
|
parts16++;
|
|
|
|
/* copy to the buffer the bytes to read in the first
|
|
16-bit memory part: */
|
|
part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
|
|
/* if we have full 16-bit parts to read: */
|
|
if (__tme_predict_true(count >= sizeof(tme_uint16_t))) {
|
|
|
|
/* if the buffer is 16-bit aligned: */
|
|
if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint16_t)) == 0)) {
|
|
|
|
/* read full 16-bit parts without shifting: */
|
|
do {
|
|
part16 = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
|
|
tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
|
|
|
|
/* advance: */
|
|
parts16++;
|
|
buffer += sizeof(tme_uint16_t);
|
|
count -= sizeof(tme_uint16_t);
|
|
} while (count >= sizeof(tme_uint16_t));
|
|
}
|
|
|
|
/* otherwise, the buffer is not 16-bit aligned: */
|
|
else {
|
|
|
|
/* get the misalignment to the next 16-bit boundary: */
|
|
count_misaligned = (sizeof(tme_uint16_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint16_t);
|
|
|
|
/* read the next 16-bit memory part: */
|
|
part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
|
|
parts16++;
|
|
|
|
/* copy to the buffer until it is aligned: */
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
|
|
count_done = count_misaligned;
|
|
count -= count_misaligned;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* read full 16-bit words with shifting: */
|
|
bits_misaligned = count_misaligned * 8;
|
|
part16
|
|
= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? (part16_buffer >> bits_misaligned)
|
|
: (part16_buffer << bits_misaligned));
|
|
for (; count >= sizeof(tme_uint16_t); ) {
|
|
part16_next = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part16 |= (part16_next << (16 - bits_misaligned));
|
|
tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
|
|
part16 = (part16_next >> bits_misaligned);
|
|
}
|
|
else {
|
|
part16 |= (part16_next >> (16 - bits_misaligned));
|
|
tme_memory_write16((tme_uint16_t *) buffer, part16, sizeof(tme_uint16_t));
|
|
part16 = (part16_next << bits_misaligned);
|
|
}
|
|
|
|
/* advance: */
|
|
parts16++;
|
|
buffer += sizeof(tme_uint16_t);
|
|
count -= sizeof(tme_uint16_t);
|
|
}
|
|
|
|
/* calculate how many more bytes there are to read in this
|
|
16-bit memory part: */
|
|
count_done = sizeof(tme_uint16_t) - count_misaligned;
|
|
part16_buffer = part16;
|
|
|
|
/* copy to the buffer the remaining bytes in this 16-bit part: */
|
|
if (count_done > count) {
|
|
count_done = count;
|
|
}
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
|
|
count -= count_done;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
}
|
|
|
|
/* if we still have bytes to read: */
|
|
if (__tme_predict_false(count > 0)) {
|
|
|
|
/* we must have less than a full 16-bit part to read: */
|
|
assert (count < sizeof(tme_uint16_t));
|
|
|
|
/* read the last 16-bit memory part: */
|
|
part16_buffer = tme_memory_atomic_read16(parts16, rwlock, sizeof(tme_uint16_t));
|
|
|
|
/* copy to the buffer the bytes to read in the last
16-bit memory part: */
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part16_buffer);
|
|
count_done = count;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* make an 8-bit pointer to the memory: */
|
|
parts8 = (_tme_const tme_shared tme_uint8_t *) mem;
|
|
|
|
/* if this pointer is not 8-bit aligned: */
|
|
if (__tme_predict_false((((unsigned long) parts8) % sizeof(tme_uint8_t)) != 0)) {
|
|
|
|
/* get the misalignment from the previous 8-bit boundary: */
|
|
count_misaligned = ((unsigned long) parts8) % sizeof(tme_uint8_t);
|
|
|
|
/* truncate this pointer to the previous 8-bit boundary: */
|
|
parts8 = (_tme_const tme_shared tme_uint8_t *) (((unsigned long) parts8) & (((unsigned long) 0) - sizeof(tme_uint8_t)));
|
|
|
|
/* get the number of bytes to read in the first 8-bit memory part: */
|
|
count_done = sizeof(tme_uint8_t) - count_misaligned;
|
|
if (__tme_predict_false(count_done > count)) {
|
|
count_done = count;
|
|
}
|
|
|
|
/* read the first 8-bit memory part: */
|
|
part8_buffer = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
|
|
parts8++;
|
|
|
|
/* copy to the buffer the bytes to read in the first
|
|
8-bit memory part: */
|
|
part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
|
|
/* if we have full 8-bit parts to read: */
|
|
if (__tme_predict_true(count >= sizeof(tme_uint8_t))) {
|
|
|
|
/* if the buffer is 8-bit aligned: */
|
|
if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint8_t)) == 0)) {
|
|
|
|
/* read full 8-bit parts without shifting: */
|
|
do {
|
|
part8 = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
|
|
tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));
|
|
|
|
/* advance: */
|
|
parts8++;
|
|
buffer += sizeof(tme_uint8_t);
|
|
count -= sizeof(tme_uint8_t);
|
|
} while (count >= sizeof(tme_uint8_t));
|
|
}
|
|
|
|
/* otherwise, the buffer is not 8-bit aligned: */
|
|
else {
|
|
|
|
/* get the misalignment to the next 8-bit boundary: */
|
|
count_misaligned = (sizeof(tme_uint8_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint8_t);
|
|
|
|
/* read the next 8-bit memory part: */
|
|
part8_buffer = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
|
|
parts8++;
|
|
|
|
/* copy to the buffer until it is aligned: */
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part8_buffer);
|
|
count_done = count_misaligned;
|
|
count -= count_misaligned;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* read full 8-bit words with shifting: */
|
|
bits_misaligned = count_misaligned * 8;
|
|
part8
|
|
= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? (part8_buffer >> bits_misaligned)
|
|
: (part8_buffer << bits_misaligned));
|
|
for (; count >= sizeof(tme_uint8_t); ) {
|
|
part8_next = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part8 |= (part8_next << (8 - bits_misaligned));
|
|
tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));
|
|
part8 = (part8_next >> bits_misaligned);
|
|
}
|
|
else {
|
|
part8 |= (part8_next >> (8 - bits_misaligned));
|
|
tme_memory_write8((tme_uint8_t *) buffer, part8, sizeof(tme_uint8_t));
|
|
part8 = (part8_next << bits_misaligned);
|
|
}
|
|
|
|
/* advance: */
|
|
parts8++;
|
|
buffer += sizeof(tme_uint8_t);
|
|
count -= sizeof(tme_uint8_t);
|
|
}
|
|
|
|
/* calculate how many more bytes there are to read in this
|
|
8-bit memory part: */
|
|
count_done = sizeof(tme_uint8_t) - count_misaligned;
|
|
part8_buffer = part8;
|
|
|
|
/* copy to the buffer the remaining bytes in this 8-bit part: */
|
|
if (count_done > count) {
|
|
count_done = count;
|
|
}
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part8_buffer);
|
|
count -= count_done;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
}
|
|
|
|
/* if we still have bytes to read: */
|
|
if (__tme_predict_false(count > 0)) {
|
|
|
|
/* we must have less than a full 8-bit part to read: */
|
|
assert (count < sizeof(tme_uint8_t));
|
|
|
|
/* read the last 8-bit memory part: */
|
|
part8_buffer = tme_memory_atomic_read8(parts8, rwlock, sizeof(tme_uint8_t));
|
|
|
|
/* copy to the buffer the bytes to read in the last
8-bit memory part: */
|
|
part_buffer = ((_tme_const tme_uint8_t *) &part8_buffer);
|
|
count_done = count;
|
|
do {
|
|
*buffer = *part_buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
/* undefine the macro version of tme_memory_bus_write_buffer: */
|
|
#undef tme_memory_bus_write_buffer
|
|
|
|
/* the bus write buffer function: */
|
|
void
|
|
tme_memory_bus_write_buffer(tme_shared tme_uint8_t *mem, _tme_const tme_uint8_t *buffer, unsigned long count, tme_rwlock_t *rwlock, unsigned int align_min, unsigned int bus_boundary)
|
|
{
|
|
const unsigned int host_boundary = TME_MEMORY_BUS_BOUNDARY;
|
|
tme_uint8_t *part_buffer;
|
|
unsigned int count_done;
|
|
unsigned int count_misaligned;
|
|
unsigned int bits_misaligned;
|
|
#ifdef TME_HAVE_INT64_T
|
|
tme_shared tme_uint64_t *parts64;
|
|
tme_uint64_t part64_buffer;
|
|
tme_uint64_t part64;
|
|
tme_uint64_t part64_next;
|
|
tme_uint64_t part64_mask;
|
|
tme_uint64_t part64_cmp;
|
|
#endif /* TME_HAVE_INT64_T */
|
|
tme_shared tme_uint32_t *parts32;
|
|
tme_uint32_t part32_buffer;
|
|
tme_uint32_t part32;
|
|
tme_uint32_t part32_next;
|
|
tme_uint32_t part32_mask;
|
|
tme_uint32_t part32_cmp;
|
|
tme_shared tme_uint16_t *parts16;
|
|
tme_uint16_t part16_buffer;
|
|
tme_uint16_t part16;
|
|
tme_uint16_t part16_next;
|
|
tme_uint16_t part16_mask;
|
|
tme_uint16_t part16_cmp;
|
|
tme_shared tme_uint8_t *parts8;
|
|
tme_uint8_t part8_buffer;
|
|
tme_uint8_t part8;
|
|
tme_uint8_t part8_next;
|
|
tme_uint8_t part8_mask;
|
|
tme_uint8_t part8_cmp;
|
|
|
|
assert (count != 0);
|
|
assert (bus_boundary != 0);
|
|
|
|
/* if we are locking for all memory accesses, lock memory
|
|
around a memcpy: */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
memcpy((tme_uint8_t *) (mem), (buffer), (count));
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
|
|
/* otherwise, if the emulated bus boundary is greater than the
|
|
host's bus boundary, we are forced to stop all other threads
|
|
around a memcpy: */
|
|
else if (__tme_predict_false(bus_boundary == 0
|
|
|| bus_boundary > host_boundary)) {
|
|
tme_thread_suspend_others();
|
|
memcpy((tme_uint8_t *) (mem), (buffer), (count) + (0 && align_min));
|
|
tme_thread_resume_others();
|
|
}
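/* the cases below mirror the read-buffer routine: the transfer is
   split into naturally aligned host-sized parts, full parts are
   stored with plain atomic writes, and any partial first or last
   part is merged with the existing memory bytes by
   compare-and-exchange: */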
#ifdef TME_HAVE_INT64_T
|
|
|
|
else if (host_boundary == sizeof(tme_uint64_t)) {
|
|
|
|
/* make a 64-bit pointer to the memory: */
|
|
parts64 = (tme_shared tme_uint64_t *) mem;
|
|
|
|
/* if this pointer is not 64-bit aligned: */
|
|
if (__tme_predict_false((((unsigned long) parts64) % sizeof(tme_uint64_t)) != 0)) {
|
|
|
|
/* get the misalignment from the previous 64-bit boundary: */
|
|
count_misaligned = ((unsigned long) parts64) % sizeof(tme_uint64_t);
|
|
|
|
/* truncate this pointer to the previous 64-bit boundary: */
|
|
parts64 = (tme_shared tme_uint64_t *) (((unsigned long) parts64) & (((unsigned long) 0) - sizeof(tme_uint64_t)));
|
|
|
|
/* get the number of bytes to write in the first 64-bit memory part: */
|
|
count_done = sizeof(tme_uint64_t) - count_misaligned;
|
|
if (__tme_predict_false(count_done > count)) {
|
|
count_done = count;
|
|
}
|
|
|
|
/* make a mask that clears for the data to write in the
|
|
first 64-bit memory part: */
|
|
part64_mask = 1;
|
|
part64_mask = (part64_mask << (count_done * 8)) - 1;
|
|
part64_mask
|
|
<<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? (count_misaligned * 8)
|
|
: (64 - ((count_misaligned + count_done) * 8)));
|
|
part64_mask = ~part64_mask;
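/* for example, with count_misaligned == 5 and count_done == 3 on a
   little-endian host, part64_mask ends up as 0x000000ffffffffff:
   its zero bits cover exactly the three byte lanes about to be
   written and its one bits preserve the rest of the word: */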
/* copy from the buffer the bytes to write in the first
|
|
64-bit memory part: */
|
|
part64_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* compare-and-exchange the first 64-bit memory part: */
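/* the plain read below only seeds the loop: if the guess does not
   match the memory, tme_memory_atomic_cx64 hands back the current
   contents and the loop retries, so only the byte lanes cleared by
   part64_mask ever change in memory: */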
part64 = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
|
|
do {
|
|
part64_cmp = part64;
|
|
part64 = (part64 & part64_mask) | part64_buffer;
|
|
part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
|
|
} while (part64 != part64_cmp);
|
|
parts64++;
|
|
}
|
|
|
|
/* if we have full 64-bit parts to write: */
|
|
if (__tme_predict_true(count >= sizeof(tme_uint64_t))) {
|
|
|
|
/* if the buffer is 64-bit aligned: */
|
|
if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint64_t)) == 0)) {
|
|
|
|
/* write full 64-bit parts without shifting: */
|
|
do {
|
|
part64 = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
|
|
tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
|
|
|
|
/* advance: */
|
|
parts64++;
|
|
buffer += sizeof(tme_uint64_t);
|
|
count -= sizeof(tme_uint64_t);
|
|
} while (count >= sizeof(tme_uint64_t));
|
|
}
|
|
|
|
/* otherwise, the buffer is not 64-bit aligned: */
|
|
else {
|
|
|
|
/* get the misalignment to the next 64-bit boundary: */
|
|
count_misaligned = (sizeof(tme_uint64_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint64_t);
|
|
|
|
/* copy from the buffer until it is aligned: */
|
|
part64_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part64_buffer);
|
|
count_done = count_misaligned;
|
|
count -= count_misaligned;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* write full 64-bit words with shifting: */
|
|
bits_misaligned = count_misaligned * 8;
|
|
part64 = part64_buffer;
|
|
for (; count >= sizeof(tme_uint64_t); ) {
|
|
part64_next = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part64 |= (part64_next << bits_misaligned);
|
|
tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
|
|
part64 = (part64_next >> (64 - bits_misaligned));
|
|
}
|
|
else {
|
|
part64 |= (part64_next >> bits_misaligned);
|
|
tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
|
|
part64 = (part64_next << (64 - bits_misaligned));
|
|
}
|
|
|
|
/* advance: */
|
|
parts64++;
|
|
buffer += sizeof(tme_uint64_t);
|
|
count -= sizeof(tme_uint64_t);
|
|
}
|
|
|
|
/* calculate how many more bytes there are to write in this
|
|
64-bit memory part: */
|
|
count_done = sizeof(tme_uint64_t) - count_misaligned;
|
|
part64_buffer = part64;
|
|
|
|
/* if we can't write one more full 64-bit memory part: */
|
|
if (count_done > count) {
|
|
|
|
/* we will reread this data to write below: */
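/* (the carry bytes still held in part64 were already consumed
   from the buffer, so backing buffer up by count_misaligned and
   restoring count lets the tail code below re-read them and
   finish with a masked compare-and-exchange): */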
buffer -= count_misaligned;
|
|
count += count_misaligned;
|
|
}
|
|
|
|
/* otherwise, we can write one more full 64-bit memory part: */
|
|
else {
|
|
|
|
/* copy from the buffer until we have the full 64-bit part: */
|
|
part_buffer = ((tme_uint8_t *) &part64_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* write the last full 64-bit memory part: */
|
|
part64 = part64_buffer;
|
|
tme_memory_atomic_write64(parts64, part64, rwlock, sizeof(tme_uint64_t));
|
|
}
|
|
}
|
|
}
|
|
|
|
/* if we still have bytes to write: */
|
|
if (__tme_predict_false(count > 0)) {
|
|
|
|
/* we must have less than a full 64-bit part to write: */
|
|
assert (count < sizeof(tme_uint64_t));
|
|
|
|
/* make a mask that clears for the data to write in the last
|
|
64-bit memory part: */
|
|
part64_mask
|
|
= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? _tme_memory_type_mask(tme_uint64_t, << (count * 8))
|
|
: _tme_memory_type_mask(tme_uint64_t, >> (count * 8)));
|
|
|
|
/* copy from the buffer the bytes to write in the last
|
|
64-bit memory part: */
|
|
part64_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part64_buffer);
|
|
count_done = count;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* compare-and-exchange the last 64-bit memory part: */
|
|
part64 = tme_memory_read64((const tme_uint64_t *) buffer, sizeof(tme_uint64_t));
|
|
do {
|
|
part64_cmp = part64;
|
|
part64 = (part64 & part64_mask) | part64_buffer;
|
|
part64 = tme_memory_atomic_cx64(parts64, part64_cmp, part64, rwlock, sizeof(tme_uint64_t));
|
|
} while (part64 != part64_cmp);
|
|
}
|
|
|
|
}
|
|
|
|
#endif /* TME_HAVE_INT64_T */
|
|
|
|
else if (host_boundary == sizeof(tme_uint32_t)) {
|
|
|
|
/* make a 32-bit pointer to the memory: */
|
|
parts32 = (tme_shared tme_uint32_t *) mem;
|
|
|
|
/* if this pointer is not 32-bit aligned: */
|
|
if (__tme_predict_false((((unsigned long) parts32) % sizeof(tme_uint32_t)) != 0)) {
|
|
|
|
/* get the misalignment from the previous 32-bit boundary: */
|
|
count_misaligned = ((unsigned long) parts32) % sizeof(tme_uint32_t);
|
|
|
|
/* truncate this pointer to the previous 32-bit boundary: */
|
|
parts32 = (tme_shared tme_uint32_t *) (((unsigned long) parts32) & (((unsigned long) 0) - sizeof(tme_uint32_t)));
|
|
|
|
/* get the number of bytes to write in the first 32-bit memory part: */
|
|
count_done = sizeof(tme_uint32_t) - count_misaligned;
|
|
if (__tme_predict_false(count_done > count)) {
|
|
count_done = count;
|
|
}
|
|
|
|
/* make a mask that clears for the data to write in the
|
|
first 32-bit memory part: */
|
|
part32_mask = 1;
|
|
part32_mask = (part32_mask << (count_done * 8)) - 1;
|
|
part32_mask
|
|
<<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? (count_misaligned * 8)
|
|
: (32 - ((count_misaligned + count_done) * 8)));
|
|
part32_mask = ~part32_mask;
|
|
|
|
/* copy from the buffer the bytes to write in the first
|
|
32-bit memory part: */
|
|
part32_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* compare-and-exchange the first 32-bit memory part: */
|
|
part32 = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
|
|
do {
|
|
part32_cmp = part32;
|
|
part32 = (part32 & part32_mask) | part32_buffer;
|
|
part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
|
|
} while (part32 != part32_cmp);
|
|
parts32++;
|
|
}
|
|
|
|
/* if we have full 32-bit parts to write: */
|
|
if (__tme_predict_true(count >= sizeof(tme_uint32_t))) {
|
|
|
|
/* if the buffer is 32-bit aligned: */
|
|
if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint32_t)) == 0)) {
|
|
|
|
/* write full 32-bit parts without shifting: */
|
|
do {
|
|
part32 = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
|
|
tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
|
|
|
|
/* advance: */
|
|
parts32++;
|
|
buffer += sizeof(tme_uint32_t);
|
|
count -= sizeof(tme_uint32_t);
|
|
} while (count >= sizeof(tme_uint32_t));
|
|
}
|
|
|
|
/* otherwise, the buffer is not 32-bit aligned: */
|
|
else {
|
|
|
|
/* get the misalignment to the next 32-bit boundary: */
|
|
count_misaligned = (sizeof(tme_uint32_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint32_t);
|
|
|
|
/* copy from the buffer until it is aligned: */
|
|
part32_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part32_buffer);
|
|
count_done = count_misaligned;
|
|
count -= count_misaligned;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* write full 32-bit words with shifting: */
|
|
bits_misaligned = count_misaligned * 8;
|
|
part32 = part32_buffer;
|
|
for (; count >= sizeof(tme_uint32_t); ) {
|
|
part32_next = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part32 |= (part32_next << bits_misaligned);
|
|
tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
|
|
part32 = (part32_next >> (32 - bits_misaligned));
|
|
}
|
|
else {
|
|
part32 |= (part32_next >> bits_misaligned);
|
|
tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
|
|
part32 = (part32_next << (32 - bits_misaligned));
|
|
}
|
|
|
|
/* advance: */
|
|
parts32++;
|
|
buffer += sizeof(tme_uint32_t);
|
|
count -= sizeof(tme_uint32_t);
|
|
}
|
|
|
|
/* calculate how many more bytes there are to write in this
|
|
32-bit memory part: */
|
|
count_done = sizeof(tme_uint32_t) - count_misaligned;
|
|
part32_buffer = part32;
|
|
|
|
/* if we can't write one more full 32-bit memory part: */
|
|
if (count_done > count) {
|
|
|
|
/* we will reread this data to write below: */
|
|
buffer -= count_misaligned;
|
|
count += count_misaligned;
|
|
}
|
|
|
|
/* otherwise, we can write one more full 32-bit memory part: */
|
|
else {
|
|
|
|
/* copy from the buffer until we have the full 32-bit part: */
|
|
part_buffer = ((tme_uint8_t *) &part32_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* write the last full 32-bit memory part: */
|
|
part32 = part32_buffer;
|
|
tme_memory_atomic_write32(parts32, part32, rwlock, sizeof(tme_uint32_t));
|
|
}
|
|
}
|
|
}
|
|
|
|
/* if we still have bytes to write: */
|
|
if (__tme_predict_false(count > 0)) {
|
|
|
|
/* we must have less than a full 32-bit part to write: */
|
|
assert (count < sizeof(tme_uint32_t));
|
|
|
|
/* make a mask that clears for the data to write in the last
|
|
32-bit memory part: */
|
|
part32_mask
|
|
= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? _tme_memory_type_mask(tme_uint32_t, << (count * 8))
|
|
: _tme_memory_type_mask(tme_uint32_t, >> (count * 8)));
|
|
|
|
/* copy from the buffer the bytes to write in the last
|
|
32-bit memory part: */
|
|
part32_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part32_buffer);
|
|
count_done = count;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* compare-and-exchange the last 32-bit memory part: */
|
|
part32 = tme_memory_read32((const tme_uint32_t *) buffer, sizeof(tme_uint32_t));
|
|
do {
|
|
part32_cmp = part32;
|
|
part32 = (part32 & part32_mask) | part32_buffer;
|
|
part32 = tme_memory_atomic_cx32(parts32, part32_cmp, part32, rwlock, sizeof(tme_uint32_t));
|
|
} while (part32 != part32_cmp);
|
|
}
|
|
|
|
}
|
|
|
|
else if (host_boundary == sizeof(tme_uint16_t)) {
|
|
|
|
/* make a 16-bit pointer to the memory: */
|
|
parts16 = (tme_shared tme_uint16_t *) mem;
|
|
|
|
/* if this pointer is not 16-bit aligned: */
|
|
if (__tme_predict_false((((unsigned long) parts16) % sizeof(tme_uint16_t)) != 0)) {
|
|
|
|
/* get the misalignment from the previous 16-bit boundary: */
|
|
count_misaligned = ((unsigned long) parts16) % sizeof(tme_uint16_t);
|
|
|
|
/* truncate this pointer to the previous 16-bit boundary: */
|
|
parts16 = (tme_shared tme_uint16_t *) (((unsigned long) parts16) & (((unsigned long) 0) - sizeof(tme_uint16_t)));
|
|
|
|
/* get the number of bytes to write in the first 16-bit memory part: */
|
|
count_done = sizeof(tme_uint16_t) - count_misaligned;
|
|
if (__tme_predict_false(count_done > count)) {
|
|
count_done = count;
|
|
}
|
|
|
|
/* make a mask that clears for the data to write in the
|
|
first 16-bit memory part: */
|
|
part16_mask = 1;
|
|
part16_mask = (part16_mask << (count_done * 8)) - 1;
|
|
part16_mask
|
|
<<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? (count_misaligned * 8)
|
|
: (16 - ((count_misaligned + count_done) * 8)));
|
|
part16_mask = ~part16_mask;
|
|
|
|
/* copy from the buffer the bytes to write in the first
|
|
16-bit memory part: */
|
|
part16_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* compare-and-exchange the first 16-bit memory part: */
|
|
part16 = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
|
|
do {
|
|
part16_cmp = part16;
|
|
part16 = (part16 & part16_mask) | part16_buffer;
|
|
part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
|
|
} while (part16 != part16_cmp);
|
|
parts16++;
|
|
}
|
|
|
|
/* if we have full 16-bit parts to write: */
|
|
if (__tme_predict_true(count >= sizeof(tme_uint16_t))) {
|
|
|
|
/* if the buffer is 16-bit aligned: */
|
|
if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint16_t)) == 0)) {
|
|
|
|
/* write full 16-bit parts without shifting: */
|
|
do {
|
|
part16 = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
|
|
tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
|
|
|
|
/* advance: */
|
|
parts16++;
|
|
buffer += sizeof(tme_uint16_t);
|
|
count -= sizeof(tme_uint16_t);
|
|
} while (count >= sizeof(tme_uint16_t));
|
|
}
|
|
|
|
/* otherwise, the buffer is not 16-bit aligned: */
|
|
else {
|
|
|
|
/* get the misalignment to the next 16-bit boundary: */
|
|
count_misaligned = (sizeof(tme_uint16_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint16_t);
|
|
|
|
/* copy from the buffer until it is aligned: */
|
|
part16_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part16_buffer);
|
|
count_done = count_misaligned;
|
|
count -= count_misaligned;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* write full 16-bit words with shifting: */
|
|
bits_misaligned = count_misaligned * 8;
|
|
part16 = part16_buffer;
|
|
for (; count >= sizeof(tme_uint16_t); ) {
|
|
part16_next = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
|
|
if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
|
|
part16 |= (part16_next << bits_misaligned);
|
|
tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
|
|
part16 = (part16_next >> (16 - bits_misaligned));
|
|
}
|
|
else {
|
|
part16 |= (part16_next >> bits_misaligned);
|
|
tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
|
|
part16 = (part16_next << (16 - bits_misaligned));
|
|
}
|
|
|
|
/* advance: */
|
|
parts16++;
|
|
buffer += sizeof(tme_uint16_t);
|
|
count -= sizeof(tme_uint16_t);
|
|
}
|
|
|
|
/* calculate how many more bytes there are to write in this
|
|
16-bit memory part: */
|
|
count_done = sizeof(tme_uint16_t) - count_misaligned;
|
|
part16_buffer = part16;
|
|
|
|
/* if we can't write one more full 16-bit memory part: */
|
|
if (count_done > count) {
|
|
|
|
/* we will reread this data to write below: */
|
|
buffer -= count_misaligned;
|
|
count += count_misaligned;
|
|
}
|
|
|
|
/* otherwise, we can write one more full 16-bit memory part: */
|
|
else {
|
|
|
|
/* copy from the buffer until we have the full 16-bit part: */
|
|
part_buffer = ((tme_uint8_t *) &part16_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* write the last full 16-bit memory part: */
|
|
part16 = part16_buffer;
|
|
tme_memory_atomic_write16(parts16, part16, rwlock, sizeof(tme_uint16_t));
|
|
}
|
|
}
|
|
}
|
|
|
|
/* if we still have bytes to write: */
|
|
if (__tme_predict_false(count > 0)) {
|
|
|
|
/* we must have less than a full 16-bit part to write: */
|
|
assert (count < sizeof(tme_uint16_t));
|
|
|
|
/* make a mask that clears for the data to write in the last
|
|
16-bit memory part: */
|
|
part16_mask
|
|
= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? _tme_memory_type_mask(tme_uint16_t, << (count * 8))
|
|
: _tme_memory_type_mask(tme_uint16_t, >> (count * 8)));
|
|
|
|
/* copy from the buffer the bytes to write in the last
|
|
16-bit memory part: */
|
|
part16_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part16_buffer);
|
|
count_done = count;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* compare-and-exchange the last 16-bit memory part: */
|
|
part16 = tme_memory_read16((const tme_uint16_t *) buffer, sizeof(tme_uint16_t));
|
|
do {
|
|
part16_cmp = part16;
|
|
part16 = (part16 & part16_mask) | part16_buffer;
|
|
part16 = tme_memory_atomic_cx16(parts16, part16_cmp, part16, rwlock, sizeof(tme_uint16_t));
|
|
} while (part16 != part16_cmp);
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
/* make an 8-bit pointer to the memory: */
|
|
parts8 = (tme_shared tme_uint8_t *) mem;
|
|
|
|
/* if this pointer is not 8-bit aligned: */
|
|
if (__tme_predict_false((((unsigned long) parts8) % sizeof(tme_uint8_t)) != 0)) {
|
|
|
|
/* get the misalignment from the previous 8-bit boundary: */
|
|
count_misaligned = ((unsigned long) parts8) % sizeof(tme_uint8_t);
|
|
|
|
/* truncate this pointer to the previous 8-bit boundary: */
|
|
parts8 = (tme_shared tme_uint8_t *) (((unsigned long) parts8) & (((unsigned long) 0) - sizeof(tme_uint8_t)));
|
|
|
|
/* get the number of bytes to write in the first 8-bit memory part: */
|
|
count_done = sizeof(tme_uint8_t) - count_misaligned;
|
|
if (__tme_predict_false(count_done > count)) {
|
|
count_done = count;
|
|
}
|
|
|
|
/* make a mask that clears for the data to write in the
|
|
first 8-bit memory part: */
|
|
part8_mask = 1;
|
|
part8_mask = (part8_mask << (count_done * 8)) - 1;
|
|
part8_mask
|
|
<<= (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
|
|
? (count_misaligned * 8)
|
|
: (8 - ((count_misaligned + count_done) * 8)));
|
|
part8_mask = ~part8_mask;
|
|
|
|
/* copy from the buffer the bytes to write in the first
|
|
8-bit memory part: */
|
|
part8_buffer = 0;
|
|
part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
|
|
count -= count_done;
|
|
do {
|
|
*part_buffer = *buffer;
|
|
part_buffer++;
|
|
buffer++;
|
|
} while (--count_done != 0);
|
|
|
|
/* compare-and-exchange the first 8-bit memory part: */
|
|
part8 = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
|
|
do {
|
|
part8_cmp = part8;
|
|
part8 = (part8 & part8_mask) | part8_buffer;
|
|
part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
|
|
} while (part8 != part8_cmp);
|
|
parts8++;
|
|
}
|
|
|
|
/* if we have full 8-bit parts to write: */
|
|
if (__tme_predict_true(count >= sizeof(tme_uint8_t))) {
|
|
|
|
/* if the buffer is 8-bit aligned: */
|
|
if (__tme_predict_true((((unsigned long) buffer) % sizeof(tme_uint8_t)) == 0)) {
|
|
|
|
/* write full 8-bit parts without shifting: */
|
|
do {
|
|
part8 = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
|
|
          tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));

          /* advance: */
          parts8++;
          buffer += sizeof(tme_uint8_t);
          count -= sizeof(tme_uint8_t);
        } while (count >= sizeof(tme_uint8_t));
      }

      /* otherwise, the buffer is not 8-bit aligned: */
      else {

        /* get the misalignment to the next 8-bit boundary: */
        count_misaligned = (sizeof(tme_uint8_t) - ((unsigned int) (unsigned long) buffer)) % sizeof(tme_uint8_t);

        /* copy from the buffer until it is aligned: */
        part8_buffer = 0;
        part_buffer = ((tme_uint8_t *) &part8_buffer);
        count_done = count_misaligned;
        count -= count_misaligned;
        do {
          *part_buffer = *buffer;
          part_buffer++;
          buffer++;
        } while (--count_done != 0);

        /* write full 8-bit words with shifting: */
        bits_misaligned = count_misaligned * 8;
        part8 = part8_buffer;
        for (; count >= sizeof(tme_uint8_t); ) {
          part8_next = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
          if (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE) {
            part8 |= (part8_next << bits_misaligned);
            tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
            part8 = (part8_next >> (8 - bits_misaligned));
          }
          else {
            part8 |= (part8_next >> bits_misaligned);
            tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
            part8 = (part8_next << (8 - bits_misaligned));
          }

          /* advance: */
          parts8++;
          buffer += sizeof(tme_uint8_t);
          count -= sizeof(tme_uint8_t);
        }

        /* calculate how many more bytes there are to write in this
           8-bit memory part: */
        count_done = sizeof(tme_uint8_t) - count_misaligned;
        part8_buffer = part8;

        /* if we can't write one more full 8-bit memory part: */
        if (count_done > count) {

          /* we will reread this data to write below: */
          buffer -= count_misaligned;
          count += count_misaligned;
        }

        /* otherwise, we can write one more full 8-bit memory part: */
        else {

          /* copy from the buffer until we have the full 8-bit part: */
          part_buffer = ((tme_uint8_t *) &part8_buffer) + count_misaligned;
          count -= count_done;
          do {
            *part_buffer = *buffer;
            part_buffer++;
            buffer++;
          } while (--count_done != 0);

          /* write the last full 8-bit memory part: */
          part8 = part8_buffer;
          tme_memory_atomic_write8(parts8, part8, rwlock, sizeof(tme_uint8_t));
        }
      }
    }

    /* if we still have bytes to write: */
    if (__tme_predict_false(count > 0)) {

      /* we must have less than a full 8-bit part to write: */
      assert (count < sizeof(tme_uint8_t));

      /* make a mask that clears for the data to write in the last
         8-bit memory part: */
      part8_mask
        = (TME_ENDIAN_NATIVE == TME_ENDIAN_LITTLE
           ? _tme_memory_type_mask(tme_uint8_t, << (count * 8))
           : _tme_memory_type_mask(tme_uint8_t, >> (count * 8)));

      /* copy from the buffer the bytes to write in the last
         8-bit memory part: */
      part8_buffer = 0;
      part_buffer = ((tme_uint8_t *) &part8_buffer);
      count_done = count;
      do {
        *part_buffer = *buffer;
        part_buffer++;
        buffer++;
      } while (--count_done != 0);

      /* compare-and-exchange the last 8-bit memory part: */
      part8 = tme_memory_read8((const tme_uint8_t *) buffer, sizeof(tme_uint8_t));
      do {
        part8_cmp = part8;
        part8 = (part8 & part8_mask) | part8_buffer;
        part8 = tme_memory_atomic_cx8(parts8, part8_cmp, part8, rwlock, sizeof(tme_uint8_t));
      } while (part8 != part8_cmp);
    }

  }
}
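
/* for illustration only: the trailing partial part above is merged into
   memory with the usual read/mask/merge/compare-and-exchange pattern.
   the disabled sketch below shows the same idea in portable C11, outside
   the tme type system; example_partial_write32 and its arguments are
   hypothetical names, and it assumes 0 < count < 4 and a little-endian
   byte layout: */
#if 0
#include <stdatomic.h>
#include <stdint.h>

/* merge the low `count` bytes of `src` into the 32-bit word at `mem`,
   leaving the remaining bytes untouched: */
static void
example_partial_write32(_Atomic uint32_t *mem, const uint8_t *src, unsigned int count)
{
  uint32_t mask;
  uint32_t merge;
  uint32_t old;
  unsigned int i;

  /* a mask that keeps the bytes that are not being written: */
  mask = ~(uint32_t) 0 << (count * 8);

  /* gather the replacement bytes into the low part of a word: */
  merge = 0;
  for (i = 0; i < count; i++) {
    merge |= ((uint32_t) src[i]) << (i * 8);
  }

  /* retry until no other writer slips in between the read and the write: */
  old = atomic_load(mem);
  while (!atomic_compare_exchange_weak(mem, &old, (old & mask) | merge))
    ;
}
#endif /* 0 */
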
/* the 8-bit atomic operations: */

/* undefine any macro version of tme_memory_atomic_add8: */
#undef tme_memory_atomic_add8

/* the 8-bit atomic add function: */
tme_uint8_t
tme_memory_atomic_add8(tme_shared tme_uint8_t *memory,
                       tme_uint8_t operand,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint8_t value_read;
  tme_uint8_t value_written;
  tme_uint8_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock. (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    value_written = value_read + operand;
    tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 8-bit
     add at all, or if it can't do it at this alignment.

     we emulate the atomic 8-bit add with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read8(memory, rwlock, align_min);

    /* spin the add in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read + operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
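
/* the functions that follow (sub8, mul8, div8, and8, or8, xor8, not8,
   neg8) repeat the pattern of tme_memory_atomic_add8 above; only the
   expression that computes value_written changes, and not8/neg8 take no
   operand.  each one returns the value the memory held before the
   update, so fetch-and-modify semantics come for free.  purely as an
   illustration (this disabled sketch is not part of the generated code,
   and example_fetch_and_add8 is a made-up name): */
#if 0
static tme_uint8_t
example_fetch_and_add8(tme_shared tme_uint8_t *counter, tme_rwlock_t *counter_rwlock)
{
  /* the return value is the counter as it was before the increment: */
  return (tme_memory_atomic_add8(counter, 1, counter_rwlock, sizeof(tme_uint8_t)));
}
#endif /* 0 */
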
/* undefine any macro version of tme_memory_atomic_sub8: */
|
|
#undef tme_memory_atomic_sub8
|
|
|
|
/* the 8-bit atomic sub function: */
|
|
tme_uint8_t
|
|
tme_memory_atomic_sub8(tme_shared tme_uint8_t *memory,
|
|
tme_uint8_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint8_t value_read;
|
|
tme_uint8_t value_written;
|
|
tme_uint8_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
|
|
value_written = value_read - operand;
|
|
tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 8-bit
|
|
sub at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 8-bit sub with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
|
|
|
|
/* spin the sub in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read - operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_mul8: */
|
|
#undef tme_memory_atomic_mul8
|
|
|
|
/* the 8-bit atomic mul function: */
|
|
tme_uint8_t
|
|
tme_memory_atomic_mul8(tme_shared tme_uint8_t *memory,
|
|
tme_uint8_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint8_t value_read;
|
|
tme_uint8_t value_written;
|
|
tme_uint8_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
|
|
value_written = value_read * operand;
|
|
tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 8-bit
|
|
mul at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 8-bit mul with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
|
|
|
|
/* spin the mul in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read * operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_div8: */
|
|
#undef tme_memory_atomic_div8
|
|
|
|
/* the 8-bit atomic div function: */
|
|
tme_uint8_t
|
|
tme_memory_atomic_div8(tme_shared tme_uint8_t *memory,
|
|
tme_uint8_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint8_t value_read;
|
|
tme_uint8_t value_written;
|
|
tme_uint8_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
|
|
value_written = value_read / operand;
|
|
tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 8-bit
|
|
div at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 8-bit div with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
|
|
|
|
/* spin the div in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read / operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_and8: */
|
|
#undef tme_memory_atomic_and8
|
|
|
|
/* the 8-bit atomic and function: */
|
|
tme_uint8_t
|
|
tme_memory_atomic_and8(tme_shared tme_uint8_t *memory,
|
|
tme_uint8_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint8_t value_read;
|
|
tme_uint8_t value_written;
|
|
tme_uint8_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
|
|
value_written = value_read & operand;
|
|
tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 8-bit
|
|
and at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 8-bit and with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
|
|
|
|
/* spin the and in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read & operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_or8: */
|
|
#undef tme_memory_atomic_or8
|
|
|
|
/* the 8-bit atomic or function: */
|
|
tme_uint8_t
|
|
tme_memory_atomic_or8(tme_shared tme_uint8_t *memory,
|
|
tme_uint8_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint8_t value_read;
|
|
tme_uint8_t value_written;
|
|
tme_uint8_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
|
|
value_written = value_read | operand;
|
|
tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 8-bit
|
|
or at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 8-bit or with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
|
|
|
|
/* spin the or in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read | operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_xor8: */
|
|
#undef tme_memory_atomic_xor8
|
|
|
|
/* the 8-bit atomic xor function: */
|
|
tme_uint8_t
|
|
tme_memory_atomic_xor8(tme_shared tme_uint8_t *memory,
|
|
tme_uint8_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint8_t value_read;
|
|
tme_uint8_t value_written;
|
|
tme_uint8_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
|
|
value_written = value_read ^ operand;
|
|
tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 8-bit
|
|
xor at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 8-bit xor with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
|
|
|
|
/* spin the xor in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read ^ operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_not8: */
|
|
#undef tme_memory_atomic_not8
|
|
|
|
/* the 8-bit atomic not function: */
|
|
tme_uint8_t
|
|
tme_memory_atomic_not8(tme_shared tme_uint8_t *memory,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint8_t value_read;
|
|
tme_uint8_t value_written;
|
|
tme_uint8_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
|
|
value_written = ~value_read;
|
|
tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 8-bit
|
|
not at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 8-bit not with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
|
|
|
|
/* spin the not in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = ~value_read;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_neg8: */
|
|
#undef tme_memory_atomic_neg8
|
|
|
|
/* the 8-bit atomic neg function: */
|
|
tme_uint8_t
|
|
tme_memory_atomic_neg8(tme_shared tme_uint8_t *memory,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint8_t value_read;
|
|
tme_uint8_t value_written;
|
|
tme_uint8_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
|
|
value_written = 0 - value_read;
|
|
tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 8-bit
|
|
neg at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 8-bit neg with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
|
|
|
|
/* spin the neg in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = 0 - value_read;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_xchg8: */
|
|
#undef tme_memory_atomic_xchg8
|
|
|
|
/* the 8-bit atomic xchg function: */
|
|
tme_uint8_t
|
|
tme_memory_atomic_xchg8(tme_shared tme_uint8_t *memory,
|
|
tme_uint8_t value_written,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint8_t value_read;
|
|
tme_uint8_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
|
|
tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 8-bit
|
|
xchg at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 8-bit xchg with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read8(memory, rwlock, align_min);
|
|
|
|
/* spin the xchg in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx8(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_cx8: */
#undef tme_memory_atomic_cx8

/* the 8-bit atomic cx function: */
tme_uint8_t
tme_memory_atomic_cx8(tme_shared tme_uint8_t *memory,
                      tme_uint8_t value_cmp,
                      tme_uint8_t value_written,
                      tme_rwlock_t *rwlock,
                      unsigned int align_min)
{
  tme_uint8_t value_read;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock. (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    if (value_read == value_cmp) {
      tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
    }
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 8-bit
     cx at all, or if it can't do it at this alignment.

     we assume that these problematic atomic cxs are rare,
     and to emulate them we simply stop all other threads while
     doing the cx: */
  else {
    tme_thread_suspend_others();
    value_read = tme_memory_read8((_tme_const tme_uint8_t *) memory, align_min);
    if (value_read == value_cmp) {
      tme_memory_write8((tme_uint8_t *) memory, value_written, align_min);
    }
    tme_thread_resume_others();
  }

  /* return the value read: */
  return (value_read);
}
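
/* tme_memory_atomic_cx8 above is the primitive that the emulated
   read-modify-write operations in this file are built on, and its own
   fallback suspends every other thread, so it is only intended for the
   rare misaligned or unsupported case.  purely as an illustration, a
   hypothetical 8-bit test-and-set could be phrased on top of it like
   this (example_try_lock8 is not part of the generated code): */
#if 0
static int
example_try_lock8(tme_shared tme_uint8_t *lock_byte, tme_rwlock_t *rwlock)
{
  /* the lock is acquired iff the byte was zero and is now one: */
  return (tme_memory_atomic_cx8(lock_byte, 0, 1, rwlock, sizeof(tme_uint8_t)) == 0);
}
#endif /* 0 */
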
/* the 16-bit atomic operations: */
|
|
|
|
/* undefine any macro version of tme_memory_atomic_add16: */
|
|
#undef tme_memory_atomic_add16
|
|
|
|
/* the 16-bit atomic add function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_add16(tme_shared tme_uint16_t *memory,
|
|
tme_uint16_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_written;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
value_written = value_read + operand;
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
add at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit add with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the add in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read + operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_sub16: */
|
|
#undef tme_memory_atomic_sub16
|
|
|
|
/* the 16-bit atomic sub function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_sub16(tme_shared tme_uint16_t *memory,
|
|
tme_uint16_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_written;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
value_written = value_read - operand;
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
sub at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit sub with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the sub in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read - operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_mul16: */
|
|
#undef tme_memory_atomic_mul16
|
|
|
|
/* the 16-bit atomic mul function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_mul16(tme_shared tme_uint16_t *memory,
|
|
tme_uint16_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_written;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
value_written = value_read * operand;
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
mul at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit mul with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the mul in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read * operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_div16: */
|
|
#undef tme_memory_atomic_div16
|
|
|
|
/* the 16-bit atomic div function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_div16(tme_shared tme_uint16_t *memory,
|
|
tme_uint16_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_written;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
value_written = value_read / operand;
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
div at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit div with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the div in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read / operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_and16: */
|
|
#undef tme_memory_atomic_and16
|
|
|
|
/* the 16-bit atomic and function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_and16(tme_shared tme_uint16_t *memory,
|
|
tme_uint16_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_written;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
value_written = value_read & operand;
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
and at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit and with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the and in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read & operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_or16: */
|
|
#undef tme_memory_atomic_or16
|
|
|
|
/* the 16-bit atomic or function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_or16(tme_shared tme_uint16_t *memory,
|
|
tme_uint16_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_written;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
value_written = value_read | operand;
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
or at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit or with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the or in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read | operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_xor16: */
|
|
#undef tme_memory_atomic_xor16
|
|
|
|
/* the 16-bit atomic xor function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_xor16(tme_shared tme_uint16_t *memory,
|
|
tme_uint16_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_written;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
value_written = value_read ^ operand;
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
xor at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit xor with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the xor in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read ^ operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_not16: */
|
|
#undef tme_memory_atomic_not16
|
|
|
|
/* the 16-bit atomic not function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_not16(tme_shared tme_uint16_t *memory,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_written;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
value_written = ~value_read;
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
not at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit not with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the not in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = ~value_read;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_neg16: */
|
|
#undef tme_memory_atomic_neg16
|
|
|
|
/* the 16-bit atomic neg function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_neg16(tme_shared tme_uint16_t *memory,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_written;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
value_written = 0 - value_read;
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
neg at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit neg with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the neg in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = 0 - value_read;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_xchg16: */
|
|
#undef tme_memory_atomic_xchg16
|
|
|
|
/* the 16-bit atomic xchg function: */
|
|
tme_uint16_t
|
|
tme_memory_atomic_xchg16(tme_shared tme_uint16_t *memory,
|
|
tme_uint16_t value_written,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint16_t value_read;
|
|
tme_uint16_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
|
|
tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 16-bit
|
|
xchg at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 16-bit xchg with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
|
|
|
|
/* spin the xchg in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_cx16: */
#undef tme_memory_atomic_cx16

/* the 16-bit atomic cx function: */
tme_uint16_t
tme_memory_atomic_cx16(tme_shared tme_uint16_t *memory,
                       tme_uint16_t value_cmp,
                       tme_uint16_t value_written,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint16_t value_read;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock. (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    if (value_read == value_cmp) {
      tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
    }
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 16-bit
     cx at all, or if it can't do it at this alignment.

     we assume that these problematic atomic cxs are rare,
     and to emulate them we simply stop all other threads while
     doing the cx: */
  else {
    tme_thread_suspend_others();
    value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    if (value_read == value_cmp) {
      tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
    }
    tme_thread_resume_others();
  }

  /* return the value read: */
  return (value_read);
}
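
/* any other 16-bit read-modify-write operation can be emulated with the
   same compare-and-exchange loop the functions above use; purely as an
   illustration, a hypothetical atomic 16-bit maximum would look like
   this (example_atomic_max16 is not part of the generated code): */
#if 0
static tme_uint16_t
example_atomic_max16(tme_shared tme_uint16_t *memory,
                     tme_uint16_t operand,
                     tme_rwlock_t *rwlock,
                     unsigned int align_min)
{
  tme_uint16_t value_read;
  tme_uint16_t value_written;
  tme_uint16_t value_read_verify;

  /* read the memory, then retry the compare-and-exchange until the
     update applies to an unchanged value: */
  value_read = tme_memory_atomic_read16(memory, rwlock, align_min);
  for (;;) {
    value_written = (operand > value_read ? operand : value_read);
    value_read_verify = tme_memory_atomic_cx16(memory, value_read, value_written, rwlock, align_min);
    if (value_read_verify == value_read) {
      break;
    }
    value_read = value_read_verify;
  }

  /* like the generated operations, return the value read before the update: */
  return (value_read);
}
#endif /* 0 */
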
/* undefine any macro version of tme_memory_atomic_read16: */
#undef tme_memory_atomic_read16

/* the 16-bit atomic read function: */
tme_uint16_t
tme_memory_atomic_read16(_tme_const tme_shared tme_uint16_t *memory,
                         tme_rwlock_t *rwlock,
                         unsigned int align_min)
{
  tme_uint16_t value_read;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock. (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_rdlock(rwlock);
    }
    value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 16-bit
     read at all, or if it can't do it at this alignment.

     we assume that these problematic atomic reads are rare,
     and to emulate them we simply stop all other threads while
     doing the read: */
  else {
    tme_thread_suspend_others();
    value_read = tme_memory_read16((_tme_const tme_uint16_t *) memory, align_min);
    tme_thread_resume_others();
  }

  /* return the value read: */
  return (value_read);
}
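
/* note that tme_memory_atomic_read16 above takes the rwlock for reading
   (tme_rwlock_rdlock) rather than for writing, so when the lock is the
   only protection available, concurrent atomic reads of the same memory
   can still proceed in parallel; every modifying operation in this file
   takes the write lock instead. */
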
/* undefine any macro version of tme_memory_atomic_write16: */
#undef tme_memory_atomic_write16

/* the 16-bit atomic write function: */
void
tme_memory_atomic_write16(tme_shared tme_uint16_t *memory,
                          tme_uint16_t value_written,
                          tme_rwlock_t *rwlock,
                          unsigned int align_min)
{

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock. (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 16-bit
     write at all, or if it can't do it at this alignment.

     we assume that these problematic atomic writes are rare,
     and to emulate them we simply stop all other threads while
     doing the write: */
  else {
    tme_thread_suspend_others();
    tme_memory_write16((tme_uint16_t *) memory, value_written, align_min);
    tme_thread_resume_others();
  }
}
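
/* these out-of-line 16-bit read and write functions exist for the cases
   the inline versions cannot handle: when the host cannot make the
   16-bit access atomically at the given alignment, the fallback is to
   suspend every other thread for the duration of the access, which is
   acceptable only because such accesses are assumed to be rare. */
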
/* the 32-bit atomic operations: */
|
|
|
|
/* undefine any macro version of tme_memory_atomic_add32: */
|
|
#undef tme_memory_atomic_add32
|
|
|
|
/* the 32-bit atomic add function: */
|
|
tme_uint32_t
|
|
tme_memory_atomic_add32(tme_shared tme_uint32_t *memory,
|
|
tme_uint32_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint32_t value_read;
|
|
tme_uint32_t value_written;
|
|
tme_uint32_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
|
|
value_written = value_read + operand;
|
|
tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 32-bit
|
|
add at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 32-bit add with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
|
|
|
|
/* spin the add in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read + operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_sub32: */
|
|
#undef tme_memory_atomic_sub32
|
|
|
|
/* the 32-bit atomic sub function: */
|
|
tme_uint32_t
|
|
tme_memory_atomic_sub32(tme_shared tme_uint32_t *memory,
|
|
tme_uint32_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint32_t value_read;
|
|
tme_uint32_t value_written;
|
|
tme_uint32_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
|
|
value_written = value_read - operand;
|
|
tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 32-bit
|
|
sub at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 32-bit sub with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
|
|
|
|
/* spin the sub in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read - operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_mul32: */
|
|
#undef tme_memory_atomic_mul32
|
|
|
|
/* the 32-bit atomic mul function: */
|
|
tme_uint32_t
|
|
tme_memory_atomic_mul32(tme_shared tme_uint32_t *memory,
|
|
tme_uint32_t operand,
|
|
tme_rwlock_t *rwlock,
|
|
unsigned int align_min)
|
|
{
|
|
tme_uint32_t value_read;
|
|
tme_uint32_t value_written;
|
|
tme_uint32_t value_read_verify;
|
|
|
|
/* if we can't make direct accesses at all, all atomic
|
|
accesses must be done under lock. (when threads are
|
|
cooperative the actual locking isn't needed): */
|
|
if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_wrlock(rwlock);
|
|
}
|
|
value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
|
|
value_written = value_read * operand;
|
|
tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
|
|
if (!TME_THREADS_COOPERATIVE) {
|
|
tme_rwlock_unlock(rwlock);
|
|
}
|
|
}
|
|
|
|
/* otherwise, threads are not cooperative and this host CPU
|
|
can make atomic accesses to at least the most common memory
|
|
size.
|
|
|
|
in that case, the only reason this function should get
|
|
called is if the host CPU can't do an atomic 32-bit
|
|
mul at all, or if it can't do it at this alignment.
|
|
|
|
we emulate the atomic 32-bit mul with a compare-and-exchange: */
|
|
else {
|
|
|
|
/* do an atomic read of the memory: */
|
|
value_read = tme_memory_atomic_read32(memory, rwlock, align_min);
|
|
|
|
/* spin the mul in a compare-and-exchange loop: */
|
|
for (;;) {
|
|
|
|
/* make the value to write: */
|
|
value_written = value_read * operand;
|
|
|
|
/* try the compare-and-exchange: */
|
|
value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);
|
|
|
|
/* if the compare-and-exchange failed: */
|
|
if (__tme_predict_false(value_read_verify != value_read)) {
|
|
|
|
/* loop with the new value read from the memory: */
|
|
value_read = value_read_verify;
|
|
continue;
|
|
}
|
|
|
|
/* stop now: */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* return the value read: */
|
|
return (value_read);
|
|
}
|
|
|
|
/* undefine any macro version of tme_memory_atomic_div32: */
#undef tme_memory_atomic_div32

/* the 32-bit atomic div function: */
tme_uint32_t
tme_memory_atomic_div32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_read;
  tme_uint32_t value_written;
  tme_uint32_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_written = value_read / operand;
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     div at all, or if it can't do it at this alignment.

     we emulate the atomic 32-bit div with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read32(memory, rwlock, align_min);

    /* spin the div in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read / operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
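/* a caller-side sketch (hypothetical helper, not produced by
   memory-auto.sh, kept out of the build with #if 0): both paths of
   tme_memory_atomic_div32() above divide by the operand unconditionally,
   so the caller must guarantee a nonzero operand, for example: */
#if 0
static tme_uint32_t
_tme_memory_div32_usage_sketch(tme_shared tme_uint32_t *word, tme_uint32_t divisor, tme_rwlock_t *rwlock)
{
  /* only divide the shared word in place when the divisor is usable;
     otherwise just return its current contents unchanged: */
  if (divisor == 0) {
    return (tme_memory_atomic_read32(word, rwlock, sizeof(tme_uint32_t)));
  }
  return (tme_memory_atomic_div32(word, divisor, rwlock, sizeof(tme_uint32_t)));
}
#endif /* usage sketch */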
/* undefine any macro version of tme_memory_atomic_and32: */
#undef tme_memory_atomic_and32

/* the 32-bit atomic and function: */
tme_uint32_t
tme_memory_atomic_and32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_read;
  tme_uint32_t value_written;
  tme_uint32_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_written = value_read & operand;
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     and at all, or if it can't do it at this alignment.

     we emulate the atomic 32-bit and with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read32(memory, rwlock, align_min);

    /* spin the and in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read & operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_or32: */
#undef tme_memory_atomic_or32

/* the 32-bit atomic or function: */
tme_uint32_t
tme_memory_atomic_or32(tme_shared tme_uint32_t *memory,
                       tme_uint32_t operand,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint32_t value_read;
  tme_uint32_t value_written;
  tme_uint32_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_written = value_read | operand;
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     or at all, or if it can't do it at this alignment.

     we emulate the atomic 32-bit or with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read32(memory, rwlock, align_min);

    /* spin the or in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read | operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_xor32: */
#undef tme_memory_atomic_xor32

/* the 32-bit atomic xor function: */
tme_uint32_t
tme_memory_atomic_xor32(tme_shared tme_uint32_t *memory,
                        tme_uint32_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_read;
  tme_uint32_t value_written;
  tme_uint32_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_written = value_read ^ operand;
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     xor at all, or if it can't do it at this alignment.

     we emulate the atomic 32-bit xor with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read32(memory, rwlock, align_min);

    /* spin the xor in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read ^ operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_not32: */
#undef tme_memory_atomic_not32

/* the 32-bit atomic not function: */
tme_uint32_t
tme_memory_atomic_not32(tme_shared tme_uint32_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_read;
  tme_uint32_t value_written;
  tme_uint32_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_written = ~value_read;
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     not at all, or if it can't do it at this alignment.

     we emulate the atomic 32-bit not with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read32(memory, rwlock, align_min);

    /* spin the not in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = ~value_read;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_neg32: */
#undef tme_memory_atomic_neg32

/* the 32-bit atomic neg function: */
tme_uint32_t
tme_memory_atomic_neg32(tme_shared tme_uint32_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint32_t value_read;
  tme_uint32_t value_written;
  tme_uint32_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    value_written = 0 - value_read;
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     neg at all, or if it can't do it at this alignment.

     we emulate the atomic 32-bit neg with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read32(memory, rwlock, align_min);

    /* spin the neg in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = 0 - value_read;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_xchg32: */
#undef tme_memory_atomic_xchg32

/* the 32-bit atomic xchg function: */
tme_uint32_t
tme_memory_atomic_xchg32(tme_shared tme_uint32_t *memory,
                         tme_uint32_t value_written,
                         tme_rwlock_t *rwlock,
                         unsigned int align_min)
{
  tme_uint32_t value_read;
  tme_uint32_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     xchg at all, or if it can't do it at this alignment.

     we emulate the atomic 32-bit xchg with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read32(memory, rwlock, align_min);

    /* spin the xchg in a compare-and-exchange loop: */
    for (;;) {

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx32(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_cx32: */
#undef tme_memory_atomic_cx32

/* the 32-bit atomic cx function: */
tme_uint32_t
tme_memory_atomic_cx32(tme_shared tme_uint32_t *memory,
                       tme_uint32_t value_cmp,
                       tme_uint32_t value_written,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint32_t value_read;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    if (value_read == value_cmp) {
      tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    }
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     cx at all, or if it can't do it at this alignment.

     we assume that these problematic atomic cxs are rare,
     and to emulate them we simply stop all other threads while
     doing the cx: */
  else {
    tme_thread_suspend_others();
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    if (value_read == value_cmp) {
      tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    }
    tme_thread_resume_others();
  }

  /* return the value read: */
  return (value_read);
}
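/* a caller-side sketch (hypothetical helper, not produced by
   memory-auto.sh, kept out of the build with #if 0):
   tme_memory_atomic_cx32() always returns the value it read, so the
   exchange succeeded exactly when that return value equals value_cmp: */
#if 0
static int
_tme_memory_cx32_usage_sketch(tme_shared tme_uint32_t *word, tme_rwlock_t *rwlock)
{
  tme_uint32_t value_seen;

  /* try to change the word from zero to one: */
  value_seen = tme_memory_atomic_cx32(word, 0, 1, rwlock, sizeof(tme_uint32_t));

  /* the exchange happened exactly when the word was seen to hold the
     expected zero; otherwise the word held value_seen and was left
     unchanged: */
  return (value_seen == 0);
}
#endif /* usage sketch */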
/* undefine any macro version of tme_memory_atomic_read32: */
#undef tme_memory_atomic_read32

/* the 32-bit atomic read function: */
tme_uint32_t
tme_memory_atomic_read32(_tme_const tme_shared tme_uint32_t *memory,
                         tme_rwlock_t *rwlock,
                         unsigned int align_min)
{
  tme_uint32_t value_read;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_rdlock(rwlock);
    }
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     read at all, or if it can't do it at this alignment.

     we assume that these problematic atomic reads are rare,
     and to emulate them we simply stop all other threads while
     doing the read: */
  else {
    tme_thread_suspend_others();
    value_read = tme_memory_read32((_tme_const tme_uint32_t *) memory, align_min);
    tme_thread_resume_others();
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_write32: */
#undef tme_memory_atomic_write32

/* the 32-bit atomic write function: */
void
tme_memory_atomic_write32(tme_shared tme_uint32_t *memory,
                          tme_uint32_t value_written,
                          tme_rwlock_t *rwlock,
                          unsigned int align_min)
{

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 32-bit
     write at all, or if it can't do it at this alignment.

     we assume that these problematic atomic writes are rare,
     and to emulate them we simply stop all other threads while
     doing the write: */
  else {
    tme_thread_suspend_others();
    tme_memory_write32((tme_uint32_t *) memory, value_written, align_min);
    tme_thread_resume_others();
  }
}
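/* a caller-side sketch (hypothetical helpers, not produced by
   memory-auto.sh, kept out of the build with #if 0): as the code above
   shows, tme_memory_atomic_write32() and tme_memory_atomic_read32()
   guarantee only that the store or load itself is not torn, even when
   the host CPU can't make the access atomically on its own: */
#if 0
static void
_tme_memory_status_publish_sketch(tme_shared tme_uint32_t *status, tme_uint32_t status_new, tme_rwlock_t *rwlock)
{
  /* publish the new status word in one piece: */
  tme_memory_atomic_write32(status, status_new, rwlock, sizeof(tme_uint32_t));
}

static tme_uint32_t
_tme_memory_status_observe_sketch(_tme_const tme_shared tme_uint32_t *status, tme_rwlock_t *rwlock)
{
  /* observe the status word in one piece: */
  return (tme_memory_atomic_read32(status, rwlock, sizeof(tme_uint32_t)));
}
#endif /* usage sketch */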
#ifdef TME_HAVE_INT64_T

/* the 64-bit atomic operations: */

/* undefine any macro version of tme_memory_atomic_add64: */
#undef tme_memory_atomic_add64

/* the 64-bit atomic add function: */
tme_uint64_t
tme_memory_atomic_add64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read + operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     add at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit add with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the add in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read + operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_sub64: */
#undef tme_memory_atomic_sub64

/* the 64-bit atomic sub function: */
tme_uint64_t
tme_memory_atomic_sub64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read - operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     sub at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit sub with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the sub in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read - operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_mul64: */
#undef tme_memory_atomic_mul64

/* the 64-bit atomic mul function: */
tme_uint64_t
tme_memory_atomic_mul64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read * operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     mul at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit mul with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the mul in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read * operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_div64: */
#undef tme_memory_atomic_div64

/* the 64-bit atomic div function: */
tme_uint64_t
tme_memory_atomic_div64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read / operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     div at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit div with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the div in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read / operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_and64: */
#undef tme_memory_atomic_and64

/* the 64-bit atomic and function: */
tme_uint64_t
tme_memory_atomic_and64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read & operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     and at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit and with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the and in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read & operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_or64: */
#undef tme_memory_atomic_or64

/* the 64-bit atomic or function: */
tme_uint64_t
tme_memory_atomic_or64(tme_shared tme_uint64_t *memory,
                       tme_uint64_t operand,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read | operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     or at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit or with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the or in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read | operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_xor64: */
#undef tme_memory_atomic_xor64

/* the 64-bit atomic xor function: */
tme_uint64_t
tme_memory_atomic_xor64(tme_shared tme_uint64_t *memory,
                        tme_uint64_t operand,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = value_read ^ operand;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     xor at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit xor with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the xor in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = value_read ^ operand;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_not64: */
#undef tme_memory_atomic_not64

/* the 64-bit atomic not function: */
tme_uint64_t
tme_memory_atomic_not64(tme_shared tme_uint64_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = ~value_read;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     not at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit not with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the not in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = ~value_read;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_neg64: */
#undef tme_memory_atomic_neg64

/* the 64-bit atomic neg function: */
tme_uint64_t
tme_memory_atomic_neg64(tme_shared tme_uint64_t *memory,
                        tme_rwlock_t *rwlock,
                        unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_written;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    value_written = 0 - value_read;
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     neg at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit neg with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the neg in a compare-and-exchange loop: */
    for (;;) {

      /* make the value to write: */
      value_written = 0 - value_read;

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_xchg64: */
#undef tme_memory_atomic_xchg64

/* the 64-bit atomic xchg function: */
tme_uint64_t
tme_memory_atomic_xchg64(tme_shared tme_uint64_t *memory,
                         tme_uint64_t value_written,
                         tme_rwlock_t *rwlock,
                         unsigned int align_min)
{
  tme_uint64_t value_read;
  tme_uint64_t value_read_verify;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     xchg at all, or if it can't do it at this alignment.

     we emulate the atomic 64-bit xchg with a compare-and-exchange: */
  else {

    /* do an atomic read of the memory: */
    value_read = tme_memory_atomic_read64(memory, rwlock, align_min);

    /* spin the xchg in a compare-and-exchange loop: */
    for (;;) {

      /* try the compare-and-exchange: */
      value_read_verify = tme_memory_atomic_cx64(memory, value_read, value_written, rwlock, align_min);

      /* if the compare-and-exchange failed: */
      if (__tme_predict_false(value_read_verify != value_read)) {

        /* loop with the new value read from the memory: */
        value_read = value_read_verify;
        continue;
      }

      /* stop now: */
      break;
    }
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_cx64: */
#undef tme_memory_atomic_cx64

/* the 64-bit atomic cx function: */
tme_uint64_t
tme_memory_atomic_cx64(tme_shared tme_uint64_t *memory,
                       tme_uint64_t value_cmp,
                       tme_uint64_t value_written,
                       tme_rwlock_t *rwlock,
                       unsigned int align_min)
{
  tme_uint64_t value_read;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    if (value_read == value_cmp) {
      tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    }
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     cx at all, or if it can't do it at this alignment.

     we assume that these problematic atomic cxs are rare,
     and to emulate them we simply stop all other threads while
     doing the cx: */
  else {
    tme_thread_suspend_others();
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    if (value_read == value_cmp) {
      tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    }
    tme_thread_resume_others();
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_read64: */
#undef tme_memory_atomic_read64

/* the 64-bit atomic read function: */
tme_uint64_t
tme_memory_atomic_read64(_tme_const tme_shared tme_uint64_t *memory,
                         tme_rwlock_t *rwlock,
                         unsigned int align_min)
{
  tme_uint64_t value_read;

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_rdlock(rwlock);
    }
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     read at all, or if it can't do it at this alignment.

     we assume that these problematic atomic reads are rare,
     and to emulate them we simply stop all other threads while
     doing the read: */
  else {
    tme_thread_suspend_others();
    value_read = tme_memory_read64((_tme_const tme_uint64_t *) memory, align_min);
    tme_thread_resume_others();
  }

  /* return the value read: */
  return (value_read);
}
/* undefine any macro version of tme_memory_atomic_write64: */
#undef tme_memory_atomic_write64

/* the 64-bit atomic write function: */
void
tme_memory_atomic_write64(tme_shared tme_uint64_t *memory,
                          tme_uint64_t value_written,
                          tme_rwlock_t *rwlock,
                          unsigned int align_min)
{

  /* if we can't make direct accesses at all, all atomic
     accesses must be done under lock.  (when threads are
     cooperative the actual locking isn't needed): */
  if (TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0) {
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_wrlock(rwlock);
    }
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    if (!TME_THREADS_COOPERATIVE) {
      tme_rwlock_unlock(rwlock);
    }
  }

  /* otherwise, threads are not cooperative and this host CPU
     can make atomic accesses to at least the most common memory
     size.

     in that case, the only reason this function should get
     called is if the host CPU can't do an atomic 64-bit
     write at all, or if it can't do it at this alignment.

     we assume that these problematic atomic writes are rare,
     and to emulate them we simply stop all other threads while
     doing the write: */
  else {
    tme_thread_suspend_others();
    tme_memory_write64((tme_uint64_t *) memory, value_written, align_min);
    tme_thread_resume_others();
  }
}

#endif /* TME_HAVE_INT64_T */