/* automatically generated by memory-auto.sh, do not edit! */

/*
 * Copyright (c) 2005, 2006 Matt Fredette
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Matt Fredette.
 * 4. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

_TME_RCSID("$Id: memory-auto.sh,v 1.2 2010/02/15 15:16:28 fredette Exp $");

/* macros: */

/* the plain partial read internal macro: */
#define _tme_memory_read(type_whole, type_part, mem, offset) \
  (((type_whole) \
    *((_tme_const type_part *) \
      (_tme_cast_pointer_const(tme_uint8_t *, type_whole *, mem) \
       + (offset)))) \
   << (8 * (TME_ENDIAN_NATIVE == TME_ENDIAN_BIG \
            ? (sizeof(type_whole) \
               - ((offset) + sizeof(type_part))) \
            : (offset))))

/* the plain partial write internal macro: */
#define _tme_memory_write(type_whole, type_part, mem, offset, x) \
  do { \
    *((type_part *) \
      (_tme_cast_pointer(tme_uint8_t *, type_whole *, mem) \
       + (offset))) \
      = (type_part) \
        (((type_whole) (x)) \
         >> (8 * (TME_ENDIAN_NATIVE == TME_ENDIAN_BIG \
                  ? (sizeof(type_whole) \
                     - ((offset) + sizeof(type_part))) \
                  : (offset)))); \
  } while (/* CONSTCOND */ 0)
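
/* an illustrative sketch (exposition only, not part of the generated
   header): assembling a 32-bit value from four byte-sized partial
   reads.  each part is shifted into place by the endian-aware shift
   above, so the same expression yields the host value whether
   TME_ENDIAN_NATIVE is big or little: */
#if 0
static tme_uint32_t
example_read32_by_bytes(_tme_const tme_uint32_t *mem)
{
  return (_tme_memory_read(tme_uint32_t, tme_uint8_t, mem, 0)
          | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, 1)
          | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, 2)
          | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, 3));
}
#endif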

/* this tests bits in a memory address: */
#define _tme_memory_address_test(mem, bits, align_min) \
  (((bits) & ~((align_min) - 1)) & ((unsigned long) (mem)))

/* this returns a mask of all-bits-one in a given type: */
#define _tme_memory_type_mask(type, shift) \
  ((type) ((((type) 0) - ((type) 1)) shift))
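
/* an illustrative sketch (exposition only): the shift argument is
   pasted in as tokens, so a caller passes an operator together with
   a shift count.  for example, with a 32-bit type: */
#if 0
/* all bits except the low eight, i.e. 0xffffff00: */
static _tme_const tme_uint32_t example_mask
  = _tme_memory_type_mask(tme_uint32_t, << 8);
#endif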

/* the bus 16-bit read slow function: */
tme_uint16_t tme_memory_bus_read16 _TME_P((_tme_const tme_shared tme_uint16_t *, tme_rwlock_t *, unsigned int, unsigned int));

/* the bus 16-bit write slow function: */
void tme_memory_bus_write16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int, unsigned int));

/* the bus 32-bit read slow function: */
tme_uint32_t tme_memory_bus_read32 _TME_P((_tme_const tme_shared tme_uint32_t *, tme_rwlock_t *, unsigned int, unsigned int));

/* the bus 32-bit write slow function: */
void tme_memory_bus_write32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int, unsigned int));

#ifdef TME_HAVE_INT64_T

/* the bus 64-bit read slow function: */
tme_uint64_t tme_memory_bus_read64 _TME_P((_tme_const tme_shared tme_uint64_t *, tme_rwlock_t *, unsigned int, unsigned int));

/* the bus 64-bit write slow function: */
void tme_memory_bus_write64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int, unsigned int));

#endif /* TME_HAVE_INT64_T */

/* the bus read buffer function and default macro implementation: */
void tme_memory_bus_read_buffer _TME_P((_tme_const tme_shared tme_uint8_t *, tme_uint8_t *, unsigned long, tme_rwlock_t *, unsigned int, unsigned int));
#define tme_memory_bus_read_buffer(mem, buffer, count, rwlock, align_min, bus_boundary) \
  do { \
    if (TME_THREADS_COOPERATIVE) { \
      memcpy((buffer), ((_tme_const tme_uint8_t *) (mem)), (count)); \
    } \
    else { \
      tme_memory_bus_read_buffer(((_tme_const tme_shared tme_uint8_t *) (mem)), ((tme_uint8_t *) _tme_audit_pointer(buffer)), (count), (rwlock), (align_min), (bus_boundary)); \
    } \
  } while (/* CONSTCOND */ 0)

/* the bus write buffer function and default macro implementation: */
void tme_memory_bus_write_buffer _TME_P((tme_shared tme_uint8_t *, _tme_const tme_uint8_t *, unsigned long, tme_rwlock_t *, unsigned int, unsigned int));
#define tme_memory_bus_write_buffer(mem, buffer, count, rwlock, align_min, bus_boundary) \
  do { \
    if (TME_THREADS_COOPERATIVE) { \
      memcpy((tme_uint8_t *) (mem), (buffer), (count)); \
    } \
    else { \
      tme_memory_bus_write_buffer(((tme_shared tme_uint8_t *) _tme_audit_pointer_shared(mem)), ((_tme_const tme_uint8_t *) _tme_audit_pointer_const(buffer)), (count), (rwlock), (align_min), (bus_boundary)); \
    } \
  } while (/* CONSTCOND */ 0)
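
/* an illustrative sketch (exposition only): copying a small frame out
   of shared device memory.  under cooperative threads the macro above
   is a plain memcpy; otherwise it calls the out-of-line function,
   which honors the rwlock, the minimum alignment, and the emulated
   bus boundary: */
#if 0
static void
example_copy_frame(_tme_const tme_shared tme_uint8_t *regs,
                   tme_rwlock_t *rwlock)
{
  tme_uint8_t frame[6];
  tme_memory_bus_read_buffer(regs, frame, sizeof(frame), rwlock,
                             sizeof(tme_uint8_t), sizeof(tme_uint16_t));
}
#endif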

/* the 8-bit atomic operations: */
tme_uint8_t tme_memory_atomic_add8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_sub8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_mul8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_div8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_and8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_or8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_xor8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_not8 _TME_P((tme_shared tme_uint8_t *, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_neg8 _TME_P((tme_shared tme_uint8_t *, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_xchg8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_rwlock_t *, unsigned int));
tme_uint8_t tme_memory_atomic_cx8 _TME_P((tme_shared tme_uint8_t *, tme_uint8_t, tme_uint8_t, tme_rwlock_t *, unsigned int));
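
/* an illustrative sketch (exposition only, and it assumes that these
   operations return the value the location held before they ran, and
   that a plain dereference of a single byte is itself atomic):
   setting a bit with one atomic or, and the equivalent
   compare-and-exchange loop: */
#if 0
static void
example_set_bit(tme_shared tme_uint8_t *status, tme_rwlock_t *rwlock)
{
  tme_uint8_t value_old;

  /* one call; the previous contents are returned and discarded: */
  (void) tme_memory_atomic_or8(status, 0x80, rwlock, sizeof(tme_uint8_t));

  /* the same effect with cx8, which swaps in the new value only if
     the location still equals value_old, and returns the previous
     contents either way: */
  do {
    value_old = *status;
  } while (tme_memory_atomic_cx8(status, value_old,
                                 (tme_uint8_t) (value_old | 0x80),
                                 rwlock, sizeof(tme_uint8_t))
           != value_old);
}
#endif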

/* the default 16-bit memory plain read macro: */
#define tme_memory_read16(mem, align_min) \
  ( \
   /* if we know at compile time that the memory is aligned \
      enough to read directly, do the single direct read. \
      \
      otherwise, if we know at compile time that the memory \
      is less aligned than the smallest acceptable parts size, \
      test if the memory is aligned enough to read directly, \
      and do the single direct read if it is: */ \
   (__tme_predict_true((_TME_ALIGNOF_INT16_T == 1 \
                        || (align_min) >= _TME_ALIGNOF_INT16_T) \
                       || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint16_t) \
                           && _tme_memory_address_test(mem, _TME_ALIGNOF_INT16_T - 1, align_min) == 0))) \
   ? \
   _tme_memory_read(tme_uint16_t, tme_uint16_t, mem, 0) \
   : \
   (_tme_memory_read(tme_uint16_t, tme_uint8_t, mem, (0 / 8)) \
    | _tme_memory_read(tme_uint16_t, tme_uint8_t, mem, (8 / 8))) \
  )
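
/* an illustrative sketch (exposition only): with an align_min of
   sizeof(tme_uint16_t) the alignment test folds away at compile time
   and the macro is a single 16-bit load; with an align_min of 1 on a
   host that needs 2-byte alignment, it becomes two byte loads merged
   by the shifts in _tme_memory_read.  the two reads are combined here
   only so both expansions appear in one function: */
#if 0
static tme_uint16_t
example_read16(_tme_const tme_uint16_t *mem_aligned,
               _tme_const tme_uint16_t *mem_maybe_unaligned)
{
  return (tme_memory_read16(mem_aligned, sizeof(tme_uint16_t))
          ^ tme_memory_read16(mem_maybe_unaligned, sizeof(tme_uint8_t)));
}
#endif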

/* the default 16-bit memory plain write macro: */
#define tme_memory_write16(mem, x, align_min) \
  do { \
    if \
      /* if we know at compile time that the memory is aligned \
         enough to write directly, do the single direct write. \
         \
         otherwise, if we know at compile time that the memory \
         is less aligned than the smallest acceptable parts size, \
         test if the memory is aligned enough to write directly, \
         and do the single direct write if it is: */ \
      (__tme_predict_true((_TME_ALIGNOF_INT16_T == 1 \
                           || (align_min) >= _TME_ALIGNOF_INT16_T) \
                          || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint16_t) \
                              && _tme_memory_address_test(mem, _TME_ALIGNOF_INT16_T - 1, align_min) == 0))) \
      { \
        _tme_memory_write(tme_uint16_t, tme_uint16_t, mem, 0, x); \
      } \
    else \
      { \
        _tme_memory_write(tme_uint16_t, tme_uint8_t, mem, (0 / 8), x); \
        _tme_memory_write(tme_uint16_t, tme_uint8_t, mem, (8 / 8), x); \
      } \
  } while (/* CONSTCOND */ 0)

/* the default 16-bit memory atomic read macro: */
#define tme_memory_atomic_read16(mem, lock, align_min) \
  ( \
   /* if threads are cooperative, do a plain read: */ \
   (TME_THREADS_COOPERATIVE) \
   ? \
   tme_memory_read16((_tme_const tme_uint16_t *) _tme_audit_type(mem, tme_uint16_t *), align_min) \
   /* otherwise, if we aren't locking for all memory accesses, and we can \
      make direct 16-bit accesses, and this memory is aligned \
      enough to make a single direct atomic access, do the single \
      direct atomic read: */ \
   : \
   (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                       && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) != 0 \
                       && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) - 1, align_min) == 0)) \
   ? \
   (*_tme_audit_type(mem, tme_uint16_t *)) \
   /* otherwise, we must do a slow indirect atomic read: */ \
   : \
   tme_memory_atomic_read16(mem, lock, align_min) \
  )
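
/* an illustrative sketch (exposition only): reading a 16-bit device
   register.  when TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) is
   nonzero and the register is aligned to it, the macro above reduces
   to one direct load; otherwise it falls through to the out-of-line
   tme_memory_atomic_read16, which takes the rwlock: */
#if 0
static tme_uint16_t
example_read_csr(_tme_const tme_shared tme_uint16_t *csr,
                 tme_rwlock_t *rwlock)
{
  return (tme_memory_atomic_read16(csr, rwlock, sizeof(tme_uint16_t)));
}
#endif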

/* the default 16-bit memory atomic write macro: */
#define tme_memory_atomic_write16(mem, x, lock, align_min) \
  do { \
    if \
      /* if threads are cooperative, do a plain write: */ \
      (TME_THREADS_COOPERATIVE) \
      { \
        tme_memory_write16((tme_uint16_t *) _tme_cast_pointer_shared(tme_uint16_t *, tme_uint16_t *, mem), x, align_min); \
      /* otherwise, if we aren't locking for all memory accesses, and we can \
         make direct 16-bit accesses, and this memory is aligned \
         enough to make a single direct atomic access, do the single \
         direct atomic write: */ \
      } \
    else if \
      (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                          && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) != 0 \
                          && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) - 1, align_min) == 0)) \
      { \
        (*_tme_audit_type(mem, tme_uint16_t *)) \
          = (x); \
      /* otherwise, we must do a slow indirect atomic write: */ \
      } \
    else \
      { \
        tme_memory_atomic_write16(mem, x, lock, align_min); \
      } \
  } while (/* CONSTCOND */ 0)

/* the default 16-bit memory bus read macro: */
#define tme_memory_bus_read16(mem, lock, align_min, bus_boundary) \
  ( \
   /* if threads are cooperative, do a plain read: */ \
   (TME_THREADS_COOPERATIVE) \
   ? \
   tme_memory_read16((_tme_const tme_uint16_t *) _tme_audit_type(mem, tme_uint16_t *), align_min) \
   /* otherwise, if we aren't locking for all memory accesses, the \
      host supports misaligned 16-bit accesses, the host's bus \
      boundary is greater than or equal to the emulated bus \
      boundary, and this memory is aligned enough, do a single \
      direct bus read: */ \
   : \
   (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                       && _TME_ALIGNOF_INT16_T < sizeof(tme_uint16_t) \
                       && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
                       && _tme_memory_address_test(mem, _TME_ALIGNOF_INT16_T - 1, align_min) == 0)) \
   ? \
   (*_tme_audit_type(mem, tme_uint16_t *)) \
   /* otherwise, if we're locking for all memory accesses, or \
      if this memory must cross at least one host bus boundary \
      and the host bus boundary is less than the emulated bus \
      boundary, do a slow indirect atomic read: */ \
   : \
   (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
                        || (sizeof(tme_uint16_t) > TME_MEMORY_BUS_BOUNDARY \
                            && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
   ? \
   tme_memory_atomic_read16(mem, lock, align_min) \
   /* otherwise, if the memory is not larger than the emulated \
      bus boundary, or if size-alignment would mean an atomic \
      host access and it is size-aligned, do a single atomic \
      read, which may be direct or slow: */ \
   : \
   (__tme_predict_true((sizeof(tme_uint16_t) <= (bus_boundary) \
                        || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) != 0 \
                            && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) <= sizeof(tme_uint16_t))) \
                       && _tme_memory_address_test(mem, sizeof(tme_uint16_t) - 1, align_min) == 0)) \
   ? \
   tme_memory_atomic_read16(mem, lock, sizeof(tme_uint16_t)) \
   /* otherwise, we must do a slow bus read: */ \
   : \
   tme_memory_bus_read16(mem, lock, align_min, bus_boundary) \
  )
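
/* an illustrative sketch (exposition only): a 16-bit read on behalf
   of an emulated bus with a 32-bit boundary.  the macro above
   resolves, largely at compile time, to one of four forms: a direct
   load, a slow atomic read at the caller's alignment, an atomic read
   promoted to size-alignment, or the out-of-line slow bus read: */
#if 0
static tme_uint16_t
example_bus_read16(_tme_const tme_shared tme_uint16_t *mem,
                   tme_rwlock_t *rwlock)
{
  return (tme_memory_bus_read16(mem, rwlock,
                                sizeof(tme_uint8_t),
                                sizeof(tme_uint32_t)));
}
#endif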

/* the default 16-bit memory bus write macro: */
#define tme_memory_bus_write16(mem, x, lock, align_min, bus_boundary) \
  do { \
    if \
      /* if threads are cooperative, do a plain write: */ \
      (TME_THREADS_COOPERATIVE) \
      { \
        tme_memory_write16((tme_uint16_t *) _tme_cast_pointer_shared(tme_uint16_t *, tme_uint16_t *, mem), x, align_min); \
      /* otherwise, if we aren't locking for all memory accesses, the \
         host supports misaligned 16-bit accesses, the host's bus \
         boundary is greater than or equal to the emulated bus \
         boundary, and this memory is aligned enough, do a single \
         direct bus write: */ \
      } \
    else if \
      (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                          && _TME_ALIGNOF_INT16_T < sizeof(tme_uint16_t) \
                          && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
                          && _tme_memory_address_test(mem, _TME_ALIGNOF_INT16_T - 1, align_min) == 0)) \
      { \
        (*_tme_audit_type(mem, tme_uint16_t *)) \
          = (x); \
      /* otherwise, if we're locking for all memory accesses, or \
         if this memory must cross at least one host bus boundary \
         and the host bus boundary is less than the emulated bus \
         boundary, do a slow indirect atomic write: */ \
      } \
    else if \
      (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
                           || (sizeof(tme_uint16_t) > TME_MEMORY_BUS_BOUNDARY \
                               && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
      { \
        tme_memory_atomic_write16(mem, x, lock, align_min); \
      /* otherwise, if the memory is not larger than the emulated \
         bus boundary, or if size-alignment would mean an atomic \
         host access and it is size-aligned, do a single atomic \
         write, which may be direct or slow: */ \
      } \
    else if \
      (__tme_predict_true((sizeof(tme_uint16_t) <= (bus_boundary) \
                           || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) != 0 \
                               && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint16_t) <= sizeof(tme_uint16_t))) \
                          && _tme_memory_address_test(mem, sizeof(tme_uint16_t) - 1, align_min) == 0)) \
      { \
        tme_memory_atomic_write16(mem, x, lock, sizeof(tme_uint16_t)); \
      /* otherwise, we must do a slow bus write: */ \
      } \
    else \
      { \
        tme_memory_bus_write16(mem, x, lock, align_min, bus_boundary); \
      } \
  } while (/* CONSTCOND */ 0)

/* the 16-bit atomic operations: */
tme_uint16_t tme_memory_atomic_add16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_sub16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_mul16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_div16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_and16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_or16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_xor16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_not16 _TME_P((tme_shared tme_uint16_t *, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_neg16 _TME_P((tme_shared tme_uint16_t *, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_xchg16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_cx16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_uint16_t, tme_rwlock_t *, unsigned int));
tme_uint16_t tme_memory_atomic_read16 _TME_P((_tme_const tme_shared tme_uint16_t *, tme_rwlock_t *, unsigned int));
void tme_memory_atomic_write16 _TME_P((tme_shared tme_uint16_t *, tme_uint16_t, tme_rwlock_t *, unsigned int));

/* the default 32-bit memory plain read macro: */
#define tme_memory_read32(mem, align_min) \
  ( \
   /* if we know at compile time that the memory is aligned \
      enough to read directly, do the single direct read. \
      \
      otherwise, if we know at compile time that the memory \
      is less aligned than the smallest acceptable parts size, \
      test if the memory is aligned enough to read directly, \
      and do the single direct read if it is: */ \
   (__tme_predict_true((_TME_ALIGNOF_INT32_T == 1 \
                        || (align_min) >= _TME_ALIGNOF_INT32_T) \
                       || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint32_t) \
                           && _tme_memory_address_test(mem, _TME_ALIGNOF_INT32_T - 1, align_min) == 0))) \
   ? \
   _tme_memory_read(tme_uint32_t, tme_uint32_t, mem, 0) \
   : \
   ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint32_t) <= sizeof(tme_uint8_t)) \
    && ((align_min) <= sizeof(tme_uint8_t))) \
   ? \
   (_tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (0 / 8)) \
    | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (8 / 8)) \
    | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (16 / 8)) \
    | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (24 / 8))) \
   : \
   (_tme_memory_address_test(mem, sizeof(tme_uint8_t), align_min) != 0) \
   ? \
   (_tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (0 / 8)) \
    | _tme_memory_read(tme_uint32_t, tme_uint16_t, mem, (8 / 8)) \
    | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, (24 / 8))) \
   : \
   (_tme_memory_read(tme_uint32_t, tme_uint16_t, mem, (0 / 8)) \
    | _tme_memory_read(tme_uint32_t, tme_uint16_t, mem, (16 / 8))) \
  )
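
/* an illustrative sketch (exposition only): how the macro above
   splits a misaligned 32-bit read.  an odd address gets byte, 16-bit,
   byte parts at byte offsets 0, 1, and 3; an even address that is not
   4-byte-aligned gets two 16-bit parts at offsets 0 and 2: */
#if 0
static tme_uint32_t
example_read32_odd(_tme_const tme_uint32_t *mem)
{
  return (_tme_memory_read(tme_uint32_t, tme_uint8_t, mem, 0)
          | _tme_memory_read(tme_uint32_t, tme_uint16_t, mem, 1)
          | _tme_memory_read(tme_uint32_t, tme_uint8_t, mem, 3));
}
#endif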

/* the default 32-bit memory plain write macro: */
#define tme_memory_write32(mem, x, align_min) \
  do { \
    if \
      /* if we know at compile time that the memory is aligned \
         enough to write directly, do the single direct write. \
         \
         otherwise, if we know at compile time that the memory \
         is less aligned than the smallest acceptable parts size, \
         test if the memory is aligned enough to write directly, \
         and do the single direct write if it is: */ \
      (__tme_predict_true((_TME_ALIGNOF_INT32_T == 1 \
                           || (align_min) >= _TME_ALIGNOF_INT32_T) \
                          || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint32_t) \
                              && _tme_memory_address_test(mem, _TME_ALIGNOF_INT32_T - 1, align_min) == 0))) \
      { \
        _tme_memory_write(tme_uint32_t, tme_uint32_t, mem, 0, x); \
      } \
    else if \
      ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint32_t) <= sizeof(tme_uint8_t)) \
       && ((align_min) <= sizeof(tme_uint8_t))) \
      { \
        _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (0 / 8), x); \
        _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (8 / 8), x); \
        _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (16 / 8), x); \
        _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (24 / 8), x); \
      } \
    else if \
      (_tme_memory_address_test(mem, sizeof(tme_uint8_t), align_min) != 0) \
      { \
        _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (0 / 8), x); \
        _tme_memory_write(tme_uint32_t, tme_uint16_t, mem, (8 / 8), x); \
        _tme_memory_write(tme_uint32_t, tme_uint8_t, mem, (24 / 8), x); \
      } \
    else \
      { \
        _tme_memory_write(tme_uint32_t, tme_uint16_t, mem, (0 / 8), x); \
        _tme_memory_write(tme_uint32_t, tme_uint16_t, mem, (16 / 8), x); \
      } \
  } while (/* CONSTCOND */ 0)

/* the default 32-bit memory atomic read macro: */
#define tme_memory_atomic_read32(mem, lock, align_min) \
  ( \
   /* if threads are cooperative, do a plain read: */ \
   (TME_THREADS_COOPERATIVE) \
   ? \
   tme_memory_read32((_tme_const tme_uint32_t *) _tme_audit_type(mem, tme_uint32_t *), align_min) \
   /* otherwise, if we aren't locking for all memory accesses, and we can \
      make direct 32-bit accesses, and this memory is aligned \
      enough to make a single direct atomic access, do the single \
      direct atomic read: */ \
   : \
   (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                       && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) != 0 \
                       && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) - 1, align_min) == 0)) \
   ? \
   (*_tme_audit_type(mem, tme_uint32_t *)) \
   /* otherwise, we must do a slow indirect atomic read: */ \
   : \
   tme_memory_atomic_read32(mem, lock, align_min) \
  )

/* the default 32-bit memory atomic write macro: */
#define tme_memory_atomic_write32(mem, x, lock, align_min) \
  do { \
    if \
      /* if threads are cooperative, do a plain write: */ \
      (TME_THREADS_COOPERATIVE) \
      { \
        tme_memory_write32((tme_uint32_t *) _tme_cast_pointer_shared(tme_uint32_t *, tme_uint32_t *, mem), x, align_min); \
      /* otherwise, if we aren't locking for all memory accesses, and we can \
         make direct 32-bit accesses, and this memory is aligned \
         enough to make a single direct atomic access, do the single \
         direct atomic write: */ \
      } \
    else if \
      (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                          && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) != 0 \
                          && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) - 1, align_min) == 0)) \
      { \
        (*_tme_audit_type(mem, tme_uint32_t *)) \
          = (x); \
      /* otherwise, we must do a slow indirect atomic write: */ \
      } \
    else \
      { \
        tme_memory_atomic_write32(mem, x, lock, align_min); \
      } \
  } while (/* CONSTCOND */ 0)

/* the default 32-bit memory bus read macro: */
#define tme_memory_bus_read32(mem, lock, align_min, bus_boundary) \
  ( \
   /* if threads are cooperative, do a plain read: */ \
   (TME_THREADS_COOPERATIVE) \
   ? \
   tme_memory_read32((_tme_const tme_uint32_t *) _tme_audit_type(mem, tme_uint32_t *), align_min) \
   /* otherwise, if we aren't locking for all memory accesses, the \
      host supports misaligned 32-bit accesses, the host's bus \
      boundary is greater than or equal to the emulated bus \
      boundary, and this memory is aligned enough, do a single \
      direct bus read: */ \
   : \
   (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                       && _TME_ALIGNOF_INT32_T < sizeof(tme_uint32_t) \
                       && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
                       && _tme_memory_address_test(mem, _TME_ALIGNOF_INT32_T - 1, align_min) == 0)) \
   ? \
   (*_tme_audit_type(mem, tme_uint32_t *)) \
   /* otherwise, if we're locking for all memory accesses, or \
      if this memory must cross at least one host bus boundary \
      and the host bus boundary is less than the emulated bus \
      boundary, do a slow indirect atomic read: */ \
   : \
   (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
                        || (sizeof(tme_uint32_t) > TME_MEMORY_BUS_BOUNDARY \
                            && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
   ? \
   tme_memory_atomic_read32(mem, lock, align_min) \
   /* otherwise, if the memory is not larger than the emulated \
      bus boundary, or if size-alignment would mean an atomic \
      host access and it is size-aligned, do a single atomic \
      read, which may be direct or slow: */ \
   : \
   (__tme_predict_true((sizeof(tme_uint32_t) <= (bus_boundary) \
                        || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) != 0 \
                            && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) <= sizeof(tme_uint32_t))) \
                       && _tme_memory_address_test(mem, sizeof(tme_uint32_t) - 1, align_min) == 0)) \
   ? \
   tme_memory_atomic_read32(mem, lock, sizeof(tme_uint32_t)) \
   /* otherwise, we must do a slow bus read: */ \
   : \
   tme_memory_bus_read32(mem, lock, align_min, bus_boundary) \
  )

/* the default 32-bit memory bus write macro: */
#define tme_memory_bus_write32(mem, x, lock, align_min, bus_boundary) \
  do { \
    if \
      /* if threads are cooperative, do a plain write: */ \
      (TME_THREADS_COOPERATIVE) \
      { \
        tme_memory_write32((tme_uint32_t *) _tme_cast_pointer_shared(tme_uint32_t *, tme_uint32_t *, mem), x, align_min); \
      /* otherwise, if we aren't locking for all memory accesses, the \
         host supports misaligned 32-bit accesses, the host's bus \
         boundary is greater than or equal to the emulated bus \
         boundary, and this memory is aligned enough, do a single \
         direct bus write: */ \
      } \
    else if \
      (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                          && _TME_ALIGNOF_INT32_T < sizeof(tme_uint32_t) \
                          && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
                          && _tme_memory_address_test(mem, _TME_ALIGNOF_INT32_T - 1, align_min) == 0)) \
      { \
        (*_tme_audit_type(mem, tme_uint32_t *)) \
          = (x); \
      /* otherwise, if we're locking for all memory accesses, or \
         if this memory must cross at least one host bus boundary \
         and the host bus boundary is less than the emulated bus \
         boundary, do a slow indirect atomic write: */ \
      } \
    else if \
      (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
                           || (sizeof(tme_uint32_t) > TME_MEMORY_BUS_BOUNDARY \
                               && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
      { \
        tme_memory_atomic_write32(mem, x, lock, align_min); \
      /* otherwise, if the memory is not larger than the emulated \
         bus boundary, or if size-alignment would mean an atomic \
         host access and it is size-aligned, do a single atomic \
         write, which may be direct or slow: */ \
      } \
    else if \
      (__tme_predict_true((sizeof(tme_uint32_t) <= (bus_boundary) \
                           || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) != 0 \
                               && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint32_t) <= sizeof(tme_uint32_t))) \
                          && _tme_memory_address_test(mem, sizeof(tme_uint32_t) - 1, align_min) == 0)) \
      { \
        tme_memory_atomic_write32(mem, x, lock, sizeof(tme_uint32_t)); \
      /* otherwise, we must do a slow bus write: */ \
      } \
    else \
      { \
        tme_memory_bus_write32(mem, x, lock, align_min, bus_boundary); \
      } \
  } while (/* CONSTCOND */ 0)

/* the 32-bit atomic operations: */
tme_uint32_t tme_memory_atomic_add32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_sub32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_mul32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_div32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_and32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_or32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_xor32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_not32 _TME_P((tme_shared tme_uint32_t *, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_neg32 _TME_P((tme_shared tme_uint32_t *, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_xchg32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_cx32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_uint32_t, tme_rwlock_t *, unsigned int));
tme_uint32_t tme_memory_atomic_read32 _TME_P((_tme_const tme_shared tme_uint32_t *, tme_rwlock_t *, unsigned int));
void tme_memory_atomic_write32 _TME_P((tme_shared tme_uint32_t *, tme_uint32_t, tme_rwlock_t *, unsigned int));

#ifdef TME_HAVE_INT64_T

/* the default 64-bit memory plain read macro: */
#define tme_memory_read64(mem, align_min) \
  ( \
   /* if we know at compile time that the memory is aligned \
      enough to read directly, do the single direct read. \
      \
      otherwise, if we know at compile time that the memory \
      is less aligned than the smallest acceptable parts size, \
      test if the memory is aligned enough to read directly, \
      and do the single direct read if it is: */ \
   (__tme_predict_true((_TME_ALIGNOF_INT64_T == 1 \
                        || (align_min) >= _TME_ALIGNOF_INT64_T) \
                       || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) \
                           && _tme_memory_address_test(mem, _TME_ALIGNOF_INT64_T - 1, align_min) == 0))) \
   ? \
   _tme_memory_read(tme_uint64_t, tme_uint64_t, mem, 0) \
   : \
   ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) <= sizeof(tme_uint8_t)) \
    && ((align_min) <= sizeof(tme_uint8_t))) \
   ? \
   (_tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (0 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (8 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (16 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (24 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (32 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (40 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (48 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (56 / 8))) \
   : \
   (_tme_memory_address_test(mem, sizeof(tme_uint8_t), align_min) != 0) \
   ? \
   (_tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (0 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (8 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (24 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (40 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint8_t, mem, (56 / 8))) \
   : \
   ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) <= sizeof(tme_uint16_t)) \
    && ((align_min) <= sizeof(tme_uint16_t))) \
   ? \
   (_tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (0 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (16 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (32 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (48 / 8))) \
   : \
   (_tme_memory_address_test(mem, sizeof(tme_uint16_t), align_min) != 0) \
   ? \
   (_tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (0 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint32_t, mem, (16 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint16_t, mem, (48 / 8))) \
   : \
   (_tme_memory_read(tme_uint64_t, tme_uint32_t, mem, (0 / 8)) \
    | _tme_memory_read(tme_uint64_t, tme_uint32_t, mem, (32 / 8))) \
  )

/* the default 64-bit memory plain write macro: */
#define tme_memory_write64(mem, x, align_min) \
  do { \
    if \
      /* if we know at compile time that the memory is aligned \
         enough to write directly, do the single direct write. \
         \
         otherwise, if we know at compile time that the memory \
         is less aligned than the smallest acceptable parts size, \
         test if the memory is aligned enough to write directly, \
         and do the single direct write if it is: */ \
      (__tme_predict_true((_TME_ALIGNOF_INT64_T == 1 \
                           || (align_min) >= _TME_ALIGNOF_INT64_T) \
                          || ((align_min) < TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) \
                              && _tme_memory_address_test(mem, _TME_ALIGNOF_INT64_T - 1, align_min) == 0))) \
      { \
        _tme_memory_write(tme_uint64_t, tme_uint64_t, mem, 0, x); \
      } \
    else if \
      ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) <= sizeof(tme_uint8_t)) \
       && ((align_min) <= sizeof(tme_uint8_t))) \
      { \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (0 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (8 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (16 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (24 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (32 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (40 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (48 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (56 / 8), x); \
      } \
    else if \
      (_tme_memory_address_test(mem, sizeof(tme_uint8_t), align_min) != 0) \
      { \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (0 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (8 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (24 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (40 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint8_t, mem, (56 / 8), x); \
      } \
    else if \
      ((TME_MEMORY_ALIGNMENT_ACCEPT(tme_uint64_t) <= sizeof(tme_uint16_t)) \
       && ((align_min) <= sizeof(tme_uint16_t))) \
      { \
        _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (0 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (16 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (32 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (48 / 8), x); \
      } \
    else if \
      (_tme_memory_address_test(mem, sizeof(tme_uint16_t), align_min) != 0) \
      { \
        _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (0 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint32_t, mem, (16 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint16_t, mem, (48 / 8), x); \
      } \
    else \
      { \
        _tme_memory_write(tme_uint64_t, tme_uint32_t, mem, (0 / 8), x); \
        _tme_memory_write(tme_uint64_t, tme_uint32_t, mem, (32 / 8), x); \
      } \
  } while (/* CONSTCOND */ 0)

/* the default 64-bit memory atomic read macro: */
#define tme_memory_atomic_read64(mem, lock, align_min) \
  ( \
   /* if threads are cooperative, do a plain read: */ \
   (TME_THREADS_COOPERATIVE) \
   ? \
   tme_memory_read64((_tme_const tme_uint64_t *) _tme_audit_type(mem, tme_uint64_t *), align_min) \
   /* otherwise, if we aren't locking for all memory accesses, and we can \
      make direct 64-bit accesses, and this memory is aligned \
      enough to make a single direct atomic access, do the single \
      direct atomic read: */ \
   : \
   (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                       && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) != 0 \
                       && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) - 1, align_min) == 0)) \
   ? \
   (*_tme_audit_type(mem, tme_uint64_t *)) \
   /* otherwise, we must do a slow indirect atomic read: */ \
   : \
   tme_memory_atomic_read64(mem, lock, align_min) \
  )

/* the default 64-bit memory atomic write macro: */
#define tme_memory_atomic_write64(mem, x, lock, align_min) \
  do { \
    if \
      /* if threads are cooperative, do a plain write: */ \
      (TME_THREADS_COOPERATIVE) \
      { \
        tme_memory_write64((tme_uint64_t *) _tme_cast_pointer_shared(tme_uint64_t *, tme_uint64_t *, mem), x, align_min); \
      /* otherwise, if we aren't locking for all memory accesses, and we can \
         make direct 64-bit accesses, and this memory is aligned \
         enough to make a single direct atomic access, do the single \
         direct atomic write: */ \
      } \
    else if \
      (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                          && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) != 0 \
                          && _tme_memory_address_test(mem, TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) - 1, align_min) == 0)) \
      { \
        (*_tme_audit_type(mem, tme_uint64_t *)) \
          = (x); \
      /* otherwise, we must do a slow indirect atomic write: */ \
      } \
    else \
      { \
        tme_memory_atomic_write64(mem, x, lock, align_min); \
      } \
  } while (/* CONSTCOND */ 0)

/* the default 64-bit memory bus read macro: */
#define tme_memory_bus_read64(mem, lock, align_min, bus_boundary) \
  ( \
   /* if threads are cooperative, do a plain read: */ \
   (TME_THREADS_COOPERATIVE) \
   ? \
   tme_memory_read64((_tme_const tme_uint64_t *) _tme_audit_type(mem, tme_uint64_t *), align_min) \
   /* otherwise, if we aren't locking for all memory accesses, the \
      host supports misaligned 64-bit accesses, the host's bus \
      boundary is greater than or equal to the emulated bus \
      boundary, and this memory is aligned enough, do a single \
      direct bus read: */ \
   : \
   (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                       && _TME_ALIGNOF_INT64_T < sizeof(tme_uint64_t) \
                       && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
                       && _tme_memory_address_test(mem, _TME_ALIGNOF_INT64_T - 1, align_min) == 0)) \
   ? \
   (*_tme_audit_type(mem, tme_uint64_t *)) \
   /* otherwise, if we're locking for all memory accesses, or \
      if this memory must cross at least one host bus boundary \
      and the host bus boundary is less than the emulated bus \
      boundary, do a slow indirect atomic read: */ \
   : \
   (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
                        || (sizeof(tme_uint64_t) > TME_MEMORY_BUS_BOUNDARY \
                            && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
   ? \
   tme_memory_atomic_read64(mem, lock, align_min) \
   /* otherwise, if the memory is not larger than the emulated \
      bus boundary, or if size-alignment would mean an atomic \
      host access and it is size-aligned, do a single atomic \
      read, which may be direct or slow: */ \
   : \
   (__tme_predict_true((sizeof(tme_uint64_t) <= (bus_boundary) \
                        || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) != 0 \
                            && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) <= sizeof(tme_uint64_t))) \
                       && _tme_memory_address_test(mem, sizeof(tme_uint64_t) - 1, align_min) == 0)) \
   ? \
   tme_memory_atomic_read64(mem, lock, sizeof(tme_uint64_t)) \
   /* otherwise, we must do a slow bus read: */ \
   : \
   tme_memory_bus_read64(mem, lock, align_min, bus_boundary) \
  )

/* the default 64-bit memory bus write macro: */
#define tme_memory_bus_write64(mem, x, lock, align_min, bus_boundary) \
  do { \
    if \
      /* if threads are cooperative, do a plain write: */ \
      (TME_THREADS_COOPERATIVE) \
      { \
        tme_memory_write64((tme_uint64_t *) _tme_cast_pointer_shared(tme_uint64_t *, tme_uint64_t *, mem), x, align_min); \
      /* otherwise, if we aren't locking for all memory accesses, the \
         host supports misaligned 64-bit accesses, the host's bus \
         boundary is greater than or equal to the emulated bus \
         boundary, and this memory is aligned enough, do a single \
         direct bus write: */ \
      } \
    else if \
      (__tme_predict_true(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) != 0 \
                          && _TME_ALIGNOF_INT64_T < sizeof(tme_uint64_t) \
                          && TME_MEMORY_BUS_BOUNDARY >= (bus_boundary) \
                          && _tme_memory_address_test(mem, _TME_ALIGNOF_INT64_T - 1, align_min) == 0)) \
      { \
        (*_tme_audit_type(mem, tme_uint64_t *)) \
          = (x); \
      /* otherwise, if we're locking for all memory accesses, or \
         if this memory must cross at least one host bus boundary \
         and the host bus boundary is less than the emulated bus \
         boundary, do a slow indirect atomic write: */ \
      } \
    else if \
      (__tme_predict_false(TME_MEMORY_ALIGNMENT_ATOMIC(TME_MEMORY_TYPE_COMMON) == 0 \
                           || (sizeof(tme_uint64_t) > TME_MEMORY_BUS_BOUNDARY \
                               && TME_MEMORY_BUS_BOUNDARY < (bus_boundary)))) \
      { \
        tme_memory_atomic_write64(mem, x, lock, align_min); \
      /* otherwise, if the memory is not larger than the emulated \
         bus boundary, or if size-alignment would mean an atomic \
         host access and it is size-aligned, do a single atomic \
         write, which may be direct or slow: */ \
      } \
    else if \
      (__tme_predict_true((sizeof(tme_uint64_t) <= (bus_boundary) \
                           || (TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) != 0 \
                               && TME_MEMORY_ALIGNMENT_ATOMIC(tme_uint64_t) <= sizeof(tme_uint64_t))) \
                          && _tme_memory_address_test(mem, sizeof(tme_uint64_t) - 1, align_min) == 0)) \
      { \
        tme_memory_atomic_write64(mem, x, lock, sizeof(tme_uint64_t)); \
      /* otherwise, we must do a slow bus write: */ \
      } \
    else \
      { \
        tme_memory_bus_write64(mem, x, lock, align_min, bus_boundary); \
      } \
  } while (/* CONSTCOND */ 0)

/* the 64-bit atomic operations: */
tme_uint64_t tme_memory_atomic_add64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_sub64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_mul64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_div64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_and64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_or64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_xor64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_not64 _TME_P((tme_shared tme_uint64_t *, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_neg64 _TME_P((tme_shared tme_uint64_t *, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_xchg64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_cx64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_uint64_t, tme_rwlock_t *, unsigned int));
tme_uint64_t tme_memory_atomic_read64 _TME_P((_tme_const tme_shared tme_uint64_t *, tme_rwlock_t *, unsigned int));
void tme_memory_atomic_write64 _TME_P((tme_shared tme_uint64_t *, tme_uint64_t, tme_rwlock_t *, unsigned int));

#endif /* TME_HAVE_INT64_T */