Diffstat (limited to 'src/third-party/ArenaAlloc')
-rw-r--r--   src/third-party/ArenaAlloc/arenaalloc.h       186
-rw-r--r--   src/third-party/ArenaAlloc/arenaallocimpl.h   286
-rw-r--r--   src/third-party/ArenaAlloc/recyclealloc.h     184
3 files changed, 656 insertions(+), 0 deletions(-)
diff --git a/src/third-party/ArenaAlloc/arenaalloc.h b/src/third-party/ArenaAlloc/arenaalloc.h
new file mode 100644
index 0000000..dfd648d
--- /dev/null
+++ b/src/third-party/ArenaAlloc/arenaalloc.h
@@ -0,0 +1,186 @@
+// -*- c++ -*-
+/******************************************************************************
+ * arenaalloc.h
+ *
+ * Arena allocator based on the example logic provided by Nicolai Josuttis
+ * and available at http://www.josuttis.com/libbook/examples.html.
+ * This enhanced work is provided under the terms of the MIT license.
+ *
+ *****************************************************************************/
+
+#ifndef _ARENA_ALLOC_H
+#define _ARENA_ALLOC_H
+
+#include <limits>
+#include <memory>
+
+#if __cplusplus >= 201103L
+#include <type_traits>
+#include <utility>
+#endif
+
+// Define macro ARENA_ALLOC_DEBUG to enable some tracing of the allocator
+#include "arenaallocimpl.h"
+
+namespace ArenaAlloc
+{
+
+ struct _newAllocatorImpl
+ {
+    // These two functions are the interface a specialized allocator
+    // must provide, e.g. one backed by shared memory or another source
+    // of specialized memory such as device-mapped memory.
+ void* allocate( size_t numBytes ) { return new char[ numBytes ]; }
+ void deallocate( void* ptr ) { delete[]( (char*)ptr ); }
+ };
+
+ template <class T,
+ class AllocatorImpl = _newAllocatorImpl,
+ class MemblockImpl = _memblockimpl<AllocatorImpl> >
+ class Alloc {
+
+ private:
+ MemblockImpl* m_impl;
+
+ public:
+ // type definitions
+ typedef T value_type;
+ typedef T* pointer;
+ typedef const T* const_pointer;
+ typedef T& reference;
+ typedef const T& const_reference;
+ typedef std::size_t size_type;
+ typedef std::ptrdiff_t difference_type;
+
+#if __cplusplus >= 201103L
+    // When containers are swapped (e.g. vector::swap), swap the
+    // allocators as well. This was not specified in C++98, so users
+    // of this code without C++11 must exercise caution when using the
+    // swap algorithm or a container's swap member function. Specifically,
+    // in C++98 do not swap containers that do not share the same
+    // allocator implementation; in C++11 this is fine.
+ typedef std::true_type propagate_on_container_swap;
+
+ // container moves should move the allocator also.
+ typedef std::true_type propagate_on_container_move_assignment;
+#endif
+
+ // rebind allocator to type U
+ template <class U>
+ struct rebind {
+ typedef Alloc<U,AllocatorImpl,MemblockImpl> other;
+ };
+
+ // return address of values
+ pointer address (reference value) const {
+ return &value;
+ }
+ const_pointer address (const_reference value) const {
+ return &value;
+ }
+
+ Alloc( std::size_t defaultSize = 32768, AllocatorImpl allocImpl = AllocatorImpl() ) throw():
+ m_impl( MemblockImpl::create( defaultSize, allocImpl ) )
+ {
+ }
+
+ Alloc(const Alloc& src) throw():
+ m_impl( src.m_impl )
+ {
+ m_impl->incrementRefCount();
+ }
+
+ template <class U>
+ Alloc (const Alloc<U,AllocatorImpl,MemblockImpl>& src) throw():
+ m_impl( 0 )
+ {
+ MemblockImpl::assign( src, m_impl );
+ m_impl->incrementRefCount();
+ }
+
+ ~Alloc() throw()
+ {
+ m_impl->decrementRefCount();
+ }
+
+ // return maximum number of elements that can be allocated
+ size_type max_size () const throw()
+ {
+ return std::numeric_limits<std::size_t>::max() / sizeof(T);
+ }
+
+ // allocate but don't initialize num elements of type T
+ pointer allocate (size_type num, const void* = 0)
+ {
+ return reinterpret_cast<pointer>( m_impl->allocate(num*sizeof(T)) );
+ }
+
+ // initialize elements of allocated storage p with value value
+#if __cplusplus >= 201103L
+
+ // use c++11 style forwarding to construct the object
+ template< typename P, typename... Args>
+ void construct( P* obj, Args&&... args )
+ {
+ ::new((void*) obj ) P( std::forward<Args>( args )... );
+ }
+
+ template< typename P >
+ void destroy( P* obj ) { obj->~P(); }
+
+#else
+ void construct (pointer p, const T& value)
+ {
+ new((void*)p)T(value);
+ }
+ void destroy (pointer p) { p->~T(); }
+#endif
+
+ // deallocate storage p of deleted elements
+ void deallocate (pointer p, size_type num)
+ {
+ m_impl->deallocate( p );
+ }
+
+ bool equals( const MemblockImpl * impl ) const
+ {
+ return impl == m_impl;
+ }
+
+ bool operator == ( const Alloc& t2 ) const
+ {
+ return m_impl == t2.m_impl;
+ }
+
+ friend MemblockImpl;
+
+ template< typename Other >
+ bool operator == ( const Alloc< Other, AllocatorImpl, MemblockImpl >& t2 )
+ {
+ return t2.equals( m_impl );
+ }
+
+ template< typename Other >
+ bool operator != ( const Alloc< Other, AllocatorImpl, MemblockImpl >& t2 )
+ {
+ return !t2.equals( m_impl );
+ }
+
+    // These are extension functions, not required for an STL allocator.
+ size_t getNumAllocations() { return m_impl->getNumAllocations(); }
+ size_t getNumDeallocations() { return m_impl->getNumDeallocations(); }
+ size_t getNumBytesAllocated() { return m_impl->getNumBytesAllocated(); }
+ };
+
+ template<typename A>
+ template<typename T>
+ void _memblockimpl<A>::assign( const Alloc<T,A, _memblockimpl<A> >& src, _memblockimpl<A> *& dest )
+ {
+ dest = const_cast<_memblockimpl<A>* >(src.m_impl);
+ }
+
+}
+
+#endif
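
For orientation, a minimal usage sketch of the Alloc template defined above with a standard container; the include path, the 32 KiB arena size, and the variable names are illustrative assumptions rather than anything mandated by the header.

#include <cstdio>
#include <vector>
#include "arenaalloc.h"   // include path is an assumption; adjust to the vendored location

int main()
{
  // One arena shared by every copy of this allocator; copies bump a reference count.
  ArenaAlloc::Alloc<int> alloc( 32768 );

  std::vector<int, ArenaAlloc::Alloc<int> > v( alloc );
  for( int i = 0; i < 1000; ++i )
    v.push_back( i );

  // Extension functions beyond the standard allocator interface.
  std::printf( "allocations=%zu bytesAllocated=%zu\n",
               alloc.getNumAllocations(), alloc.getNumBytesAllocated() );
  return 0;
}
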
diff --git a/src/third-party/ArenaAlloc/arenaallocimpl.h b/src/third-party/ArenaAlloc/arenaallocimpl.h
new file mode 100644
index 0000000..12484f0
--- /dev/null
+++ b/src/third-party/ArenaAlloc/arenaallocimpl.h
@@ -0,0 +1,286 @@
+// -*- c++ -*-
+/******************************************************************************
+ ** arenaallocimpl.h
+ **
+ ** Internal implementation types of the arena allocator
+ ** MIT license
+ *****************************************************************************/
+
+#ifndef _ARENA_ALLOC_IMPL_H
+#define _ARENA_ALLOC_IMPL_H
+
+#ifdef ARENA_ALLOC_DEBUG
+#include <stdio.h>
+#endif
+
+namespace ArenaAlloc
+{
+
+ template< typename T, typename A, typename M >
+ class Alloc;
+
+ // internal structure for tracking memory blocks
+ template < typename AllocImpl >
+ struct _memblock
+ {
+ // allocations are rounded up to a multiple of the size of this
+ // struct to maintain proper alignment for any pointer and double
+ // values stored in the allocation.
+ // A future goal is to support even stricter alignment for example
+ // to support cache alignment, special device dependent mappings,
+ // or GPU ops.
+ union _roundsize {
+ double d;
+ void* p;
+ };
+
+    _memblock* m_next{nullptr};  // blocks are kept in a linked list for cleanup at the end
+ std::size_t m_bufferSize; // size of the buffer
+ std::size_t m_index; // index of next allocatable byte in the block
+ char* m_buffer; // pointer to large block to allocate from
+
+ _memblock(std::size_t bufferSize, AllocImpl& allocImpl)
+ : m_bufferSize(roundSize(bufferSize)), m_index(0),
+ m_buffer(reinterpret_cast<char*>(allocImpl.allocate(
+ bufferSize))) // this works b/c of order of decl
+ {
+ }
+
+ std::size_t roundSize( std::size_t numBytes )
+ {
+ // this is subject to overflow. calling logic should not permit
+ // an attempt to allocate a really massive size.
+ // i.e. an attempt to allocate 10s of terabytes should be an error
+ return ( ( numBytes + sizeof( _roundsize ) - 1 ) /
+ sizeof( _roundsize ) ) * sizeof( _roundsize );
+ }
+
+ char * allocate( std::size_t numBytes )
+ {
+ std::size_t roundedSize = roundSize( numBytes );
+ if( roundedSize + m_index > m_bufferSize )
+ return 0;
+
+ char * ptrToReturn = &m_buffer[ m_index ];
+ m_index += roundedSize;
+ return ptrToReturn;
+ }
+
+ void dispose( AllocImpl& impl )
+ {
+ impl.deallocate( m_buffer );
+ }
+
+ ~_memblock()
+ {
+ }
+ };
+
+ template< typename AllocatorImpl, typename Derived >
+ struct _memblockimplbase
+ {
+ AllocatorImpl m_alloc;
+ std::size_t m_refCount; // when refs -> 0 delete this
+ std::size_t m_defaultSize;
+
+ std::size_t m_numAllocate; // number of times allocate called
+    std::size_t m_numDeallocate;     // number of times deallocate called
+ std::size_t m_numBytesAllocated; // A good estimate of amount of space used
+
+ _memblock<AllocatorImpl> * m_head;
+ _memblock<AllocatorImpl> * m_current;
+
+    // round up to the next power of 2 if not already
+    // a power of 2
+ std::size_t roundpow2( std::size_t value )
+ {
+ // note this works because subtracting 1 is equivalent to
+ // inverting the lowest set bit and complementing any
+ // bits lower than that. only a power of 2
+ // will yield 0 in the following check
+ if( 0 == ( value & ( value - 1 ) ) )
+ return value; // already a power of 2
+
+      // fold value over itself. This sets all bits below the highest set bit of value to 1.
+      // who said bit twiddling wasn't practical?
+ value |= value >> 1;
+ value |= value >> 2;
+ value |= value >> 4;
+ value |= value >> 8;
+ value |= value >> 16;
+ value |= value >> 32;
+
+ return value + 1;
+ }
+
+ _memblockimplbase( std::size_t defaultSize, AllocatorImpl& allocator ):
+ m_alloc( allocator ),
+ m_refCount( 1 ),
+ m_defaultSize( defaultSize ),
+ m_numAllocate( 0 ),
+ m_numDeallocate( 0 ),
+ m_numBytesAllocated( 0 ),
+ m_head( 0 ),
+ m_current( 0 )
+ {
+ if( m_defaultSize < 256 )
+ {
+ m_defaultSize = 256; // anything less is academic. a more practical size is 4k or more
+ }
+ else if ( m_defaultSize > 1024UL*1024*1024*16 )
+ {
+ // when this becomes a problem, this package has succeeded beyond my wildest expectations
+ m_defaultSize = 1024UL*1024*1024*16;
+ }
+
+ // for convenience block size should be a power of 2
+ // round up to next power of 2
+ m_defaultSize = roundpow2( m_defaultSize );
+ allocateNewBlock( m_defaultSize );
+ }
+
+ char * allocate( std::size_t numBytes )
+ {
+ char * ptrToReturn = m_current->allocate( numBytes );
+ if( !ptrToReturn )
+ {
+ allocateNewBlock( numBytes > m_defaultSize / 2 ? roundpow2( numBytes*2 ) :
+ m_defaultSize );
+
+ ptrToReturn = m_current->allocate( numBytes );
+ }
+
+#ifdef ARENA_ALLOC_DEBUG
+      fprintf( stdout, "_memblockimpl=%p allocated %zu bytes at address=%p\n", this, numBytes, ptrToReturn );
+#endif
+
+ ++ m_numAllocate;
+ m_numBytesAllocated += numBytes; // does not account for the small overhead in tracking the allocation
+
+ return ptrToReturn;
+ }
+
+ void allocateNewBlock( std::size_t blockSize )
+ {
+ _memblock<AllocatorImpl> * newBlock = new ( m_alloc.allocate( sizeof( _memblock<AllocatorImpl> ) ) )
+ _memblock<AllocatorImpl>( blockSize, m_alloc );
+
+#ifdef ARENA_ALLOC_DEBUG
+      fprintf( stdout, "_memblockimplbase=%p allocating a new block of size=%zu\n", this, blockSize );
+#endif
+
+ if( m_head == 0 )
+ {
+ m_head = m_current = newBlock;
+ }
+ else
+ {
+ m_current->m_next = newBlock;
+ m_current = newBlock;
+ }
+ }
+
+ void deallocate( void * ptr )
+ {
+ ++ m_numDeallocate;
+ }
+
+ size_t getNumAllocations() { return m_numAllocate; }
+ size_t getNumDeallocations() { return m_numDeallocate; }
+ size_t getNumBytesAllocated() { return m_numBytesAllocated; }
+
+ void clear()
+ {
+ _memblock<AllocatorImpl> * block = m_head;
+ while( block )
+ {
+ _memblock<AllocatorImpl> * curr = block;
+ block = block->m_next;
+ curr->dispose( m_alloc );
+ curr->~_memblock<AllocatorImpl>();
+ m_alloc.deallocate( curr );
+ }
+ }
+
+ // The ref counting model does not permit the sharing of
+ // this object across multiple threads unless an external locking mechanism is applied
+ // to ensure the atomicity of the reference count.
+ void incrementRefCount()
+ {
+ ++m_refCount;
+#ifdef ARENA_ALLOC_DEBUG
+      fprintf( stdout, "ref count on _memblockimplbase=%p incremented to %zu\n", this, m_refCount );
+#endif
+ }
+
+ void decrementRefCount()
+ {
+ --m_refCount;
+#ifdef ARENA_ALLOC_DEBUG
+      fprintf( stdout, "ref count on _memblockimplbase=%p decremented to %zu\n", this, m_refCount );
+#endif
+
+ if( m_refCount == 0 )
+ {
+ Derived::destroy( static_cast<Derived*>(this) );
+ }
+ }
+ };
+
+
+ // Each allocator points to an instance of _memblockimpl which
+ // contains the list of _memblock objects and other tracking info
+ // including a refcount.
+ // This object is instantiated in space obtained from the allocator
+  // implementation. The allocator implementation is the component
+  // on which allocate/deallocate are called to obtain storage.
+ template< typename AllocatorImpl >
+ struct _memblockimpl : public _memblockimplbase<AllocatorImpl, _memblockimpl<AllocatorImpl> >
+ {
+ private:
+
+ typedef struct _memblockimplbase< AllocatorImpl, _memblockimpl<AllocatorImpl> > base_t;
+ friend struct _memblockimplbase< AllocatorImpl, _memblockimpl<AllocatorImpl> >;
+
+ // to get around some sticky access issues between Alloc<T1> and Alloc<T2> when sharing
+ // the implementation.
+ template <typename U, typename A, typename M >
+ friend class Alloc;
+
+ template< typename T >
+ static void assign( const Alloc<T,AllocatorImpl, _memblockimpl<AllocatorImpl> >& src,
+ _memblockimpl *& dest );
+
+ static _memblockimpl<AllocatorImpl> * create( size_t defaultSize, AllocatorImpl& alloc )
+ {
+ return new ( alloc.allocate( sizeof( _memblockimpl ) ) ) _memblockimpl<AllocatorImpl>( defaultSize,
+ alloc );
+ }
+
+ static void destroy( _memblockimpl<AllocatorImpl> * objToDestroy )
+ {
+ AllocatorImpl allocImpl = objToDestroy->m_alloc;
+ objToDestroy-> ~_memblockimpl<AllocatorImpl>();
+ allocImpl.deallocate( objToDestroy );
+ }
+
+ _memblockimpl( std::size_t defaultSize, AllocatorImpl& allocImpl ):
+ _memblockimplbase<AllocatorImpl, _memblockimpl<AllocatorImpl> >( defaultSize, allocImpl )
+ {
+#ifdef ARENA_ALLOC_DEBUG
+      fprintf( stdout, "_memblockimpl=%p constructed with default size=%zu\n", this,
+               base_t::m_defaultSize );
+#endif
+ }
+
+ ~_memblockimpl( )
+ {
+#ifdef ARENA_ALLOC_DEBUG
+ fprintf( stdout, "~memblockimpl() called on _memblockimpl=%p\n", this );
+#endif
+ base_t::clear();
+ }
+ };
+}
+
+#endif
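
A sketch of plugging a custom allocator implementation into the machinery above: the two-function allocate/deallocate interface is all that _memblockimplbase requires of its AllocatorImpl parameter, so a malloc-backed implementation (the name MallocImpl and the sizes are hypothetical, for illustration only) drops in as the second template argument of Alloc.

#include <cstdlib>
#include <vector>
#include "arenaalloc.h"   // include path is an assumption

// Hypothetical AllocatorImpl backed by malloc/free; only these two
// member functions are required by _memblockimplbase.
struct MallocImpl
{
  void* allocate( std::size_t numBytes ) { return std::malloc( numBytes ); }
  void deallocate( void* ptr ) { std::free( ptr ); }
};

int main()
{
  MallocImpl impl;
  ArenaAlloc::Alloc<double, MallocImpl> alloc( 4096, impl );

  std::vector<double, ArenaAlloc::Alloc<double, MallocImpl> > v( alloc );
  v.resize( 100, 1.5 );
  return 0;
}
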
diff --git a/src/third-party/ArenaAlloc/recyclealloc.h b/src/third-party/ArenaAlloc/recyclealloc.h
new file mode 100644
index 0000000..129e43b
--- /dev/null
+++ b/src/third-party/ArenaAlloc/recyclealloc.h
@@ -0,0 +1,184 @@
+// -*- c++ -*-
+/******************************************************************************
+ ** recyclealloc.h
+ **
+ ** Arena allocator with some modest recycling of freed resources.
+ ** MIT license
+ **
+ *****************************************************************************/
+#ifndef _RECYCLE_ALLOC_H
+#define _RECYCLE_ALLOC_H
+
+#include "arenaalloc.h"
+#include <string.h>
+#include <inttypes.h>
+
+namespace ArenaAlloc
+{
+
+ // todo:
+ // attempt refactor of boilerplate in _memblockimpl and _recycleallocimpl
+ template< typename AllocatorImpl, uint16_t StepSize = 16, uint16_t NumBuckets = 256 >
+ struct _recycleallocimpl : public _memblockimplbase<AllocatorImpl, _recycleallocimpl<AllocatorImpl> >
+ {
+ private:
+
+ static_assert( ( StepSize >= 16 && NumBuckets >= 16 ), "Min step size=16, Min num buckets=16" );
+ static_assert( !( StepSize & ( StepSize - 1 ) ), "Step size must be a power of 2" );
+
+ struct _freeEntry
+ {
+ // note: order of declaration matters
+ std::size_t m_size;
+ _freeEntry * m_next;
+ };
+
+ _freeEntry * m_buckets[ NumBuckets ]; // m_buckets[ NumBuckets - 1 ] is the oversize bucket
+
+ typedef struct _memblockimplbase< AllocatorImpl, _recycleallocimpl<AllocatorImpl> > base_t;
+ friend struct _memblockimplbase< AllocatorImpl, _recycleallocimpl<AllocatorImpl> >;
+
+ // to get around some sticky access issues between Alloc<T1> and Alloc<T2> when sharing
+ // the implementation.
+ template <typename U, typename A, typename M >
+ friend class Alloc;
+
+ template< typename T >
+ static void assign( const Alloc<T,AllocatorImpl, _recycleallocimpl<AllocatorImpl> >& src,
+ _recycleallocimpl *& dest )
+ {
+ dest = const_cast< _recycleallocimpl<AllocatorImpl>* >( src.m_impl );
+ }
+
+ static _recycleallocimpl<AllocatorImpl> * create( std::size_t defaultSize, AllocatorImpl& alloc )
+ {
+ return new (
+ alloc.allocate( sizeof( _recycleallocimpl ) ) ) _recycleallocimpl<AllocatorImpl>( defaultSize,
+ alloc );
+ }
+
+ static void destroy( _recycleallocimpl<AllocatorImpl> * objToDestroy )
+ {
+ AllocatorImpl allocImpl = objToDestroy->m_alloc;
+ objToDestroy-> ~_recycleallocimpl<AllocatorImpl>();
+ allocImpl.deallocate( objToDestroy );
+ }
+
+ _recycleallocimpl( std::size_t defaultSize, AllocatorImpl& allocImpl ):
+ _memblockimplbase<AllocatorImpl, _recycleallocimpl<AllocatorImpl> >( defaultSize, allocImpl )
+ {
+ memset( m_buckets, 0, sizeof( m_buckets ) );
+
+#ifdef ARENA_ALLOC_DEBUG
+      fprintf( stdout, "_recycleallocimpl=%p constructed with default size=%zu\n", this,
+               base_t::m_defaultSize );
+#endif
+ }
+
+ ~_recycleallocimpl( )
+ {
+#ifdef ARENA_ALLOC_DEBUG
+ fprintf( stdout, "~_recycleallocimpl() called on _recycleallocimpl=%p\n", this );
+#endif
+ base_t::clear();
+ }
+
+ char * allocate( std::size_t numBytes )
+ {
+
+ numBytes = ( (numBytes + sizeof( std::size_t ) + StepSize - 1) / StepSize ) * StepSize;
+
+ char * returnValue = allocateInternal( numBytes );
+ if( !returnValue )
+ {
+ char * allocValue = base_t::allocate( numBytes );
+
+ if( !allocValue )
+ return 0; //allocation failure
+
+ *((std::size_t*)allocValue ) = numBytes; // that includes the header
+ return allocValue + sizeof( std::size_t );
+ }
+
+ return returnValue;
+ }
+
+ void deallocate( void * ptr )
+ {
+ deallocateInternal( reinterpret_cast<char*>(ptr) );
+      base_t::deallocate( ptr ); // safe to call here since the base class version only updates stats
+ }
+
+ char * allocateInternal( std::size_t numBytes )
+ {
+      // numBytes must already be rounded to a multiple of StepSize and have an
+      // extra sizeof( std::size_t ) bytes tacked on for the header.
+      // The pointer returned points sizeof( std::size_t ) bytes into the allocation.
+      // Bucket 0 is always null in this scheme.
+
+ uint16_t bucketNumber = numBytes / StepSize;
+
+ if( bucketNumber > NumBuckets - 1 )
+ bucketNumber = NumBuckets - 1; // oversize alloc
+
+      // Search at most 3 consecutive buckets for an item large enough.
+      // In the oversize bucket, and only in the oversize bucket,
+      // search up to 3 items into the linked list for an entry
+      // large enough for the specified size.
+ for( uint16_t bkt = bucketNumber, i = 0; i < 3 && bkt < NumBuckets; ++i, ++bkt )
+ {
+ if( m_buckets[ bkt ] )
+ return allocateFrom( numBytes, m_buckets[ bkt ] );
+ }
+
+ return 0;
+ }
+
+ char * allocateFrom( std::size_t numBytes, _freeEntry *& head )
+ {
+ _freeEntry * current = head;
+ _freeEntry * prev = 0;
+
+ int count = 0;
+
+ while( current && count < 3 )
+ {
+ if( current->m_size >= numBytes )
+ {
+ if( prev == 0 )
+ head = current->m_next;
+ else
+ prev->m_next = current->m_next;
+
+ return reinterpret_cast<char*>(&current->m_next);
+ }
+
+ ++count;
+ prev = current;
+ current = current->m_next;
+ }
+
+ return 0;
+ }
+
+ void deallocateInternal( char * ptr )
+ {
+ _freeEntry * v = reinterpret_cast< _freeEntry* >( ptr - sizeof( std::size_t ) );
+ uint16_t bucketNumber = v->m_size / StepSize;
+
+ if( bucketNumber > NumBuckets - 1 )
+ bucketNumber = NumBuckets - 1;
+
+ _freeEntry * next = m_buckets[ bucketNumber ];
+ v->m_next = next;
+ m_buckets[ bucketNumber ] = v;
+ }
+
+ };
+
+ template< typename T, typename Allocator = _newAllocatorImpl >
+ using RecycleAlloc = Alloc< T, Allocator, _recycleallocimpl<Allocator> >;
+
+}
+
+#endif
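
Finally, a sketch of the recycling allocator in use with a node-based container, where erase/insert cycles exercise the free-list buckets; the container choice, arena size, and element counts are assumptions for illustration.

#include <cstdio>
#include <functional>
#include <map>
#include "recyclealloc.h"   // include path is an assumption

int main()
{
  typedef std::pair<const int, int> value_t;
  typedef ArenaAlloc::RecycleAlloc<value_t> alloc_t;

  alloc_t alloc( 65536 );
  std::map<int, int, std::less<int>, alloc_t> m( std::less<int>(), alloc );

  for( int i = 0; i < 1000; ++i )
    m[ i ] = i * i;

  // Erased nodes land in the size buckets and are handed back out on reinsert.
  m.erase( m.begin(), m.find( 500 ) );
  for( int i = 0; i < 500; ++i )
    m[ i ] = i;

  std::printf( "allocations=%zu deallocations=%zu\n",
               alloc.getNumAllocations(), alloc.getNumDeallocations() );
  return 0;
}
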