// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
#define CALLOC
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Memory allocator for objects
/// </summary>
/// <typeparam name="T">Type of record managed by the allocator</typeparam>
public unsafe class MallocFixedPageSize<T> : IDisposable
{
private const bool ForceUnpinnedAllocation = false;
private const int PageSizeBits = 16;
private const int PageSize = 1 << PageSizeBits;
private const int PageSizeMask = PageSize - 1;
private const int LevelSizeBits = 12;
private const int LevelSize = 1 << LevelSizeBits;
private const int LevelSizeMask = LevelSize - 1;
private T[][] values = new T[LevelSize][];
private GCHandle[] handles = new GCHandle[LevelSize];
private IntPtr[] pointers = new IntPtr[LevelSize];
private T[] values0;
private readonly GCHandle handles0;
private readonly IntPtr pointers0;
private readonly int RecordSize;
private readonly int AlignedPageSize;
private volatile int writeCacheLevel;
private volatile int count;
private readonly bool IsPinned;
private readonly bool ReturnPhysicalAddress;
private CountdownEvent checkpointEvent;
private readonly LightEpoch epoch;
private readonly bool ownedEpoch;
private FastThreadLocal<Queue<FreeItem>> freeList;
/// <summary>
/// Create new instance
/// </summary>
/// <param name="returnPhysicalAddress">Whether to return physical addresses instead of logical indices</param>
/// <param name="epoch">Epoch protection to use; a privately owned LightEpoch is created if null</param>
public MallocFixedPageSize(bool returnPhysicalAddress = false, LightEpoch epoch = null)
{
freeList = new FastThreadLocal<Queue<FreeItem>>();
if (epoch == null)
{
this.epoch = new LightEpoch();
ownedEpoch = true;
}
else
this.epoch = epoch;
values[0] = new T[PageSize];
#if !(CALLOC)
Array.Clear(values[0], 0, PageSize);
#endif
ReturnPhysicalAddress = returnPhysicalAddress;
if (ForceUnpinnedAllocation)
{
IsPinned = false;
ReturnPhysicalAddress = false;
}
else
{
IsPinned = true;
if (default(T) == null)
{
IsPinned = false;
ReturnPhysicalAddress = false;
}
else
{
// The surefire way to check if a type is blittable
// is to try GCHandle.Alloc with a handle type of Pinned.
// If it throws an exception, we know the type is not blittable.
try
{
handles[0] = GCHandle.Alloc(values[0], GCHandleType.Pinned);
pointers[0] = handles[0].AddrOfPinnedObject();
handles0 = handles[0];
pointers0 = pointers[0];
RecordSize = Marshal.SizeOf(values[0][0]);
AlignedPageSize = RecordSize * PageSize;
}
catch (Exception)
{
IsPinned = false;
ReturnPhysicalAddress = false;
}
}
}
values0 = values[0];
writeCacheLevel = -1;
Interlocked.MemoryBarrier();
BulkAllocate(); // null pointer
}
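// Illustrative usage sketch (not part of the allocator), assuming a caller-defined
// blittable struct MyRecord. By default Allocate returns a logical index, which is
// translated to memory via Get/Set or GetPhysicalAddress:
//
//   var alloc = new MallocFixedPageSize<MyRecord>();
//   alloc.Acquire();                       // register this thread with the epoch
//   long idx = alloc.Allocate();           // logical index of a zeroed record
//   alloc.Set(idx, ref myRecord);          // write the record
//   ref MyRecord r = ref alloc.Get(idx);   // read it back by reference
//   alloc.FreeAtEpoch(idx);                // return the slot to the free list
//   alloc.Release();
//   alloc.Dispose();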
/// <summary>
/// Get physical address
/// </summary>
/// <param name="address">Logical address obtained from Allocate</param>
/// <returns>Physical address of the record</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public long GetPhysicalAddress(long address)
{
if (ReturnPhysicalAddress)
{
return address;
}
else
{
return
(long)pointers[address >> PageSizeBits]
+ (long)(address & PageSizeMask) * RecordSize;
}
}
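// Worked example of the translation above: with PageSizeBits = 16, logical address
// 65540 decomposes into page 65540 >> 16 = 1 and offset 65540 & 0xFFFF = 4, so the
// physical address is pointers[1] + 4 * RecordSize.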
/// <summary>
/// Get object
/// </summary>
/// <param name="index">Logical index of the record</param>
/// <returns>Reference to the record</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public ref T Get(long index)
{
if (this.ReturnPhysicalAddress)
throw new Exception("Physical pointer returned by allocator: de-reference pointer to get records instead of calling Get");
return ref values
[index >> PageSizeBits]
[index & PageSizeMask];
}
/// <summary>
/// Set object
/// </summary>
/// <param name="index">Logical index of the record</param>
/// <param name="value">Value to store</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Set(long index, ref T value)
{
if (this.ReturnPhysicalAddress)
throw new Exception("Physical pointer returned by allocator: de-reference pointer to set records instead of calling Set (otherwise, set ForceUnpinnedAllocation to true)");
values
[index >> PageSizeBits]
[index & PageSizeMask]
= value;
}
/// <summary>
/// Free object
/// </summary>
/// <param name="pointer">Logical index of the record to free</param>
/// <param name="removed_epoch">Epoch at which the record was removed (-1 if unknown)</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void FreeAtEpoch(long pointer, int removed_epoch = -1)
{
if (!ReturnPhysicalAddress)
{
values[pointer >> PageSizeBits][pointer & PageSizeMask] = default(T);
}
freeList.InitializeThread();
if (freeList.Value == null)
freeList.Value = new Queue<FreeItem>();
freeList.Value.Enqueue(new FreeItem { removed_item = pointer, removal_epoch = removed_epoch });
}
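// Illustrative sketch (hypothetical caller code): FreeAtEpoch records the epoch at
// which the slot became unreachable; Allocate reuses a freed slot only after
// epoch.SafeToReclaimEpoch has advanced past that value.
//
//   int unlinkEpoch = ...;                // epoch at which the record was unlinked
//   alloc.FreeAtEpoch(idx, unlinkEpoch);  // slot is recycled once reclamation is safe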
private const int kAllocateChunkSize = 16;
/// <summary>
/// Warning: cannot mix 'n' match use of
/// Allocate and BulkAllocate
/// </summary>
/// <returns>Logical index (or physical address) of the first record in the chunk</returns>
public long BulkAllocate()
{
// Determine insertion index.
// ReSharper disable once CSharpWarnings::CS0420
#pragma warning disable 420
int index = Interlocked.Add(ref count, kAllocateChunkSize) - kAllocateChunkSize;
#pragma warning restore 420
int offset = index & PageSizeMask;
int baseAddr = index >> PageSizeBits;
// Handle indexes in first batch specially because they do not use write cache.
if (baseAddr == 0)
{
// If index 0, then allocate space for next level.
if (index == 0)
{
var tmp = new T[PageSize];
#if !(CALLOC)
Array.Clear(tmp, 0, PageSize);
#endif
if (IsPinned)
{
handles[1] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
pointers[1] = handles[1].AddrOfPinnedObject();
}
values[1] = tmp;
Interlocked.MemoryBarrier();
}
// Return location.
if (ReturnPhysicalAddress)
return (((long)pointers0) + index * RecordSize);
else
return index;
}
// See if write cache contains corresponding array.
var cache = writeCacheLevel;
T[] array;
if (cache != -1)
{
// Write cache is correct array only if index is within [arrayCapacity, 2*arrayCapacity).
if (cache == baseAddr)
{
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers[baseAddr]) + (long)offset * RecordSize;
else
return index;
}
}
// Write cache did not work, so get level information from index.
// int level = GetLevelFromIndex(index);
// Spin-wait until level has an allocated array.
var spinner = new SpinWait();
while (true)
{
array = values[baseAddr];
if (array != null)
{
break;
}
spinner.SpinOnce();
}
// Perform extra actions if inserting at offset 0 of level.
if (offset == 0)
{
// Update write cache to point to current level.
writeCacheLevel = baseAddr;
Interlocked.MemoryBarrier();
// Allocate for next page
int newBaseAddr = baseAddr + 1;
var tmp = new T[PageSize];
#if !(CALLOC)
Array.Clear(tmp, 0, PageSize);
#endif
if (IsPinned)
{
handles[newBaseAddr] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
pointers[newBaseAddr] = handles[newBaseAddr].AddrOfPinnedObject();
}
values[newBaseAddr] = tmp;
Interlocked.MemoryBarrier();
}
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers[baseAddr]) + (long)offset * RecordSize;
else
return index;
}
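// Illustrative sketch (hypothetical caller code): BulkAllocate reserves
// kAllocateChunkSize (16) consecutive slots and returns the first, so the chunk spans
// [first, first + kAllocateChunkSize). The constructor's BulkAllocate() call reserves
// the first chunk so that index 0 can act as a null pointer.
//
//   long first = alloc.BulkAllocate();
//   for (int i = 0; i < kAllocateChunkSize; i++)
//       alloc.Set(first + i, ref records[i]);  // valid only with logical indices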
/// <summary>
/// Allocate
/// </summary>
/// <returns>Logical index (or physical address) of the allocated record</returns>
public long Allocate()
{
freeList.InitializeThread();
if (freeList.Value == null)
{
freeList.Value = new Queue<FreeItem>();
}
if (freeList.Value.Count > 0)
{
if (freeList.Value.Peek().removal_epoch <= epoch.SafeToReclaimEpoch)
return freeList.Value.Dequeue().removed_item;
//if (freeList.Count % 64 == 0)
// LightEpoch.Instance.BumpCurrentEpoch();
}
// Determine insertion index.
// ReSharper disable once CSharpWarnings::CS0420
#pragma warning disable 420
int index = Interlocked.Increment(ref count) - 1;
#pragma warning restore 420
int offset = index & PageSizeMask;
int baseAddr = index >> PageSizeBits;
// Handle indexes in first batch specially because they do not use write cache.
if (baseAddr == 0)
{
// If index 0, then allocate space for next level.
if (index == 0)
{
var tmp = new T[PageSize];
#if !(CALLOC)
Array.Clear(tmp, 0, PageSize);
#endif
if (IsPinned)
{
handles[1] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
pointers[1] = handles[1].AddrOfPinnedObject();
}
values[1] = tmp;
Interlocked.MemoryBarrier();
}
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers0) + index * RecordSize;
else
return index;
}
// See if write cache contains corresponding array.
var cache = writeCacheLevel;
T[] array;
if (cache != -1)
{
// Write cache is correct array only if index is within [arrayCapacity, 2*arrayCapacity).
if (cache == baseAddr)
{
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers[baseAddr]) + (long)offset * RecordSize;
else
return index;
}
}
// Write cache did not work, so get level information from index.
// int level = GetLevelFromIndex(index);
// Spin-wait until level has an allocated array.
var spinner = new SpinWait();
while (true)
{
array = values[baseAddr];
if (array != null)
{
break;
}
spinner.SpinOnce();
}
// Perform extra actions if inserting at offset 0 of level.
if (offset == 0)
{
// Update write cache to point to current level.
writeCacheLevel = baseAddr;
Interlocked.MemoryBarrier();
// Allocate for next page
int newBaseAddr = baseAddr + 1;
var tmp = new T[PageSize];
#if !(CALLOC)
Array.Clear(tmp, 0, PageSize);
#endif
if (IsPinned)
{
handles[newBaseAddr] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
pointers[newBaseAddr] = handles[newBaseAddr].AddrOfPinnedObject();
}
values[newBaseAddr] = tmp;
Interlocked.MemoryBarrier();
}
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers[baseAddr]) + (long)offset * RecordSize;
else
return index;
}
/// <summary>
/// Acquire thread
/// </summary>
public void Acquire()
{
if (ownedEpoch)
epoch.Acquire();
freeList.InitializeThread();
}
/// <summary>
/// Release thread
/// </summary>
public void Release()
{
if (ownedEpoch)
epoch.Release();
freeList.DisposeThread();
}
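// Illustrative thread-lifecycle sketch (hypothetical caller code): when the allocator
// owns its epoch, each worker thread should bracket its use of the allocator so that
// epoch protection and the thread-local free list are set up and torn down:
//
//   alloc.Acquire();
//   try { /* Allocate / Get / Set / FreeAtEpoch calls */ }
//   finally { alloc.Release(); }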
/// <summary>
/// Dispose
/// </summary>
public void Dispose()
{
for (int i = 0; i < values.Length; i++)
{
if (IsPinned && (handles[i].IsAllocated)) handles[i].Free();
values[i] = null;
}
handles = null;
pointers = null;
values = null;
values0 = null;
count = 0;
if (ownedEpoch)
epoch.Dispose();
freeList.Dispose();
}
#region Checkpoint
/// <summary>
/// Public facing persistence API
/// </summary>
/// <param name="device">Device to write to</param>
/// <param name="start_offset">Starting offset on the device</param>
/// <param name="numBytes">Number of bytes written</param>
public void TakeCheckpoint(IDevice device, ulong start_offset, out ulong numBytes)
{
BeginCheckpoint(device, start_offset, out numBytes);
}
/// <summary>
/// Is checkpoint complete
/// </summary>
/// <param name="waitUntilComplete">Block until the checkpoint completes</param>
/// <returns>Whether the checkpoint has completed</returns>
public bool IsCheckpointCompleted(bool waitUntilComplete = false)
{
bool completed = checkpointEvent.IsSet;
if (!completed && waitUntilComplete)
{
checkpointEvent.Wait();
return true;
}
return completed;
}
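// Illustrative checkpoint sketch (hypothetical caller code), assuming an IDevice
// obtained elsewhere. TakeCheckpoint issues the page writes asynchronously;
// IsCheckpointCompleted(true) blocks until every page flush has signaled completion:
//
//   alloc.TakeCheckpoint(device, start_offset: 0, out ulong numBytes);
//   alloc.IsCheckpointCompleted(waitUntilComplete: true);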
internal void BeginCheckpoint(IDevice device, ulong offset, out ulong numBytesWritten)
{
int localCount = count;
int recordsCountInLastLevel = localCount & PageSizeMask;
int numCompleteLevels = localCount >> PageSizeBits;
int numLevels = numCompleteLevels + (recordsCountInLastLevel > 0 ? 1 : 0);
checkpointEvent = new CountdownEvent(numLevels);
uint alignedPageSize = PageSize * (uint)RecordSize;
uint lastLevelSize = (uint)recordsCountInLastLevel * (uint)RecordSize;
int sectorSize = (int)device.SectorSize;
numBytesWritten = 0;
for (int i = 0; i < numLevels; i++)
{
OverflowPagesFlushAsyncResult result = default(OverflowPagesFlushAsyncResult);
uint writeSize = (uint)((i == numCompleteLevels) ? (lastLevelSize + (sectorSize - 1)) & ~(sectorSize - 1) : alignedPageSize);
device.WriteAsync(pointers[i], offset + numBytesWritten, writeSize, AsyncFlushCallback, result);
numBytesWritten += writeSize;
}
}
private void AsyncFlushCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
try
{
if (errorCode != 0)
{
System.Diagnostics.Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
}
catch (Exception ex)
{
System.Diagnostics.Trace.TraceError("Completion Callback error, {0}", ex.Message);
}
finally
{
checkpointEvent.Signal();
Overlapped.Free(overlap);
}
}
/// <summary>
/// Max valid address
/// </summary>
/// <returns>Highest allocated index</returns>
public int GetMaxValidAddress()
{
return count;
}
/// <summary>
/// Get page size
/// </summary>
/// <returns>Page size, in records</returns>
public int GetPageSize()
{
return PageSize;
}
#endregion
#region Recover
/// <summary>
/// Recover
/// </summary>
/// <param name="device">Device to read from</param>
/// <param name="offset">Offset on the device</param>
/// <param name="buckets">Number of records to pre-allocate before reading</param>
/// <param name="numBytes">Number of bytes to read</param>
public void Recover(IDevice device, ulong offset, int buckets, ulong numBytes)
{
BeginRecovery(device, offset, buckets, numBytes, out ulong numBytesRead);
}
/// <summary>
/// Check if recovery complete
/// </summary>
/// <param name="waitUntilComplete">Block until recovery completes</param>
/// <returns>Whether recovery has completed</returns>
public bool IsRecoveryCompleted(bool waitUntilComplete = false)
{
bool completed = (numLevelsToBeRecovered == 0);
if (!completed && waitUntilComplete)
{
while (numLevelsToBeRecovered != 0)
{
Thread.Sleep(10);
}
}
return completed;
}
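// Illustrative recovery sketch (hypothetical caller code): numBytes should match the
// value reported by TakeCheckpoint, and buckets the number of records to pre-allocate:
//
//   alloc.Recover(device, offset: 0, buckets: numRecords, numBytes: numBytes);
//   alloc.IsRecoveryCompleted(waitUntilComplete: true);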
// Implementation of asynchronous recovery
private int numLevelsToBeRecovered;
internal void BeginRecovery(IDevice device,
ulong offset,
int buckets,
ulong numBytesToRead,
out ulong numBytesRead)
{
// Allocate as many records in memory as the number of requested buckets
while (count < buckets)
{
Allocate();
}
int numRecords = (int)numBytesToRead / RecordSize;
int recordsCountInLastLevel = numRecords & PageSizeMask;
int numCompleteLevels = numRecords >> PageSizeBits;
int numLevels = numCompleteLevels + (recordsCountInLastLevel > 0 ? 1 : 0);
numLevelsToBeRecovered = numLevels;
numBytesRead = 0;
uint alignedPageSize = (uint)PageSize * (uint)RecordSize;
uint lastLevelSize = (uint)recordsCountInLastLevel * (uint)RecordSize;
for (int i = 0; i < numLevels; i++)
{
// Read a full page
uint length = (uint)PageSize * (uint)RecordSize;
OverflowPagesReadAsyncResult result = default(OverflowPagesReadAsyncResult);
device.ReadAsync(offset + numBytesRead, pointers[i], length, AsyncPageReadCallback, result);
numBytesRead += (i == numCompleteLevels) ? lastLevelSize : alignedPageSize;
}
}
private void AsyncPageReadCallback(
uint errorCode,
uint numBytes,
NativeOverlapped* overlap)
{
try
{
if (errorCode != 0)
{
System.Diagnostics.Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
}
catch (Exception ex)
{
System.Diagnostics.Trace.TraceError("Completion Callback error, {0}", ex.Message);
}
finally
{
Interlocked.Decrement(ref numLevelsToBeRecovered);
Overlapped.Free(overlap);
}
}
#endregion
}
internal struct FreeItem
{
public long removed_item;
public int removal_epoch;
}
}