Copy project 'https://github.com/microsoft/FASTER' to ZeroLevel
Append DumpStorage
pull/1/head
a.bozhenov 5 years ago
parent 391f8d43d4
commit c5dfd30ee2

@@ -6,6 +6,9 @@ namespace TestApp
{
private static void Main(string[] args)
{
Configuration.Save(Configuration.ReadFromApplicationConfig());
Bootstrap.Startup<MyService>(args,
() => Configuration.ReadSetFromIniFile("config.ini"))

@@ -0,0 +1,38 @@
using System;
using System.Linq;
using Xunit;
using ZeroLevel.Services.Microservices.Dump;
using ZeroLevel.UnitTests.Models;
namespace ZeroLevel.UnitTests
{
public class DumpTests
{
[Fact]
public void DumpStorageTest()
{
// Arrange
var storage = new DumpStorage<TestSerializableDTO>();
var arr = new TestSerializableDTO[] {
new TestSerializableDTO { Id = 0, Title = "#1", Timestamp = DateTime.UtcNow.Ticks },
new TestSerializableDTO { Id = 1, Title = "#2", Timestamp = DateTime.UtcNow.Ticks },
new TestSerializableDTO { Id = 2, Title = "#3", Timestamp = DateTime.UtcNow.Ticks }
};
// Act
storage.Dump(arr[0]);
storage.Dump(arr[1]);
storage.Dump(arr[2]);
// Assert
int index = 0;
foreach (var entry in storage.ReadAndTruncate())
{
Assert.Equal(arr[index], entry);
index++;
}
// ReadAndTruncate drains the storage, so a second pass must be empty
Assert.Empty(storage.ReadAndTruncate());
}
}
}

@@ -0,0 +1,41 @@
using System;
using ZeroLevel.Services.Serialization;
namespace ZeroLevel.UnitTests.Models
{
public class TestSerializableDTO
: IBinarySerializable, IEquatable<TestSerializableDTO>
{
public long Id { get; set; }
public string Title { get; set; }
public long Timestamp { get; set; }
public void Deserialize(IBinaryReader reader)
{
this.Id = reader.ReadLong();
this.Title = reader.ReadString();
this.Timestamp = reader.ReadLong();
}
public override bool Equals(object obj)
{
return this.Equals(obj as TestSerializableDTO);
}
// Types overriding Equals must also override GetHashCode (CS0659);
// combine the same fields that participate in equality
public override int GetHashCode()
{
return Id.GetHashCode() ^ Timestamp.GetHashCode() ^ (Title?.GetHashCode() ?? 0);
}
public bool Equals(TestSerializableDTO other)
{
if (other == null) return false;
if (this.Id != other.Id) return false;
if (this.Timestamp != other.Timestamp) return false;
return string.Equals(this.Title, other.Title, StringComparison.Ordinal);
}
public void Serialize(IBinaryWriter writer)
{
writer.WriteLong(this.Id);
writer.WriteString(this.Title);
writer.WriteLong(this.Timestamp);
}
}
}

(File diff suppressed because it is too large.)

@@ -0,0 +1,80 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Concurrent;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Async IO context for PMM
/// </summary>
public unsafe struct AsyncIOContext<Key, Value>
{
/// <summary>
/// Id
/// </summary>
public long id;
/// <summary>
/// Key
/// </summary>
public IHeapContainer<Key> request_key;
/// <summary>
/// Retrieved key
/// </summary>
public Key key;
/// <summary>
/// Retrieved value
/// </summary>
public Value value;
/// <summary>
/// Logical address
/// </summary>
public long logicalAddress;
/// <summary>
/// Record buffer
/// </summary>
public SectorAlignedMemory record;
/// <summary>
/// Object buffer
/// </summary>
public SectorAlignedMemory objBuffer;
/// <summary>
/// Callback queue
/// </summary>
public BlockingCollection<AsyncIOContext<Key, Value>> callbackQueue;
/// <summary>
/// Dispose
/// </summary>
public void Dispose()
{
// Do not dispose request_key as it is a shallow copy
// of the key in pendingContext
record.Return();
}
}
internal class SimpleReadContext : IAsyncResult
{
public long logicalAddress;
public SectorAlignedMemory record;
public SemaphoreSlim completedRead;
public object AsyncState => throw new NotImplementedException();
public WaitHandle AsyncWaitHandle => throw new NotImplementedException();
public bool CompletedSynchronously => throw new NotImplementedException();
public bool IsCompleted => throw new NotImplementedException();
}
}
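For orientation, a minimal sketch (not part of this commit) of the hand-off pattern the callbackQueue field supports: the I/O completion path posts the filled context into the queue, and the thread that issued the read blocks on Take() to receive it. The host code and type arguments here are hypothetical.
using System.Collections.Concurrent;
var pending = new BlockingCollection<AsyncIOContext<long, long>>();
var ctx = new AsyncIOContext<long, long> { id = 1, callbackQueue = pending };
// completion side (e.g. inside an I/O callback): hand the filled context back
ctx.callbackQueue.Add(ctx);
// issuing side: block until the read is handed back
AsyncIOContext<long, long> completed = pending.Take();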

@@ -0,0 +1,91 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System.Threading;
using System.Runtime.InteropServices;
using System;
namespace FASTER.core
{
[StructLayout(LayoutKind.Explicit)]
struct AtomicOwner
{
[FieldOffset(0)]
int owner;
[FieldOffset(4)]
int count;
[FieldOffset(0)]
long atomic;
/// <summary>
/// Enqueue token
/// true: success + caller is new owner
/// false: success + someone else is owner
/// </summary>
/// <returns></returns>
public bool Enqueue()
{
while (true)
{
var older = this;
var newer = older;
newer.count++;
if (older.owner == 0)
newer.owner = 1;
if (Interlocked.CompareExchange(ref this.atomic, newer.atomic, older.atomic) == older.atomic)
{
return older.owner == 0;
}
}
}
/// <summary>
/// Dequeue token (caller is/remains owner)
/// true: successful dequeue
/// false: failed dequeue
/// </summary>
/// <returns></returns>
public bool Dequeue()
{
while (true)
{
var older = this;
var newer = older;
newer.count--;
if (Interlocked.CompareExchange(ref this.atomic, newer.atomic, older.atomic) == older.atomic)
{
return newer.count > 0;
}
}
}
/// <summary>
/// Release queue ownership
/// true: successful release
/// false: failed release
/// </summary>
/// <returns></returns>
public bool Release()
{
while (true)
{
var older = this;
var newer = older;
if (newer.count > 0)
return false;
if (newer.owner == 0)
throw new Exception("Invalid release by non-owner thread");
newer.owner = 0;
if (Interlocked.CompareExchange(ref this.atomic, newer.atomic, older.atomic) == older.atomic)
{
return true;
}
}
}
}
}
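For orientation, a minimal sketch (not part of this commit) of the drain protocol the struct encodes: each producer queues an item and then takes a token with Enqueue(); the single caller that gets true back becomes the owner and drains one item per token until Dequeue() reports none left, re-draining if Release() observes a token that raced in. The queue and Post function are hypothetical host code.
using System;
using System.Collections.Concurrent;
var queue = new ConcurrentQueue<Action>();
var owner = default(AtomicOwner);
void Post(Action work)
{
    queue.Enqueue(work);                     // publish the item first, then its token
    if (!owner.Enqueue()) return;            // false: the current owner will drain it
    while (true)
    {
        do
        {
            if (queue.TryDequeue(out var w)) // holding a token guarantees an item exists
                w();
        } while (owner.Dequeue());           // true: more tokens remain, keep draining
        if (owner.Release()) break;          // false: a token raced in, drain again
    }
}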

@@ -0,0 +1,401 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Runtime.InteropServices;
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
namespace FASTER.core
{
public unsafe sealed class BlittableAllocator<Key, Value> : AllocatorBase<Key, Value>
where Key : new()
where Value : new()
{
// Circular buffer definition
private byte[][] values;
private GCHandle[] handles;
private long[] pointers;
private readonly GCHandle ptrHandle;
private readonly long* nativePointers;
// Record sizes
private static readonly int recordSize = Utility.GetSize(default(Record<Key, Value>));
private static readonly int keySize = Utility.GetSize(default(Key));
private static readonly int valueSize = Utility.GetSize(default(Value));
public BlittableAllocator(LogSettings settings, IFasterEqualityComparer<Key> comparer, Action<long, long> evictCallback = null, LightEpoch epoch = null, Action<CommitInfo> flushCallback = null)
: base(settings, comparer, evictCallback, epoch, flushCallback)
{
values = new byte[BufferSize][];
handles = new GCHandle[BufferSize];
pointers = new long[BufferSize];
ptrHandle = GCHandle.Alloc(pointers, GCHandleType.Pinned);
nativePointers = (long*)ptrHandle.AddrOfPinnedObject();
}
public override void Initialize()
{
Initialize(Constants.kFirstValidAddress);
}
public override ref RecordInfo GetInfo(long physicalAddress)
{
return ref Unsafe.AsRef<RecordInfo>((void*)physicalAddress);
}
public override ref RecordInfo GetInfoFromBytePointer(byte* ptr)
{
return ref Unsafe.AsRef<RecordInfo>(ptr);
}
public override ref Key GetKey(long physicalAddress)
{
return ref Unsafe.AsRef<Key>((byte*)physicalAddress + RecordInfo.GetLength());
}
public override ref Value GetValue(long physicalAddress)
{
return ref Unsafe.AsRef<Value>((byte*)physicalAddress + RecordInfo.GetLength() + keySize);
}
public override int GetRecordSize(long physicalAddress)
{
return recordSize;
}
public override int GetAverageRecordSize()
{
return recordSize;
}
public override int GetInitialRecordSize<Input>(ref Key key, ref Input input)
{
return recordSize;
}
public override int GetRecordSize(ref Key key, ref Value value)
{
return recordSize;
}
/// <summary>
/// Dispose memory allocator
/// </summary>
public override void Dispose()
{
if (values != null)
{
for (int i = 0; i < values.Length; i++)
{
if (handles[i].IsAllocated)
handles[i].Free();
values[i] = null;
}
}
handles = null;
pointers = null;
values = null;
base.Dispose();
}
public override AddressInfo* GetKeyAddressInfo(long physicalAddress)
{
return (AddressInfo*)((byte*)physicalAddress + RecordInfo.GetLength());
}
public override AddressInfo* GetValueAddressInfo(long physicalAddress)
{
return (AddressInfo*)((byte*)physicalAddress + RecordInfo.GetLength() + keySize);
}
/// <summary>
/// Allocate memory page, pinned in memory, and in sector aligned form, if possible
/// </summary>
/// <param name="index"></param>
internal override void AllocatePage(int index)
{
var adjustedSize = PageSize + 2 * sectorSize;
byte[] tmp = new byte[adjustedSize];
Array.Clear(tmp, 0, adjustedSize);
handles[index] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
long p = (long)handles[index].AddrOfPinnedObject();
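// Round the pinned pointer up to the next sector boundary (e.g. with
// sectorSize = 512, 0x1234 becomes 0x1400) so unbuffered device I/O starts
// sector-aligned; the 2 * sectorSize over-allocation above leaves room for this shift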
pointers[index] = (p + (sectorSize - 1)) & ~(sectorSize - 1);
values[index] = tmp;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override long GetPhysicalAddress(long logicalAddress)
{
// Offset within page
int offset = (int)(logicalAddress & ((1L << LogPageSizeBits) - 1));
// Index of page within the circular buffer
int pageIndex = (int)((logicalAddress >> LogPageSizeBits) & (BufferSize - 1));
return *(nativePointers + pageIndex) + offset;
}
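// Illustrative decomposition (values hypothetical): with LogPageSizeBits = 20
// (1 MB pages) and BufferSize = 4, logical address 0x500123 splits into
// offset 0x123 within page 5; page 5 occupies circular-buffer slot 5 & 3 = 1,
// so the physical address is nativePointers[1] + 0x123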
protected override bool IsAllocated(int pageIndex)
{
return values[pageIndex] != null;
}
protected override void WriteAsync<TContext>(long flushPage, IOCompletionCallback callback, PageAsyncFlushResult<TContext> asyncResult)
{
WriteAsync((IntPtr)pointers[flushPage % BufferSize],
(ulong)(AlignedPageSizeBytes * flushPage),
(uint)AlignedPageSizeBytes,
callback,
asyncResult, device);
}
protected override void WriteAsyncToDevice<TContext>
(long startPage, long flushPage, int pageSize, IOCompletionCallback callback,
PageAsyncFlushResult<TContext> asyncResult, IDevice device, IDevice objectLogDevice)
{
var alignedPageSize = (pageSize + (sectorSize - 1)) & ~(sectorSize - 1);
WriteAsync((IntPtr)pointers[flushPage % BufferSize],
(ulong)(AlignedPageSizeBytes * (flushPage - startPage)),
(uint)alignedPageSize, callback, asyncResult,
device);
}
/// <summary>
/// Get start logical address
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public override long GetStartLogicalAddress(long page)
{
return page << LogPageSizeBits;
}
/// <summary>
/// Get first valid logical address
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public override long GetFirstValidLogicalAddress(long page)
{
if (page == 0)
return (page << LogPageSizeBits) + Constants.kFirstValidAddress;
return page << LogPageSizeBits;
}
protected override void ClearPage(long page, int offset)
{
if (offset == 0)
Array.Clear(values[page % BufferSize], offset, values[page % BufferSize].Length - offset);
else
{
// Adjust array offset for cache alignment
offset += (int)(pointers[page % BufferSize] - (long)handles[page % BufferSize].AddrOfPinnedObject());
Array.Clear(values[page % BufferSize], offset, values[page % BufferSize].Length - offset);
}
}
/// <summary>
/// Delete in-memory portion of the log
/// </summary>
internal override void DeleteFromMemory()
{
for (int i = 0; i < values.Length; i++)
{
if (handles[i].IsAllocated)
handles[i].Free();
values[i] = null;
}
handles = null;
pointers = null;
values = null;
}
private void WriteAsync<TContext>(IntPtr alignedSourceAddress, ulong alignedDestinationAddress, uint numBytesToWrite,
IOCompletionCallback callback, PageAsyncFlushResult<TContext> asyncResult,
IDevice device)
{
if (asyncResult.partial)
{
// Write only required bytes within the page
int aligned_start = (int)((asyncResult.fromAddress - (asyncResult.page << LogPageSizeBits)));
aligned_start = (aligned_start / sectorSize) * sectorSize;
int aligned_end = (int)((asyncResult.untilAddress - (asyncResult.page << LogPageSizeBits)));
aligned_end = ((aligned_end + (sectorSize - 1)) & ~(sectorSize - 1));
numBytesToWrite = (uint)(aligned_end - aligned_start);
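// e.g. with sectorSize = 512: a dirty range of page offsets [700, 1300)
// rounds down to aligned_start = 512 and up to aligned_end = 1536, so two
// whole sectors (1024 bytes) are written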
device.WriteAsync(alignedSourceAddress + aligned_start, alignedDestinationAddress + (ulong)aligned_start, numBytesToWrite, callback, asyncResult);
}
else
{
device.WriteAsync(alignedSourceAddress, alignedDestinationAddress,
numBytesToWrite, callback, asyncResult);
}
}
protected override void ReadAsync<TContext>(
ulong alignedSourceAddress, int destinationPageIndex, uint aligned_read_length,
IOCompletionCallback callback, PageAsyncReadResult<TContext> asyncResult, IDevice device, IDevice objlogDevice)
{
device.ReadAsync(alignedSourceAddress, (IntPtr)pointers[destinationPageIndex],
aligned_read_length, callback, asyncResult);
}
/// <summary>
/// Invoked by users to obtain a record from disk. It uses sector aligned memory to read
/// the record efficiently into memory.
/// </summary>
/// <param name="fromLogical"></param>
/// <param name="numBytes"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="result"></param>
protected override void AsyncReadRecordObjectsToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext<Key, Value> context, SectorAlignedMemory result = default(SectorAlignedMemory))
{
throw new InvalidOperationException("AsyncReadRecordObjectsToMemory invalid for BlittableAllocator");
}
/// <summary>
/// Retrieve objects from object log
/// </summary>
/// <param name="record"></param>
/// <param name="ctx"></param>
/// <returns></returns>
protected override bool RetrievedFullRecord(byte* record, ref AsyncIOContext<Key, Value> ctx)
{
ShallowCopy(ref GetKey((long)record), ref ctx.key);
ShallowCopy(ref GetValue((long)record), ref ctx.value);
return true;
}
/// <summary>
/// Whether KVS has keys to serialize/deserialize
/// </summary>
/// <returns></returns>
public override bool KeyHasObjects()
{
return false;
}
/// <summary>
/// Whether KVS has values to serialize/deserialize
/// </summary>
/// <returns></returns>
public override bool ValueHasObjects()
{
return false;
}
public override IHeapContainer<Key> GetKeyContainer(ref Key key) => new StandardHeapContainer<Key>(ref key);
public override IHeapContainer<Value> GetValueContainer(ref Value value) => new StandardHeapContainer<Value>(ref value);
public override long[] GetSegmentOffsets()
{
return null;
}
internal override void PopulatePage(byte* src, int required_bytes, long destinationPage)
{
throw new Exception("BlittableAllocator memory pages are sector aligned - use direct copy");
// Buffer.MemoryCopy(src, (void*)pointers[destinationPage % BufferSize], required_bytes, required_bytes);
}
/// <summary>
/// Iterator interface for scanning FASTER log
/// </summary>
/// <param name="beginAddress"></param>
/// <param name="endAddress"></param>
/// <param name="scanBufferingMode"></param>
/// <returns></returns>
public override IFasterScanIterator<Key, Value> Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode)
{
return new BlittableScanIterator<Key, Value>(this, beginAddress, endAddress, scanBufferingMode);
}
/// <summary>
/// Read pages from specified device
/// </summary>
/// <typeparam name="TContext"></typeparam>
/// <param name="readPageStart"></param>
/// <param name="numPages"></param>
/// <param name="untilAddress"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="frame"></param>
/// <param name="completed"></param>
/// <param name="devicePageOffset"></param>
/// <param name="device"></param>
/// <param name="objectLogDevice"></param>
/// <param name="cts"></param>
internal void AsyncReadPagesFromDeviceToFrame<TContext>(
long readPageStart,
int numPages,
long untilAddress,
IOCompletionCallback callback,
TContext context,
BlittableFrame frame,
out CountdownEvent completed,
long devicePageOffset = 0,
IDevice device = null,
IDevice objectLogDevice = null,
CancellationTokenSource cts = null)
{
var usedDevice = device;
IDevice usedObjlogDevice = objectLogDevice;
if (device == null)
{
usedDevice = this.device;
}
completed = new CountdownEvent(numPages);
for (long readPage = readPageStart; readPage < (readPageStart + numPages); readPage++)
{
int pageIndex = (int)(readPage % frame.frameSize);
if (frame.frame[pageIndex] == null)
{
frame.Allocate(pageIndex);
}
else
{
frame.Clear(pageIndex);
}
var asyncResult = new PageAsyncReadResult<TContext>()
{
page = readPage,
context = context,
handle = completed,
frame = frame,
cts = cts
};
ulong offsetInFile = (ulong)(AlignedPageSizeBytes * readPage);
uint readLength = (uint)AlignedPageSizeBytes;
long adjustedUntilAddress = (AlignedPageSizeBytes * (untilAddress >> LogPageSizeBits) + (untilAddress & PageSizeMask));
if (adjustedUntilAddress > 0 && ((adjustedUntilAddress - (long)offsetInFile) < PageSize))
{
readLength = (uint)(adjustedUntilAddress - (long)offsetInFile);
readLength = (uint)((readLength + (sectorSize - 1)) & ~(sectorSize - 1));
}
if (device != null)
offsetInFile = (ulong)(AlignedPageSizeBytes * (readPage - devicePageOffset));
usedDevice.ReadAsync(offsetInFile, (IntPtr)frame.pointers[pageIndex], readLength, callback, asyncResult);
}
}
}
}

@@ -0,0 +1,65 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Runtime.InteropServices;
namespace FASTER.core
{
/// <summary>
/// A frame is an in-memory circular buffer of log pages
/// </summary>
internal sealed class BlittableFrame : IDisposable
{
public readonly int frameSize, pageSize, sectorSize;
public readonly byte[][] frame;
public GCHandle[] handles;
public long[] pointers;
public BlittableFrame(int frameSize, int pageSize, int sectorSize)
{
this.frameSize = frameSize;
this.pageSize = pageSize;
this.sectorSize = sectorSize;
frame = new byte[frameSize][];
handles = new GCHandle[frameSize];
pointers = new long[frameSize];
}
public void Allocate(int index)
{
var adjustedSize = pageSize + 2 * sectorSize;
byte[] tmp = new byte[adjustedSize];
Array.Clear(tmp, 0, adjustedSize);
handles[index] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
long p = (long)handles[index].AddrOfPinnedObject();
pointers[index] = (p + (sectorSize - 1)) & ~(sectorSize - 1);
frame[index] = tmp;
}
public void Clear(int pageIndex)
{
Array.Clear(frame[pageIndex], 0, frame[pageIndex].Length);
}
public long GetPhysicalAddress(long frameNumber, long offset)
{
return pointers[frameNumber % frameSize] + offset;
}
public void Dispose()
{
for (int i = 0; i < frameSize; i++)
{
if (handles[i] != default(GCHandle))
handles[i].Free();
frame[i] = null;
pointers[i] = 0;
}
}
}
}

@@ -0,0 +1,238 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Threading;
using System.Diagnostics;
namespace FASTER.core
{
/// <summary>
/// Scan iterator for hybrid log
/// </summary>
public class BlittableScanIterator<Key, Value> : IFasterScanIterator<Key, Value>
where Key : new()
where Value : new()
{
private readonly int frameSize;
private readonly BlittableAllocator<Key, Value> hlog;
private readonly long beginAddress, endAddress;
private readonly BlittableFrame frame;
private readonly CountdownEvent[] loaded;
private bool first = true;
private long currentAddress, nextAddress;
private long currentPhysicalAddress;
/// <summary>
/// Current address
/// </summary>
public long CurrentAddress => currentAddress;
/// <summary>
/// Constructor
/// </summary>
/// <param name="hlog"></param>
/// <param name="beginAddress"></param>
/// <param name="endAddress"></param>
/// <param name="scanBufferingMode"></param>
public unsafe BlittableScanIterator(BlittableAllocator<Key, Value> hlog, long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode)
{
this.hlog = hlog;
if (beginAddress == 0)
beginAddress = hlog.GetFirstValidLogicalAddress(0);
this.beginAddress = beginAddress;
this.endAddress = endAddress;
currentAddress = -1;
nextAddress = beginAddress;
if (scanBufferingMode == ScanBufferingMode.SinglePageBuffering)
frameSize = 1;
else if (scanBufferingMode == ScanBufferingMode.DoublePageBuffering)
frameSize = 2;
else if (scanBufferingMode == ScanBufferingMode.NoBuffering)
{
frameSize = 0;
return;
}
frame = new BlittableFrame(frameSize, hlog.PageSize, hlog.GetDeviceSectorSize());
loaded = new CountdownEvent[frameSize];
// Only load addresses flushed to disk
if (nextAddress < hlog.HeadAddress)
{
var frameNumber = (nextAddress >> hlog.LogPageSizeBits) % frameSize;
hlog.AsyncReadPagesFromDeviceToFrame
(nextAddress >> hlog.LogPageSizeBits,
1, endAddress, AsyncReadPagesCallback, Empty.Default,
frame, out loaded[frameNumber]);
}
}
/// <summary>
/// Gets reference to current key
/// </summary>
/// <returns></returns>
public ref Key GetKey()
{
return ref hlog.GetKey(currentPhysicalAddress);
}
/// <summary>
/// Gets reference to current value
/// </summary>
/// <returns></returns>
public ref Value GetValue()
{
return ref hlog.GetValue(currentPhysicalAddress);
}
/// <summary>
/// Get next record
/// </summary>
/// <param name="recordInfo"></param>
/// <returns>True if record found, false if end of scan</returns>
public bool GetNext(out RecordInfo recordInfo)
{
recordInfo = default(RecordInfo);
currentAddress = nextAddress;
while (true)
{
// Check for boundary conditions
if (currentAddress >= endAddress)
{
return false;
}
if (currentAddress < hlog.BeginAddress)
{
throw new Exception("Iterator address is less than log BeginAddress " + hlog.BeginAddress);
}
if (frameSize == 0 && currentAddress < hlog.HeadAddress)
{
throw new Exception("Iterator address is less than log HeadAddress in memory-scan mode");
}
var currentPage = currentAddress >> hlog.LogPageSizeBits;
var offset = currentAddress & hlog.PageSizeMask;
if (currentAddress < hlog.HeadAddress)
BufferAndLoad(currentAddress, currentPage, currentPage % frameSize);
var physicalAddress = default(long);
if (currentAddress >= hlog.HeadAddress)
physicalAddress = hlog.GetPhysicalAddress(currentAddress);
else
physicalAddress = frame.GetPhysicalAddress(currentPage % frameSize, offset);
// Check if record fits on page, if not skip to next page
var recordSize = hlog.GetRecordSize(physicalAddress);
if ((currentAddress & hlog.PageSizeMask) + recordSize > hlog.PageSize)
{
currentAddress = (1 + (currentAddress >> hlog.LogPageSizeBits)) << hlog.LogPageSizeBits;
continue;
}
ref var info = ref hlog.GetInfo(physicalAddress);
if (info.Invalid || info.IsNull())
{
currentAddress += recordSize;
continue;
}
currentPhysicalAddress = physicalAddress;
recordInfo = info;
nextAddress = currentAddress + recordSize;
return true;
}
}
/// <summary>
/// Get next record in iterator
/// </summary>
/// <param name="recordInfo"></param>
/// <param name="key"></param>
/// <param name="value"></param>
/// <returns></returns>
public bool GetNext(out RecordInfo recordInfo, out Key key, out Value value)
{
key = default(Key);
value = default(Value);
if (GetNext(out recordInfo))
{
key = GetKey();
value = GetValue();
return true;
}
return false;
}
/// <summary>
/// Dispose the iterator
/// </summary>
public void Dispose()
{
frame?.Dispose();
}
private unsafe void BufferAndLoad(long currentAddress, long currentPage, long currentFrame)
{
if (first || (currentAddress & hlog.PageSizeMask) == 0)
{
// Prefetch pages based on buffering mode
if (frameSize == 1)
{
if (!first)
{
hlog.AsyncReadPagesFromDeviceToFrame(currentAddress >> hlog.LogPageSizeBits, 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[currentFrame]);
}
}
else
{
var endPage = endAddress >> hlog.LogPageSizeBits;
if ((endPage > currentPage) &&
((endPage > currentPage + 1) || ((endAddress & hlog.PageSizeMask) != 0)))
{
hlog.AsyncReadPagesFromDeviceToFrame(1 + (currentAddress >> hlog.LogPageSizeBits), 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[(currentPage + 1) % frameSize]);
}
}
first = false;
}
loaded[currentFrame].Wait();
}
private unsafe void AsyncReadPagesCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
var result = (PageAsyncReadResult<Empty>)Overlapped.Unpack(overlap).AsyncResult;
if (result.freeBuffer1 != null)
{
hlog.PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, result.page);
result.freeBuffer1.Return();
result.freeBuffer1 = null;
}
if (result.handle != null)
{
result.handle.Signal();
}
Interlocked.MemoryBarrier();
Overlapped.Free(overlap);
}
}
}

@ -0,0 +1,63 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System.Collections.Generic;
using System.Threading;
namespace FASTER.core
{
class ErrorList
{
private readonly List<long> errorList;
public ErrorList() => errorList = new List<long>();
public void Add(long address)
{
lock (errorList)
errorList.Add(address);
}
public uint CheckAndWait(long oldFlushedUntilAddress, long currentFlushedUntilAddress)
{
bool done = false;
uint errorCode = 0;
while (!done)
{
done = true;
lock (errorList)
{
for (int i = 0; i < errorList.Count; i++)
{
if (errorList[i] >= oldFlushedUntilAddress && errorList[i] < currentFlushedUntilAddress)
{
errorCode = 1;
}
else if (errorList[i] < oldFlushedUntilAddress)
{
done = false; // spin barrier for other threads during exception
Thread.Yield();
}
}
}
}
return errorCode;
}
public void RemoveUntil(long currentFlushedUntilAddress)
{
lock (errorList)
{
// Iterate in reverse: RemoveAt(i) in a forward loop would skip the
// element that slides into the vacated slot
for (int i = errorList.Count - 1; i >= 0; i--)
{
if (errorList[i] < currentFlushedUntilAddress)
{
errorList.RemoveAt(i);
}
}
}
}
public int Count => errorList.Count;
}
}
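A sketch (not part of this commit) of the call pattern the methods imply: failed flush addresses are recorded, a completion thread calls CheckAndWait() before publishing a new flushed-until watermark, and entries behind the watermark are then pruned. The address and watermark variables are hypothetical.
var errors = new ErrorList();
// on a failed page flush:
errors.Add(failedFlushAddress);
// before advancing the watermark from oldFlushedUntil to newFlushedUntil:
uint errorCode = errors.CheckAndWait(oldFlushedUntil, newFlushedUntil);
// errorCode == 0 means no failure fell inside [oldFlushedUntil, newFlushedUntil)
errors.RemoveUntil(newFlushedUntil); // drop entries the watermark has passed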

@@ -0,0 +1,968 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Collections.Generic;
using System.IO;
using System.Diagnostics;
using System.Runtime.InteropServices;
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
namespace FASTER.core
{
[StructLayout(LayoutKind.Sequential, Pack=1)]
public struct Record<Key, Value>
{
public RecordInfo info;
public Key key;
public Value value;
}
public unsafe sealed class GenericAllocator<Key, Value> : AllocatorBase<Key, Value>
where Key : new()
where Value : new()
{
// Circular buffer definition
internal Record<Key, Value>[][] values;
// Object log related variables
private readonly IDevice objectLogDevice;
// Size of object chunks being written to storage
private const int ObjectBlockSize = 100 * (1 << 20);
// Tail offsets per segment, in object log
public readonly long[] segmentOffsets;
// Record sizes
private static readonly int recordSize = Utility.GetSize(default(Record<Key, Value>));
private readonly SerializerSettings<Key, Value> SerializerSettings;
private readonly bool keyBlittable = Utility.IsBlittable<Key>();
private readonly bool valueBlittable = Utility.IsBlittable<Value>();
public GenericAllocator(LogSettings settings, SerializerSettings<Key, Value> serializerSettings, IFasterEqualityComparer<Key> comparer, Action<long, long> evictCallback = null, LightEpoch epoch = null, Action<CommitInfo> flushCallback = null)
: base(settings, comparer, evictCallback, epoch, flushCallback)
{
SerializerSettings = serializerSettings;
if ((!keyBlittable) && (settings.LogDevice as NullDevice == null) && ((SerializerSettings == null) || (SerializerSettings.keySerializer == null)))
{
throw new Exception("Key is not blittable, but no serializer specified via SerializerSettings");
}
if ((!valueBlittable) && (settings.LogDevice as NullDevice == null) && ((SerializerSettings == null) || (SerializerSettings.valueSerializer == null)))
{
throw new Exception("Value is not blittable, but no serializer specified via SerializerSettings");
}
values = new Record<Key, Value>[BufferSize][];
segmentOffsets = new long[SegmentBufferSize];
objectLogDevice = settings.ObjectLogDevice;
if ((settings.LogDevice as NullDevice == null) && (KeyHasObjects() || ValueHasObjects()))
{
if (objectLogDevice == null)
throw new Exception("Objects in key/value, but object log not provided during creation of FASTER instance");
}
}
public override void Initialize()
{
Initialize(recordSize);
}
/// <summary>
/// Get start logical address
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public override long GetStartLogicalAddress(long page)
{
return page << LogPageSizeBits;
}
/// <summary>
/// Get first valid logical address
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public override long GetFirstValidLogicalAddress(long page)
{
if (page == 0)
return (page << LogPageSizeBits) + recordSize;
return page << LogPageSizeBits;
}
public override ref RecordInfo GetInfo(long physicalAddress)
{
// Offset within page
int offset = (int)(physicalAddress & PageSizeMask);
// Index of page within the circular buffer
int pageIndex = (int)((physicalAddress >> LogPageSizeBits) & BufferSizeMask);
return ref values[pageIndex][offset/recordSize].info;
}
public override ref RecordInfo GetInfoFromBytePointer(byte* ptr)
{
return ref Unsafe.AsRef<Record<Key, Value>>(ptr).info;
}
public override ref Key GetKey(long physicalAddress)
{
// Offset within page
int offset = (int)(physicalAddress & PageSizeMask);
// Index of page within the circular buffer
int pageIndex = (int)((physicalAddress >> LogPageSizeBits) & BufferSizeMask);
return ref values[pageIndex][offset / recordSize].key;
}
public override ref Value GetValue(long physicalAddress)
{
// Offset within page
int offset = (int)(physicalAddress & PageSizeMask);
// Index of page within the circular buffer
int pageIndex = (int)((physicalAddress >> LogPageSizeBits) & BufferSizeMask);
return ref values[pageIndex][offset / recordSize].value;
}
public override int GetRecordSize(long physicalAddress)
{
return recordSize;
}
public override int GetAverageRecordSize()
{
return recordSize;
}
public override int GetInitialRecordSize<Input>(ref Key key, ref Input input)
{
return recordSize;
}
public override int GetRecordSize(ref Key key, ref Value value)
{
return recordSize;
}
/// <summary>
/// Dispose memory allocator
/// </summary>
public override void Dispose()
{
if (values != null)
{
for (int i = 0; i < values.Length; i++)
{
values[i] = null;
}
values = null;
}
base.Dispose();
}
/// <summary>
/// Delete in-memory portion of the log
/// </summary>
internal override void DeleteFromMemory()
{
for (int i = 0; i < values.Length; i++)
{
values[i] = null;
}
values = null;
}
public override AddressInfo* GetKeyAddressInfo(long physicalAddress)
{
return (AddressInfo*)Unsafe.AsPointer(ref Unsafe.AsRef<Record<Key, Value>>((byte*)physicalAddress).key);
}
public override AddressInfo* GetValueAddressInfo(long physicalAddress)
{
return (AddressInfo*)Unsafe.AsPointer(ref Unsafe.AsRef<Record<Key, Value>>((byte*)physicalAddress).value);
}
/// <summary>
/// Allocate memory page, pinned in memory, and in sector aligned form, if possible
/// </summary>
/// <param name="index"></param>
internal override void AllocatePage(int index)
{
values[index] = AllocatePage();
}
internal Record<Key, Value>[] AllocatePage()
{
Record<Key, Value>[] tmp;
if (PageSize % recordSize == 0)
tmp = new Record<Key, Value>[PageSize / recordSize];
else
tmp = new Record<Key, Value>[1 + (PageSize / recordSize)];
Array.Clear(tmp, 0, tmp.Length);
return tmp;
}
public override long GetPhysicalAddress(long logicalAddress)
{
return logicalAddress;
}
protected override bool IsAllocated(int pageIndex)
{
return values[pageIndex] != null;
}
protected override void TruncateUntilAddress(long toAddress)
{
base.TruncateUntilAddress(toAddress);
objectLogDevice.TruncateUntilAddress(toAddress);
}
protected override void WriteAsync<TContext>(long flushPage, IOCompletionCallback callback, PageAsyncFlushResult<TContext> asyncResult)
{
WriteAsync(flushPage,
(ulong)(AlignedPageSizeBytes * flushPage),
(uint)PageSize,
callback,
asyncResult, device, objectLogDevice);
}
protected override void WriteAsyncToDevice<TContext>
(long startPage, long flushPage, int pageSize, IOCompletionCallback callback,
PageAsyncFlushResult<TContext> asyncResult, IDevice device, IDevice objectLogDevice)
{
// We are writing to a separate device, so use fresh segment offsets
WriteAsync(flushPage,
(ulong)(AlignedPageSizeBytes * (flushPage - startPage)),
(uint)pageSize, callback, asyncResult,
device, objectLogDevice, flushPage, new long[SegmentBufferSize]);
}
protected override void ClearPage(long page, int offset)
{
Array.Clear(values[page % BufferSize], offset / recordSize, values[page % BufferSize].Length - offset / recordSize);
// Close segments
var thisCloseSegment = page >> (LogSegmentSizeBits - LogPageSizeBits);
var nextCloseSegment = (page + 1) >> (LogSegmentSizeBits - LogPageSizeBits);
if (thisCloseSegment != nextCloseSegment)
{
// We are clearing the last page in current segment
segmentOffsets[thisCloseSegment % SegmentBufferSize] = 0;
}
}
private void WriteAsync<TContext>(long flushPage, ulong alignedDestinationAddress, uint numBytesToWrite,
IOCompletionCallback callback, PageAsyncFlushResult<TContext> asyncResult,
IDevice device, IDevice objlogDevice, long intendedDestinationPage = -1, long[] localSegmentOffsets = null)
{
// Short circuit if we are using a null device
if (device as NullDevice != null)
{
device.WriteAsync(IntPtr.Zero, 0, 0, numBytesToWrite, callback, asyncResult);
return;
}
int start = 0, aligned_start = 0, end = (int)numBytesToWrite;
if (asyncResult.partial)
{
start = (int)((asyncResult.fromAddress - (asyncResult.page << LogPageSizeBits)));
aligned_start = (start / sectorSize) * sectorSize;
end = (int)((asyncResult.untilAddress - (asyncResult.page << LogPageSizeBits)));
}
// Check if user did not override with special segment offsets
if (localSegmentOffsets == null) localSegmentOffsets = segmentOffsets;
var src = values[flushPage % BufferSize];
var buffer = bufferPool.Get((int)numBytesToWrite);
if (aligned_start < start && (KeyHasObjects() || ValueHasObjects()))
{
// Do not read back the invalid header of page 0
if ((flushPage > 0) || (start > GetFirstValidLogicalAddress(flushPage)))
{
// Get the overlapping HLOG from disk as we wrote it with
// object pointers previously. This avoids object reserialization
PageAsyncReadResult<Empty> result =
new PageAsyncReadResult<Empty>
{
handle = new CountdownEvent(1)
};
device.ReadAsync(alignedDestinationAddress + (ulong)aligned_start, (IntPtr)buffer.aligned_pointer + aligned_start,
(uint)sectorSize, AsyncReadPageCallback, result);
result.handle.Wait();
}
fixed (RecordInfo* pin = &src[0].info)
{
Debug.Assert(buffer.aligned_pointer + numBytesToWrite <= (byte*)buffer.handle.AddrOfPinnedObject() + buffer.buffer.Length);
Buffer.MemoryCopy((void*)((long)Unsafe.AsPointer(ref src[0]) + start), buffer.aligned_pointer + start,
numBytesToWrite - start, numBytesToWrite - start);
}
}
else
{
fixed (RecordInfo* pin = &src[0].info)
{
Debug.Assert(buffer.aligned_pointer + numBytesToWrite <= (byte*)buffer.handle.AddrOfPinnedObject() + buffer.buffer.Length);
Buffer.MemoryCopy((void*)((long)Unsafe.AsPointer(ref src[0]) + aligned_start), buffer.aligned_pointer + aligned_start,
numBytesToWrite - aligned_start, numBytesToWrite - aligned_start);
}
}
long ptr = (long)buffer.aligned_pointer;
List<long> addr = new List<long>();
asyncResult.freeBuffer1 = buffer;
MemoryStream ms = new MemoryStream();
IObjectSerializer<Key> keySerializer = null;
IObjectSerializer<Value> valueSerializer = null;
if (KeyHasObjects())
{
keySerializer = SerializerSettings.keySerializer();
keySerializer.BeginSerialize(ms);
}
if (ValueHasObjects())
{
valueSerializer = SerializerSettings.valueSerializer();
valueSerializer.BeginSerialize(ms);
}
for (int i=start/recordSize; i<end/recordSize; i++)
{
if (!src[i].info.Invalid)
{
if (KeyHasObjects())
{
long pos = ms.Position;
keySerializer.Serialize(ref src[i].key);
var key_address = GetKeyAddressInfo((long)(buffer.aligned_pointer + i * recordSize));
key_address->Address = pos;
key_address->Size = (int)(ms.Position - pos);
addr.Add((long)key_address);
}
if (ValueHasObjects() && !src[i].info.Tombstone)
{
long pos = ms.Position;
valueSerializer.Serialize(ref src[i].value);
var value_address = GetValueAddressInfo((long)(buffer.aligned_pointer + i * recordSize));
value_address->Address = pos;
value_address->Size = (int)(ms.Position - pos);
addr.Add((long)value_address);
}
}
if (ms.Position > ObjectBlockSize || i == (end / recordSize) - 1)
{
var memoryStreamLength = (int)ms.Position;
var _objBuffer = bufferPool.Get(memoryStreamLength);
asyncResult.done = new AutoResetEvent(false);
var _alignedLength = (memoryStreamLength + (sectorSize - 1)) & ~(sectorSize - 1);
var _objAddr = Interlocked.Add(ref localSegmentOffsets[(long)(alignedDestinationAddress >> LogSegmentSizeBits) % SegmentBufferSize], _alignedLength) - _alignedLength;
if (KeyHasObjects())
keySerializer.EndSerialize();
if (ValueHasObjects())
valueSerializer.EndSerialize();
ms.Close();
fixed (void* src_ = ms.GetBuffer())
Buffer.MemoryCopy(src_, _objBuffer.aligned_pointer, memoryStreamLength, memoryStreamLength);
foreach (var address in addr)
((AddressInfo*)address)->Address += _objAddr;
if (i < (end / recordSize) - 1)
{
ms = new MemoryStream();
if (KeyHasObjects())
keySerializer.BeginSerialize(ms);
if (ValueHasObjects())
valueSerializer.BeginSerialize(ms);
objlogDevice.WriteAsync(
(IntPtr)_objBuffer.aligned_pointer,
(int)(alignedDestinationAddress >> LogSegmentSizeBits),
(ulong)_objAddr, (uint)_alignedLength, AsyncFlushPartialObjectLogCallback<TContext>, asyncResult);
// Wait for write to complete before resuming next write
asyncResult.done.WaitOne();
_objBuffer.Return();
}
else
{
// need to write both page and object cache
Interlocked.Increment(ref asyncResult.count);
asyncResult.freeBuffer2 = _objBuffer;
objlogDevice.WriteAsync(
(IntPtr)_objBuffer.aligned_pointer,
(int)(alignedDestinationAddress >> LogSegmentSizeBits),
(ulong)_objAddr, (uint)_alignedLength, callback, asyncResult);
}
}
}
if (asyncResult.partial)
{
var aligned_end = (int)((asyncResult.untilAddress - (asyncResult.page << LogPageSizeBits)));
aligned_end = ((aligned_end + (sectorSize - 1)) & ~(sectorSize - 1));
numBytesToWrite = (uint)(aligned_end - aligned_start);
}
var alignedNumBytesToWrite = (uint)((numBytesToWrite + (sectorSize - 1)) & ~(sectorSize - 1));
// Finally write the hlog page
device.WriteAsync((IntPtr)buffer.aligned_pointer + aligned_start, alignedDestinationAddress + (ulong)aligned_start,
alignedNumBytesToWrite, callback, asyncResult);
}
private void AsyncReadPageCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
// Set the page status to flushed
var result = (PageAsyncReadResult<Empty>)Overlapped.Unpack(overlap).AsyncResult;
result.handle.Signal();
Overlapped.Free(overlap);
}
protected override void ReadAsync<TContext>(
ulong alignedSourceAddress, int destinationPageIndex, uint aligned_read_length,
IOCompletionCallback callback, PageAsyncReadResult<TContext> asyncResult, IDevice device, IDevice objlogDevice)
{
asyncResult.freeBuffer1 = bufferPool.Get((int)aligned_read_length);
asyncResult.freeBuffer1.required_bytes = (int)aligned_read_length;
if (!(KeyHasObjects() || ValueHasObjects()))
{
device.ReadAsync(alignedSourceAddress, (IntPtr)asyncResult.freeBuffer1.aligned_pointer,
aligned_read_length, callback, asyncResult);
return;
}
asyncResult.callback = callback;
if (objlogDevice == null)
{
Debug.Assert(objectLogDevice != null);
objlogDevice = objectLogDevice;
}
asyncResult.objlogDevice = objlogDevice;
device.ReadAsync(alignedSourceAddress, (IntPtr)asyncResult.freeBuffer1.aligned_pointer,
aligned_read_length, AsyncReadPageWithObjectsCallback<TContext>, asyncResult);
}
/// <summary>
/// IOCompletion callback for page flush
/// </summary>
/// <param name="errorCode"></param>
/// <param name="numBytes"></param>
/// <param name="overlap"></param>
private void AsyncFlushPartialObjectLogCallback<TContext>(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
// Set the page status to flushed
PageAsyncFlushResult<TContext> result = (PageAsyncFlushResult<TContext>)Overlapped.Unpack(overlap).AsyncResult;
result.done.Set();
Overlapped.Free(overlap);
}
private void AsyncReadPageWithObjectsCallback<TContext>(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
PageAsyncReadResult<TContext> result = (PageAsyncReadResult<TContext>)Overlapped.Unpack(overlap).AsyncResult;
Record<Key, Value>[] src;
// We are reading into a frame
if (result.frame != null)
{
var frame = (GenericFrame<Key, Value>)result.frame;
src = frame.GetPage(result.page % frame.frameSize);
}
else
src = values[result.page % BufferSize];
// Deserialize all objects until untilptr
if (result.resumePtr < result.untilPtr)
{
MemoryStream ms = new MemoryStream(result.freeBuffer2.buffer);
ms.Seek(result.freeBuffer2.offset, SeekOrigin.Begin);
Deserialize(result.freeBuffer1.GetValidPointer(), result.resumePtr, result.untilPtr, src, ms);
ms.Dispose();
result.freeBuffer2.Return();
result.freeBuffer2 = null;
result.resumePtr = result.untilPtr;
}
// If we have processed entire page, return
if (result.untilPtr >= result.maxPtr)
{
result.Free();
// Call the "real" page read callback
result.callback(errorCode, numBytes, overlap);
return;
}
// We will be re-issuing I/O, so free current overlap
Overlapped.Free(overlap);
// We will now be able to process all records until (but not including) untilPtr
GetObjectInfo(result.freeBuffer1.GetValidPointer(), ref result.untilPtr, result.maxPtr, ObjectBlockSize, out long startptr, out long size);
// Object log fragment should be aligned by construction
Debug.Assert(startptr % sectorSize == 0);
if (size > int.MaxValue)
throw new Exception("Unable to read object page, total size greater than 2GB: " + size);
var alignedLength = (size + (sectorSize - 1)) & ~(sectorSize - 1);
var objBuffer = bufferPool.Get((int)alignedLength);
result.freeBuffer2 = objBuffer;
// Request objects from objlog
result.objlogDevice.ReadAsync(
(int)(result.page >> (LogSegmentSizeBits - LogPageSizeBits)),
(ulong)startptr,
(IntPtr)objBuffer.aligned_pointer, (uint)alignedLength, AsyncReadPageWithObjectsCallback<TContext>, result);
}
/// <summary>
/// Invoked by users to obtain a record from disk. It uses sector aligned memory to read
/// the record efficiently into memory.
/// </summary>
/// <param name="fromLogical"></param>
/// <param name="numBytes"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="result"></param>
protected override void AsyncReadRecordObjectsToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext<Key, Value> context, SectorAlignedMemory result = default(SectorAlignedMemory))
{
ulong fileOffset = (ulong)(AlignedPageSizeBytes * (fromLogical >> LogPageSizeBits) + (fromLogical & PageSizeMask));
ulong alignedFileOffset = (ulong)(((long)fileOffset / sectorSize) * sectorSize);
uint alignedReadLength = (uint)((long)fileOffset + numBytes - (long)alignedFileOffset);
alignedReadLength = (uint)((alignedReadLength + (sectorSize - 1)) & ~(sectorSize - 1));
var record = bufferPool.Get((int)alignedReadLength);
record.valid_offset = (int)(fileOffset - alignedFileOffset);
record.available_bytes = (int)(alignedReadLength - (fileOffset - alignedFileOffset));
record.required_bytes = numBytes;
var asyncResult = default(AsyncGetFromDiskResult<AsyncIOContext<Key, Value>>);
asyncResult.context = context;
asyncResult.context.record = result;
asyncResult.context.objBuffer = record;
objectLogDevice.ReadAsync(
(int)(context.logicalAddress >> LogSegmentSizeBits),
alignedFileOffset,
(IntPtr)asyncResult.context.objBuffer.aligned_pointer,
alignedReadLength,
callback,
asyncResult);
}
/// <summary>
/// Read pages from specified device
/// </summary>
/// <typeparam name="TContext"></typeparam>
/// <param name="readPageStart"></param>
/// <param name="numPages"></param>
/// <param name="untilAddress"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="frame"></param>
/// <param name="completed"></param>
/// <param name="devicePageOffset"></param>
/// <param name="device"></param>
/// <param name="objectLogDevice"></param>
internal void AsyncReadPagesFromDeviceToFrame<TContext>(
long readPageStart,
int numPages,
long untilAddress,
IOCompletionCallback callback,
TContext context,
GenericFrame<Key, Value> frame,
out CountdownEvent completed,
long devicePageOffset = 0,
IDevice device = null, IDevice objectLogDevice = null)
{
var usedDevice = device;
IDevice usedObjlogDevice = objectLogDevice;
if (device == null)
{
usedDevice = this.device;
}
completed = new CountdownEvent(numPages);
for (long readPage = readPageStart; readPage < (readPageStart + numPages); readPage++)
{
int pageIndex = (int)(readPage % frame.frameSize);
if (frame.GetPage(pageIndex) == null)
{
frame.Allocate(pageIndex);
}
else
{
frame.Clear(pageIndex);
}
var asyncResult = new PageAsyncReadResult<TContext>()
{
page = readPage,
context = context,
handle = completed,
maxPtr = PageSize,
frame = frame,
};
ulong offsetInFile = (ulong)(AlignedPageSizeBytes * readPage);
uint readLength = (uint)AlignedPageSizeBytes;
long adjustedUntilAddress = (AlignedPageSizeBytes * (untilAddress >> LogPageSizeBits) + (untilAddress & PageSizeMask));
if (adjustedUntilAddress > 0 && ((adjustedUntilAddress - (long)offsetInFile) < PageSize))
{
readLength = (uint)(adjustedUntilAddress - (long)offsetInFile);
asyncResult.maxPtr = readLength;
readLength = (uint)((readLength + (sectorSize - 1)) & ~(sectorSize - 1));
}
if (device != null)
offsetInFile = (ulong)(AlignedPageSizeBytes * (readPage - devicePageOffset));
ReadAsync(offsetInFile, pageIndex, readLength, callback, asyncResult, usedDevice, usedObjlogDevice);
}
}
#region Page handlers for objects
/// <summary>
/// Deserialize part of page from stream
/// </summary>
/// <param name="raw"></param>
/// <param name="ptr">From pointer</param>
/// <param name="untilptr">Until pointer</param>
/// <param name="src"></param>
/// <param name="stream">Stream</param>
public void Deserialize(byte *raw, long ptr, long untilptr, Record<Key, Value>[] src, Stream stream)
{
IObjectSerializer<Key> keySerializer = null;
IObjectSerializer<Value> valueSerializer = null;
long streamStartPos = stream.Position;
long start_addr = -1;
if (KeyHasObjects())
{
keySerializer = SerializerSettings.keySerializer();
keySerializer.BeginDeserialize(stream);
}
if (ValueHasObjects())
{
valueSerializer = SerializerSettings.valueSerializer();
valueSerializer.BeginDeserialize(stream);
}
while (ptr < untilptr)
{
ref Record<Key, Value> record = ref Unsafe.AsRef<Record<Key, Value>>(raw + ptr);
src[ptr / recordSize].info = record.info;
if (!record.info.Invalid)
{
if (KeyHasObjects())
{
var key_addr = GetKeyAddressInfo((long)raw + ptr);
if (start_addr == -1) start_addr = key_addr->Address;
if (stream.Position != streamStartPos + key_addr->Address - start_addr)
{
stream.Seek(streamStartPos + key_addr->Address - start_addr, SeekOrigin.Begin);
}
src[ptr / recordSize].key = new Key();
keySerializer.Deserialize(ref src[ptr/recordSize].key);
}
else
{
src[ptr / recordSize].key = record.key;
}
if (!record.info.Tombstone)
{
if (ValueHasObjects())
{
var value_addr = GetValueAddressInfo((long)raw + ptr);
if (start_addr == -1) start_addr = value_addr->Address;
if (stream.Position != streamStartPos + value_addr->Address - start_addr)
{
stream.Seek(streamStartPos + value_addr->Address - start_addr, SeekOrigin.Begin);
}
src[ptr / recordSize].value = new Value();
valueSerializer.Deserialize(ref src[ptr / recordSize].value);
}
else
{
src[ptr / recordSize].value = record.value;
}
}
}
ptr += GetRecordSize(ptr);
}
if (KeyHasObjects())
{
keySerializer.EndDeserialize();
}
if (ValueHasObjects())
{
valueSerializer.EndDeserialize();
}
}
/// <summary>
/// Get location and range of object log addresses for specified log page
/// </summary>
/// <param name="raw"></param>
/// <param name="ptr"></param>
/// <param name="untilptr"></param>
/// <param name="objectBlockSize"></param>
/// <param name="startptr"></param>
/// <param name="size"></param>
public void GetObjectInfo(byte* raw, ref long ptr, long untilptr, int objectBlockSize, out long startptr, out long size)
{
long minObjAddress = long.MaxValue;
long maxObjAddress = long.MinValue;
while (ptr < untilptr)
{
ref Record<Key, Value> record = ref Unsafe.AsRef<Record<Key, Value>>(raw + ptr);
if (!record.info.Invalid)
{
if (KeyHasObjects())
{
var key_addr = GetKeyAddressInfo((long)raw + ptr);
var addr = key_addr->Address;
// If object pointer is greater than kObjectSize from starting object pointer
if (minObjAddress != long.MaxValue && (addr - minObjAddress > objectBlockSize))
{
break;
}
if (addr < minObjAddress) minObjAddress = addr;
addr += key_addr->Size;
if (addr > maxObjAddress) maxObjAddress = addr;
}
if (ValueHasObjects() && !record.info.Tombstone)
{
var value_addr = GetValueAddressInfo((long)raw + ptr);
var addr = value_addr->Address;
// If object pointer is greater than kObjectSize from starting object pointer
if (minObjAddress != long.MaxValue && (addr - minObjAddress > objectBlockSize))
{
break;
}
if (addr < minObjAddress) minObjAddress = addr;
addr += value_addr->Size;
if (addr > maxObjAddress) maxObjAddress = addr;
}
}
ptr += GetRecordSize(ptr);
}
// Handle the case where no objects are to be written
if (minObjAddress == long.MaxValue && maxObjAddress == long.MinValue)
{
minObjAddress = 0;
maxObjAddress = 0;
}
startptr = minObjAddress;
size = maxObjAddress - minObjAddress;
}
/// <summary>
/// Retrieve objects from object log
/// </summary>
/// <param name="record"></param>
/// <param name="ctx"></param>
/// <returns></returns>
protected override bool RetrievedFullRecord(byte* record, ref AsyncIOContext<Key, Value> ctx)
{
if (!KeyHasObjects())
{
ShallowCopy(ref Unsafe.AsRef<Record<Key, Value>>(record).key, ref ctx.key);
}
if (!ValueHasObjects())
{
ShallowCopy(ref Unsafe.AsRef<Record<Key, Value>>(record).value, ref ctx.value);
}
if (!(KeyHasObjects() || ValueHasObjects()))
return true;
if (ctx.objBuffer == null)
{
// Issue IO for objects
long startAddress = -1;
long endAddress = -1;
if (KeyHasObjects())
{
var x = GetKeyAddressInfo((long)record);
startAddress = x->Address;
endAddress = x->Address + x->Size;
}
if (ValueHasObjects() && !GetInfoFromBytePointer(record).Tombstone)
{
var x = GetValueAddressInfo((long)record);
if (startAddress == -1)
startAddress = x->Address;
endAddress = x->Address + x->Size;
}
// We are limited to a 2GB size per key-value
if (endAddress-startAddress > int.MaxValue)
throw new Exception("Size of key-value exceeds max of 2GB: " + (endAddress - startAddress));
AsyncGetFromDisk(startAddress, (int)(endAddress - startAddress), ctx, ctx.record);
return false;
}
// Parse the key and value objects
MemoryStream ms = new MemoryStream(ctx.objBuffer.buffer);
ms.Seek(ctx.objBuffer.offset + ctx.objBuffer.valid_offset, SeekOrigin.Begin);
if (KeyHasObjects())
{
ctx.key = new Key();
var keySerializer = SerializerSettings.keySerializer();
keySerializer.BeginDeserialize(ms);
keySerializer.Deserialize(ref ctx.key);
keySerializer.EndDeserialize();
}
if (ValueHasObjects() && !GetInfoFromBytePointer(record).Tombstone)
{
ctx.value = new Value();
var valueSerializer = SerializerSettings.valueSerializer();
valueSerializer.BeginDeserialize(ms);
valueSerializer.Deserialize(ref ctx.value);
valueSerializer.EndDeserialize();
}
ctx.objBuffer.Return();
return true;
}
/// <summary>
/// Whether KVS has keys to serialize/deserialize
/// </summary>
/// <returns></returns>
public override bool KeyHasObjects()
{
return SerializerSettings.keySerializer != null;
}
/// <summary>
/// Whether KVS has values to serialize/deserialize
/// </summary>
/// <returns></returns>
public override bool ValueHasObjects()
{
return SerializerSettings.valueSerializer != null;
}
#endregion
public override IHeapContainer<Key> GetKeyContainer(ref Key key) => new StandardHeapContainer<Key>(ref key);
public override IHeapContainer<Value> GetValueContainer(ref Value value) => new StandardHeapContainer<Value>(ref value);
public override long[] GetSegmentOffsets()
{
return segmentOffsets;
}
internal override void PopulatePage(byte* src, int required_bytes, long destinationPage)
{
PopulatePage(src, required_bytes, ref values[destinationPage % BufferSize]);
}
internal void PopulatePageFrame(byte* src, int required_bytes, Record<Key, Value>[] frame)
{
PopulatePage(src, required_bytes, ref frame);
}
internal void PopulatePage(byte* src, int required_bytes, ref Record<Key, Value>[] destinationPage)
{
fixed (RecordInfo* pin = &destinationPage[0].info)
{
Debug.Assert(required_bytes <= recordSize * destinationPage.Length);
Buffer.MemoryCopy(src, Unsafe.AsPointer(ref destinationPage[0]), required_bytes, required_bytes);
}
}
/// <summary>
/// Iterator interface for scanning FASTER log
/// </summary>
/// <param name="beginAddress"></param>
/// <param name="endAddress"></param>
/// <param name="scanBufferingMode"></param>
/// <returns></returns>
public override IFasterScanIterator<Key, Value> Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode)
{
return new GenericScanIterator<Key, Value>(this, beginAddress, endAddress, scanBufferingMode);
}
}
}
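The constructor guards above amount to a configuration contract: non-blittable keys or values need serializer factories, and object-typed keys/values additionally need an object log device. A hedged sketch follows (MyKey, MyValue and their serializers are hypothetical; keySerializer, valueSerializer, LogDevice and ObjectLogDevice are the field names used above, and Devices.CreateLogDevice is the FASTER-era API this copy carries):
var serializerSettings = new SerializerSettings<MyKey, MyValue>
{
    keySerializer = () => new MyKeySerializer(),     // required: MyKey is a class, not blittable
    valueSerializer = () => new MyValueSerializer()  // required: MyValue is a class
};
var logSettings = new LogSettings
{
    LogDevice = Devices.CreateLogDevice("hlog.log"),
    // mandatory when keys or values are serialized as objects
    ObjectLogDevice = Devices.CreateLogDevice("hlog.obj.log")
};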

@@ -0,0 +1,67 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
namespace FASTER.core
{
/// <summary>
/// A frame is an in-memory circular buffer of log pages
/// </summary>
internal sealed class GenericFrame<Key, Value> : IDisposable
{
private readonly Record<Key, Value>[][] frame;
public readonly int frameSize, pageSize;
private readonly int recordSize = Utility.GetSize(default(Record<Key, Value>));
public GenericFrame(int frameSize, int pageSize)
{
this.frameSize = frameSize;
this.pageSize = pageSize;
frame = new Record<Key, Value>[frameSize][];
}
public void Allocate(int index)
{
Record<Key, Value>[] tmp;
if (pageSize % recordSize == 0)
tmp = new Record<Key, Value>[pageSize / recordSize];
else
tmp = new Record<Key, Value>[1 + (pageSize / recordSize)];
Array.Clear(tmp, 0, tmp.Length);
frame[index] = tmp;
}
public void Clear(int pageIndex)
{
Array.Clear(frame[pageIndex], 0, frame[pageIndex].Length);
}
public ref Key GetKey(long frameNumber, long offset)
{
return ref frame[frameNumber][offset].key;
}
public ref Value GetValue(long frameNumber, long offset)
{
return ref frame[frameNumber][offset].value;
}
public ref RecordInfo GetInfo(long frameNumber, long offset)
{
return ref frame[frameNumber][offset].info;
}
public ref Record<Key, Value>[] GetPage(long frameNumber)
{
return ref frame[frameNumber];
}
public void Dispose()
{
Array.Clear(frame, 0, frame.Length);
}
}
}

@@ -0,0 +1,255 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Threading;
using System.Diagnostics;
namespace FASTER.core
{
/// <summary>
/// Scan iterator for hybrid log
/// </summary>
public class GenericScanIterator<Key, Value> : IFasterScanIterator<Key, Value>
where Key : new()
where Value : new()
{
private readonly int frameSize;
private readonly GenericAllocator<Key, Value> hlog;
private readonly long beginAddress, endAddress;
private readonly GenericFrame<Key, Value> frame;
private readonly CountdownEvent[] loaded;
private readonly int recordSize;
private bool first = true;
private long currentAddress, nextAddress;
private Key currentKey;
private Value currentValue;
/// <summary>
/// Current address
/// </summary>
public long CurrentAddress => currentAddress;
/// <summary>
/// Constructor
/// </summary>
/// <param name="hlog"></param>
/// <param name="beginAddress"></param>
/// <param name="endAddress"></param>
/// <param name="scanBufferingMode"></param>
public unsafe GenericScanIterator(GenericAllocator<Key, Value> hlog, long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode)
{
this.hlog = hlog;
if (beginAddress == 0)
beginAddress = hlog.GetFirstValidLogicalAddress(0);
this.beginAddress = beginAddress;
this.endAddress = endAddress;
recordSize = hlog.GetRecordSize(0);
currentAddress = -1;
nextAddress = beginAddress;
if (scanBufferingMode == ScanBufferingMode.SinglePageBuffering)
frameSize = 1;
else if (scanBufferingMode == ScanBufferingMode.DoublePageBuffering)
frameSize = 2;
else if (scanBufferingMode == ScanBufferingMode.NoBuffering)
{
frameSize = 0;
return;
}
frame = new GenericFrame<Key, Value>(frameSize, hlog.PageSize);
loaded = new CountdownEvent[frameSize];
// Only load addresses flushed to disk
if (nextAddress < hlog.HeadAddress)
{
var frameNumber = (nextAddress >> hlog.LogPageSizeBits) % frameSize;
hlog.AsyncReadPagesFromDeviceToFrame
(nextAddress >> hlog.LogPageSizeBits,
1, endAddress, AsyncReadPagesCallback, Empty.Default,
frame, out loaded[frameNumber]);
}
}
/// <summary>
/// Gets reference to current key
/// </summary>
/// <returns></returns>
public ref Key GetKey()
{
return ref currentKey;
}
/// <summary>
/// Gets reference to current value
/// </summary>
/// <returns></returns>
public ref Value GetValue()
{
return ref currentValue;
}
/// <summary>
/// Get next record in iterator
/// </summary>
/// <param name="recordInfo"></param>
/// <returns></returns>
public bool GetNext(out RecordInfo recordInfo)
{
recordInfo = default(RecordInfo);
currentKey = default(Key);
currentValue = default(Value);
currentAddress = nextAddress;
while (true)
{
// Check for boundary conditions
if (currentAddress >= endAddress)
{
return false;
}
if (currentAddress < hlog.BeginAddress)
{
throw new Exception("Iterator address is less than log BeginAddress " + hlog.BeginAddress);
}
if (frameSize == 0 && currentAddress < hlog.HeadAddress)
{
throw new Exception("Iterator address is less than log HeadAddress in memory-scan mode");
}
var currentPage = currentAddress >> hlog.LogPageSizeBits;
var offset = (currentAddress & hlog.PageSizeMask) / recordSize;
if (currentAddress < hlog.HeadAddress)
BufferAndLoad(currentAddress, currentPage, currentPage % frameSize);
// Check if record fits on page, if not skip to next page
if ((currentAddress & hlog.PageSizeMask) + recordSize > hlog.PageSize)
{
currentAddress = (1 + (currentAddress >> hlog.LogPageSizeBits)) << hlog.LogPageSizeBits;
continue;
}
if (currentAddress >= hlog.HeadAddress)
{
// Read record from cached page memory
nextAddress = currentAddress + recordSize;
var page = currentPage % hlog.BufferSize;
if (hlog.values[page][offset].info.Invalid)
continue;
recordInfo = hlog.values[page][offset].info;
currentKey = hlog.values[page][offset].key;
currentValue = hlog.values[page][offset].value;
return true;
}
nextAddress = currentAddress + recordSize;
var currentFrame = currentPage % frameSize;
if (frame.GetInfo(currentFrame, offset).Invalid)
continue;
recordInfo = frame.GetInfo(currentFrame, offset);
currentKey = frame.GetKey(currentFrame, offset);
currentValue = frame.GetValue(currentFrame, offset);
return true;
}
}
/// <summary>
/// Get next record using iterator
/// </summary>
/// <param name="recordInfo"></param>
/// <param name="key"></param>
/// <param name="value"></param>
/// <returns></returns>
public bool GetNext(out RecordInfo recordInfo, out Key key, out Value value)
{
key = default(Key);
value = default(Value);
if (GetNext(out recordInfo))
{
key = currentKey;
value = currentValue;
return true;
}
return false;
}
private unsafe void BufferAndLoad(long currentAddress, long currentPage, long currentFrame)
{
if (first || (currentAddress & hlog.PageSizeMask) == 0)
{
// Prefetch pages based on buffering mode
if (frameSize == 1)
{
if (!first)
{
hlog.AsyncReadPagesFromDeviceToFrame(currentAddress >> hlog.LogPageSizeBits, 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[currentFrame]);
}
}
else
{
var endPage = endAddress >> hlog.LogPageSizeBits;
if ((endPage > currentPage) &&
((endPage > currentPage + 1) || ((endAddress & hlog.PageSizeMask) != 0)))
{
hlog.AsyncReadPagesFromDeviceToFrame(1 + (currentAddress >> hlog.LogPageSizeBits), 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[(currentPage + 1) % frameSize]);
}
}
first = false;
}
loaded[currentFrame].Wait();
}
/// <summary>
/// Dispose iterator
/// </summary>
public void Dispose()
{
if (loaded != null)
for (int i = 0; i < frameSize; i++)
loaded[i]?.Wait();
frame?.Dispose();
}
private unsafe void AsyncReadPagesCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
var result = (PageAsyncReadResult<Empty>)Overlapped.Unpack(overlap).AsyncResult;
if (result.freeBuffer1 != null)
{
hlog.PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, ref frame.GetPage(result.page % frame.frameSize));
result.freeBuffer1.Return();
}
if (result.handle != null)
{
result.handle.Signal();
}
Interlocked.MemoryBarrier();
Overlapped.Free(overlap);
}
}
}
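// Worked example (assumed values): the address arithmetic GetNext uses to
// locate a record. With LogPageSizeBits = 16 (64 KB pages, PageSizeMask =
// 0xFFFF) and a fixed 24-byte record size:
//   currentAddress = 0x12345
//   currentPage    = 0x12345 >> 16            = 1
//   offset         = (0x12345 & 0xFFFF) / 24  = 9029 / 24 = 376
// A record that would straddle the page boundary is skipped by rounding the
// address up to the next page start, exactly as the loop above does:
internal static class ScanAddressMath
{
    internal static long NextPageStart(long address, int logPageSizeBits)
        => (1 + (address >> logPageSizeBits)) << logPageSizeBits;
}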

@ -0,0 +1,69 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
namespace FASTER.core
{
/// <summary>
/// Scan buffering mode
/// </summary>
public enum ScanBufferingMode
{
/// <summary>
/// Buffer only current page being scanned
/// </summary>
SinglePageBuffering,
/// <summary>
/// Buffer current and next page in scan sequence
/// </summary>
DoublePageBuffering,
/// <summary>
/// Do not buffer - with this mode, you can only scan records already in main memory
/// </summary>
NoBuffering
}
/// <summary>
/// Scan iterator interface for FASTER log
/// </summary>
/// <typeparam name="Key"></typeparam>
/// <typeparam name="Value"></typeparam>
public interface IFasterScanIterator<Key, Value> : IDisposable
{
/// <summary>
/// Gets reference to current key
/// </summary>
/// <returns></returns>
ref Key GetKey();
/// <summary>
/// Gets reference to current value
/// </summary>
/// <returns></returns>
ref Value GetValue();
/// <summary>
/// Get next record
/// </summary>
/// <param name="recordInfo"></param>
/// <returns>True if record found, false if end of scan</returns>
bool GetNext(out RecordInfo recordInfo);
/// <summary>
/// Get next record
/// </summary>
/// <param name="recordInfo"></param>
/// <param name="key"></param>
/// <param name="value"></param>
/// <returns>True if record found, false if end of scan</returns>
bool GetNext(out RecordInfo recordInfo, out Key key, out Value value);
/// <summary>
/// Current address
/// </summary>
long CurrentAddress { get; }
}
}
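// Consumption sketch (hypothetical caller): draining a scan iterator. The
// Tombstone check is an assumption about RecordInfo's shape; only the
// iterator members used here are defined by the interface above.
internal static class ScanIteratorUsage
{
    internal static void ScanAll<Key, Value>(IFasterScanIterator<Key, Value> iter)
    {
        using (iter)   // the iterator owns frames/events and must be disposed
        {
            while (iter.GetNext(out RecordInfo info, out Key key, out Value value))
            {
                if (info.Tombstone) continue;   // skip deleted records (assumed flag)
                System.Console.WriteLine($"{iter.CurrentAddress}: {key} -> {value}");
            }
        }
    }
}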

@ -0,0 +1,656 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
#define CALLOC
using System;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Memory allocator for objects
/// </summary>
/// <typeparam name="T"></typeparam>
public unsafe class MallocFixedPageSize<T> : IDisposable
{
private const bool ForceUnpinnedAllocation = false;
private const int PageSizeBits = 16;
private const int PageSize = 1 << PageSizeBits;
private const int PageSizeMask = PageSize - 1;
private const int LevelSizeBits = 12;
private const int LevelSize = 1 << LevelSizeBits;
private const int LevelSizeMask = LevelSize - 1;
private T[][] values = new T[LevelSize][];
private GCHandle[] handles = new GCHandle[LevelSize];
private IntPtr[] pointers = new IntPtr[LevelSize];
private T[] values0;
private readonly GCHandle handles0;
private readonly IntPtr pointers0;
private readonly int RecordSize;
private readonly int AlignedPageSize;
private volatile int writeCacheLevel;
private volatile int count;
private readonly bool IsPinned;
private readonly bool ReturnPhysicalAddress;
private CountdownEvent checkpointEvent;
private readonly LightEpoch epoch;
private readonly bool ownedEpoch;
private FastThreadLocal<Queue<FreeItem>> freeList;
/// <summary>
/// Create new instance
/// </summary>
/// <param name="returnPhysicalAddress"></param>
/// <param name="epoch"></param>
public MallocFixedPageSize(bool returnPhysicalAddress = false, LightEpoch epoch = null)
{
freeList = new FastThreadLocal<Queue<FreeItem>>();
if (epoch == null)
{
this.epoch = new LightEpoch();
ownedEpoch = true;
}
else
this.epoch = epoch;
values[0] = new T[PageSize];
#if !(CALLOC)
Array.Clear(values[0], 0, PageSize);
#endif
ReturnPhysicalAddress = returnPhysicalAddress;
if (ForceUnpinnedAllocation)
{
IsPinned = false;
ReturnPhysicalAddress = false;
}
else
{
IsPinned = true;
if (default(T) == null)
{
IsPinned = false;
ReturnPhysicalAddress = false;
}
else
{
// The surefire way to check if a type is blittable
// is to try GCHandle.Alloc with a handle type of Pinned.
// If it throws an exception, we know the type is not blittable.
try
{
handles[0] = GCHandle.Alloc(values[0], GCHandleType.Pinned);
pointers[0] = handles[0].AddrOfPinnedObject();
handles0 = handles[0];
pointers0 = pointers[0];
RecordSize = Marshal.SizeOf(values[0][0]);
AlignedPageSize = RecordSize * PageSize;
}
catch (Exception)
{
IsPinned = false;
ReturnPhysicalAddress = false;
}
}
}
values0 = values[0];
writeCacheLevel = -1;
Interlocked.MemoryBarrier();
BulkAllocate(); // null pointer
}
/// <summary>
/// Get physical address
/// </summary>
/// <param name="address"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public long GetPhysicalAddress(long address)
{
if (ReturnPhysicalAddress)
{
return address;
}
else
{
return
(long)pointers[address >> PageSizeBits]
+ (long)(address & PageSizeMask) * RecordSize;
}
}
/// <summary>
/// Get object
/// </summary>
/// <param name="index"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public ref T Get(long index)
{
if (this.ReturnPhysicalAddress)
throw new Exception("Physical pointer returned by allocator: de-reference pointer to get records instead of calling Get");
return ref values
[index >> PageSizeBits]
[index & PageSizeMask];
}
/// <summary>
/// Set object
/// </summary>
/// <param name="index"></param>
/// <param name="value"></param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Set(long index, ref T value)
{
if (this.ReturnPhysicalAddress)
throw new Exception("Physical pointer returned by allocator: de-reference pointer to set records instead of calling Set (otherwise, set ForceUnpinnedAllocation to true)");
values
[index >> PageSizeBits]
[index & PageSizeMask]
= value;
}
/// <summary>
/// Free object
/// </summary>
/// <param name="pointer"></param>
/// <param name="removed_epoch"></param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void FreeAtEpoch(long pointer, int removed_epoch = -1)
{
if (!ReturnPhysicalAddress)
{
values[pointer >> PageSizeBits][pointer & PageSizeMask] = default(T);
}
freeList.InitializeThread();
if (freeList.Value == null)
freeList.Value = new Queue<FreeItem>();
freeList.Value.Enqueue(new FreeItem { removed_item = pointer, removal_epoch = removed_epoch });
}
private const int kAllocateChunkSize = 16;
/// <summary>
/// Warning: cannot mix 'n' match use of
/// Allocate and BulkAllocate
/// </summary>
/// <returns></returns>
public long BulkAllocate()
{
// Determine insertion index.
// ReSharper disable once CSharpWarnings::CS0420
#pragma warning disable 420
int index = Interlocked.Add(ref count, kAllocateChunkSize) - kAllocateChunkSize;
#pragma warning restore 420
int offset = index & PageSizeMask;
int baseAddr = index >> PageSizeBits;
// Handle indexes in first batch specially because they do not use write cache.
if (baseAddr == 0)
{
// If index 0, then allocate space for next level.
if (index == 0)
{
var tmp = new T[PageSize];
#if !(CALLOC)
Array.Clear(tmp, 0, PageSize);
#endif
if (IsPinned)
{
handles[1] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
pointers[1] = handles[1].AddrOfPinnedObject();
}
values[1] = tmp;
Interlocked.MemoryBarrier();
}
// Return location.
if (ReturnPhysicalAddress)
return (((long)pointers0) + index * RecordSize);
else
return index;
}
// See if write cache contains corresponding array.
var cache = writeCacheLevel;
T[] array;
if (cache != -1)
{
// The write cache holds the most recently allocated level; it is valid for this index only if it matches baseAddr.
if (cache == baseAddr)
{
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers[baseAddr]) + (long)offset * RecordSize;
else
return index;
}
}
// Write cache did not work, so get level information from index.
// int level = GetLevelFromIndex(index);
// Spin-wait until level has an allocated array.
var spinner = new SpinWait();
while (true)
{
array = values[baseAddr];
if (array != null)
{
break;
}
spinner.SpinOnce();
}
// Perform extra actions if inserting at offset 0 of level.
if (offset == 0)
{
// Update write cache to point to current level.
writeCacheLevel = baseAddr;
Interlocked.MemoryBarrier();
// Allocate for next page
int newBaseAddr = baseAddr + 1;
var tmp = new T[PageSize];
#if !(CALLOC)
Array.Clear(tmp, 0, PageSize);
#endif
if (IsPinned)
{
handles[newBaseAddr] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
pointers[newBaseAddr] = handles[newBaseAddr].AddrOfPinnedObject();
}
values[newBaseAddr] = tmp;
Interlocked.MemoryBarrier();
}
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers[baseAddr]) + (long)offset * RecordSize;
else
return index;
}
/// <summary>
/// Allocate
/// </summary>
/// <returns></returns>
public long Allocate()
{
freeList.InitializeThread();
if (freeList.Value == null)
{
freeList.Value = new Queue<FreeItem>();
}
if (freeList.Value.Count > 0)
{
if (freeList.Value.Peek().removal_epoch <= epoch.SafeToReclaimEpoch)
return freeList.Value.Dequeue().removed_item;
//if (freeList.Count % 64 == 0)
// LightEpoch.Instance.BumpCurrentEpoch();
}
// Determine insertion index.
// ReSharper disable once CSharpWarnings::CS0420
#pragma warning disable 420
int index = Interlocked.Increment(ref count) - 1;
#pragma warning restore 420
int offset = index & PageSizeMask;
int baseAddr = index >> PageSizeBits;
// Handle indexes in first batch specially because they do not use write cache.
if (baseAddr == 0)
{
// If index 0, then allocate space for next level.
if (index == 0)
{
var tmp = new T[PageSize];
#if !(CALLOC)
Array.Clear(tmp, 0, PageSize);
#endif
if (IsPinned)
{
handles[1] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
pointers[1] = handles[1].AddrOfPinnedObject();
}
values[1] = tmp;
Interlocked.MemoryBarrier();
}
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers0) + index * RecordSize;
else
return index;
}
// See if write cache contains corresponding array.
var cache = writeCacheLevel;
T[] array;
if (cache != -1)
{
// The write cache holds the most recently allocated level; it is valid for this index only if it matches baseAddr.
if (cache == baseAddr)
{
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers[baseAddr]) + (long)offset * RecordSize;
else
return index;
}
}
// Write cache did not work, so get level information from index.
// int level = GetLevelFromIndex(index);
// Spin-wait until level has an allocated array.
var spinner = new SpinWait();
while (true)
{
array = values[baseAddr];
if (array != null)
{
break;
}
spinner.SpinOnce();
}
// Perform extra actions if inserting at offset 0 of level.
if (offset == 0)
{
// Update write cache to point to current level.
writeCacheLevel = baseAddr;
Interlocked.MemoryBarrier();
// Allocate for next page
int newBaseAddr = baseAddr + 1;
var tmp = new T[PageSize];
#if !(CALLOC)
Array.Clear(tmp, 0, PageSize);
#endif
if (IsPinned)
{
handles[newBaseAddr] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
pointers[newBaseAddr] = handles[newBaseAddr].AddrOfPinnedObject();
}
values[newBaseAddr] = tmp;
Interlocked.MemoryBarrier();
}
// Return location.
if (ReturnPhysicalAddress)
return ((long)pointers[baseAddr]) + (long)offset * RecordSize;
else
return index;
}
/// <summary>
/// Acquire thread
/// </summary>
public void Acquire()
{
if (ownedEpoch)
epoch.Acquire();
freeList.InitializeThread();
}
/// <summary>
/// Release thread
/// </summary>
public void Release()
{
if (ownedEpoch)
epoch.Release();
freeList.DisposeThread();
}
/// <summary>
/// Dispose
/// </summary>
public void Dispose()
{
for (int i = 0; i < values.Length; i++)
{
if (IsPinned && (handles[i].IsAllocated)) handles[i].Free();
values[i] = null;
}
handles = null;
pointers = null;
values = null;
values0 = null;
count = 0;
if (ownedEpoch)
epoch.Dispose();
freeList.Dispose();
}
#region Checkpoint
/// <summary>
/// Public facing persistence API
/// </summary>
/// <param name="device"></param>
/// <param name="start_offset"></param>
/// <param name="numBytes"></param>
public void TakeCheckpoint(IDevice device, ulong start_offset, out ulong numBytes)
{
BeginCheckpoint(device, start_offset, out numBytes);
}
/// <summary>
/// Is checkpoint complete
/// </summary>
/// <param name="waitUntilComplete"></param>
/// <returns></returns>
public bool IsCheckpointCompleted(bool waitUntilComplete = false)
{
bool completed = checkpointEvent.IsSet;
if (!completed && waitUntilComplete)
{
checkpointEvent.Wait();
return true;
}
return completed;
}
internal void BeginCheckpoint(IDevice device, ulong offset, out ulong numBytesWritten)
{
int localCount = count;
int recordsCountInLastLevel = localCount & PageSizeMask;
int numCompleteLevels = localCount >> PageSizeBits;
int numLevels = numCompleteLevels + (recordsCountInLastLevel > 0 ? 1 : 0);
checkpointEvent = new CountdownEvent(numLevels);
uint alignedPageSize = PageSize * (uint)RecordSize;
uint lastLevelSize = (uint)recordsCountInLastLevel * (uint)RecordSize;
int sectorSize = (int)device.SectorSize;
numBytesWritten = 0;
for (int i = 0; i < numLevels; i++)
{
OverflowPagesFlushAsyncResult result = default(OverflowPagesFlushAsyncResult);
uint writeSize = (uint)((i == numCompleteLevels) ? (lastLevelSize + (sectorSize - 1)) & ~(sectorSize - 1) : alignedPageSize);
device.WriteAsync(pointers[i], offset + numBytesWritten, writeSize, AsyncFlushCallback, result);
numBytesWritten += writeSize;
}
}
private void AsyncFlushCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
try
{
if (errorCode != 0)
{
System.Diagnostics.Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
}
catch (Exception ex)
{
System.Diagnostics.Trace.TraceError("Completion Callback error, {0}", ex.Message);
}
finally
{
checkpointEvent.Signal();
Overlapped.Free(overlap);
}
}
/// <summary>
/// Max valid address
/// </summary>
/// <returns></returns>
public int GetMaxValidAddress()
{
return count;
}
/// <summary>
/// Get page size
/// </summary>
/// <returns></returns>
public int GetPageSize()
{
return PageSize;
}
#endregion
#region Recover
/// <summary>
/// Recover
/// </summary>
/// <param name="device"></param>
/// <param name="buckets"></param>
/// <param name="numBytes"></param>
/// <param name="offset"></param>
public void Recover(IDevice device, ulong offset, int buckets, ulong numBytes)
{
BeginRecovery(device, offset, buckets, numBytes, out ulong numBytesRead);
}
/// <summary>
/// Check if recovery complete
/// </summary>
/// <param name="waitUntilComplete"></param>
/// <returns></returns>
public bool IsRecoveryCompleted(bool waitUntilComplete = false)
{
bool completed = (numLevelsToBeRecovered == 0);
if (!completed && waitUntilComplete)
{
while (numLevelsToBeRecovered != 0)
{
Thread.Sleep(10);
}
return true; // recovery finished while we waited, mirroring IsCheckpointCompleted
}
return completed;
}
// Implementation of asynchronous recovery
private int numLevelsToBeRecovered;
internal void BeginRecovery(IDevice device,
ulong offset,
int buckets,
ulong numBytesToRead,
out ulong numBytesRead)
{
// Allocate as many records in memory
while (count < buckets)
{
Allocate();
}
int numRecords = (int)numBytesToRead / RecordSize;
int recordsCountInLastLevel = numRecords & PageSizeMask;
int numCompleteLevels = numRecords >> PageSizeBits;
int numLevels = numCompleteLevels + (recordsCountInLastLevel > 0 ? 1 : 0);
numLevelsToBeRecovered = numLevels;
numBytesRead = 0;
uint alignedPageSize = (uint)PageSize * (uint)RecordSize;
uint lastLevelSize = (uint)recordsCountInLastLevel * (uint)RecordSize;
for (int i = 0; i < numLevels; i++)
{
// Read a full page
uint length = (uint)PageSize * (uint)RecordSize;
OverflowPagesReadAsyncResult result = default(OverflowPagesReadAsyncResult);
device.ReadAsync(offset + numBytesRead, pointers[i], length, AsyncPageReadCallback, result);
numBytesRead += (i == numCompleteLevels) ? lastLevelSize : alignedPageSize;
}
}
private void AsyncPageReadCallback(
uint errorCode,
uint numBytes,
NativeOverlapped* overlap)
{
try
{
if (errorCode != 0)
{
System.Diagnostics.Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
}
catch (Exception ex)
{
System.Diagnostics.Trace.TraceError("Completion Callback error, {0}", ex.Message);
}
finally
{
Interlocked.Decrement(ref numLevelsToBeRecovered);
Overlapped.Free(overlap);
}
}
#endregion
}
internal struct FreeItem
{
public long removed_item;
public int removal_epoch;
}
}
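// Illustrative usage sketch (hypothetical): logical-index mode. Get/Set are
// only valid when returnPhysicalAddress is false, as the guards in the class
// above enforce; freed slots are recycled once the epoch makes them safe.
internal static class MallocFixedPageSizeUsage
{
    internal static void Demo()
    {
        using (var alloc = new MallocFixedPageSize<long>())
        {
            alloc.Acquire();                 // register this thread with the epoch
            long idx = alloc.Allocate();
            long v = 123;
            alloc.Set(idx, ref v);
            long roundTrip = alloc.Get(idx); // 123
            alloc.FreeAtEpoch(idx);          // reuse deferred until epoch-safe
            alloc.Release();
        }
    }
}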

@ -0,0 +1,56 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Threading;
namespace FASTER.core
{
class PendingFlushList
{
const int maxSize = 8;
const int maxRetries = 10;
public PageAsyncFlushResult<Empty>[] list;
public PendingFlushList()
{
list = new PageAsyncFlushResult<Empty>[maxSize];
}
public void Add(PageAsyncFlushResult<Empty> t)
{
int retries = 0;
do
{
for (int i = 0; i < maxSize; i++)
{
if (list[i] == default)
{
if (Interlocked.CompareExchange(ref list[i], t, default) == default)
{
return;
}
}
}
} while (retries++ < maxRetries);
throw new Exception("Unable to add item to list");
}
public bool RemoveAdjacent(long address, out PageAsyncFlushResult<Empty> request)
{
for (int i=0; i<maxSize; i++)
{
request = list[i];
if (request?.fromAddress == address)
{
if (Interlocked.CompareExchange(ref list[i], null, request) == request)
{
return true;
}
}
}
request = null;
return false;
}
}
}
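// Illustrative usage sketch (hypothetical): the list is a small CAS-scanned
// array, so Add throws after maxRetries full passes, and RemoveAdjacent
// matches a pending flush by its fromAddress.
internal static class PendingFlushListUsage
{
    internal static void Demo()
    {
        var pending = new PendingFlushList();
        // Assumption: fromAddress is settable via an initializer, as the
        // RemoveAdjacent comparison above implies.
        var flush = new PageAsyncFlushResult<Empty> { fromAddress = 4096 };
        pending.Add(flush);
        if (pending.RemoveAdjacent(4096, out var request))
        {
            // 'request' is the same instance; issue the adjacent flush here.
        }
    }
}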

@ -0,0 +1,504 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Runtime.InteropServices;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq.Expressions;
using System.IO;
using System.Diagnostics;
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
namespace FASTER.core
{
public unsafe sealed class VariableLengthBlittableAllocator<Key, Value> : AllocatorBase<Key, Value>
where Key : new()
where Value : new()
{
public const int kRecordAlignment = 8; // RecordInfo has a long field, so it should be aligned to 8-bytes
// Circular buffer definition
private byte[][] values;
private GCHandle[] handles;
private long[] pointers;
private readonly GCHandle ptrHandle;
private readonly long* nativePointers;
private readonly bool fixedSizeKey;
private readonly bool fixedSizeValue;
internal readonly IVariableLengthStruct<Key> KeyLength;
internal readonly IVariableLengthStruct<Value> ValueLength;
public VariableLengthBlittableAllocator(LogSettings settings, VariableLengthStructSettings<Key, Value> vlSettings, IFasterEqualityComparer<Key> comparer, Action<long, long> evictCallback = null, LightEpoch epoch = null, Action<CommitInfo> flushCallback = null)
: base(settings, comparer, evictCallback, epoch, flushCallback)
{
values = new byte[BufferSize][];
handles = new GCHandle[BufferSize];
pointers = new long[BufferSize];
ptrHandle = GCHandle.Alloc(pointers, GCHandleType.Pinned);
nativePointers = (long*)ptrHandle.AddrOfPinnedObject();
KeyLength = vlSettings.keyLength;
ValueLength = vlSettings.valueLength;
if (KeyLength == null)
{
fixedSizeKey = true;
KeyLength = new FixedLengthStruct<Key>();
}
if (ValueLength == null)
{
fixedSizeValue = true;
ValueLength = new FixedLengthStruct<Value>();
}
}
public override void Initialize()
{
Initialize(Constants.kFirstValidAddress);
}
public override ref RecordInfo GetInfo(long physicalAddress)
{
return ref Unsafe.AsRef<RecordInfo>((void*)physicalAddress);
}
public override ref RecordInfo GetInfoFromBytePointer(byte* ptr)
{
return ref Unsafe.AsRef<RecordInfo>(ptr);
}
public override ref Key GetKey(long physicalAddress)
{
return ref Unsafe.AsRef<Key>((byte*)physicalAddress + RecordInfo.GetLength());
}
public override ref Value GetValue(long physicalAddress)
{
return ref Unsafe.AsRef<Value>((byte*)physicalAddress + RecordInfo.GetLength() + KeySize(physicalAddress));
}
private int KeySize(long physicalAddress)
{
return KeyLength.GetLength(ref GetKey(physicalAddress));
}
private int ValueSize(long physicalAddress)
{
return ValueLength.GetLength(ref GetValue(physicalAddress));
}
public override int GetRecordSize(long physicalAddress)
{
ref var recordInfo = ref GetInfo(physicalAddress);
if (recordInfo.IsNull())
return RecordInfo.GetLength();
var size = RecordInfo.GetLength() + KeySize(physicalAddress) + ValueSize(physicalAddress);
size = (size + kRecordAlignment - 1) & (~(kRecordAlignment - 1));
return size;
}
public override int GetRequiredRecordSize(long physicalAddress, int availableBytes)
{
// We need at least [record size] + [average key size] + [average value size]
var reqBytes = GetAverageRecordSize();
if (availableBytes < reqBytes)
{
return reqBytes;
}
// We need at least [record size] + [actual key size] + [average value size]
reqBytes = RecordInfo.GetLength() + KeySize(physicalAddress) + ValueLength.GetAverageLength();
if (availableBytes < reqBytes)
{
return reqBytes;
}
// We need at least [record size] + [actual key size] + [actual value size]
reqBytes = RecordInfo.GetLength() + KeySize(physicalAddress) + ValueSize(physicalAddress);
reqBytes = (reqBytes + kRecordAlignment - 1) & (~(kRecordAlignment - 1));
return reqBytes;
}
public override int GetAverageRecordSize()
{
return RecordInfo.GetLength() +
kRecordAlignment +
KeyLength.GetAverageLength() +
ValueLength.GetAverageLength();
}
public override int GetInitialRecordSize<TInput>(ref Key key, ref TInput input)
{
var actualSize = RecordInfo.GetLength() +
KeyLength.GetLength(ref key) +
ValueLength.GetInitialLength(ref input);
return (actualSize + kRecordAlignment - 1) & (~(kRecordAlignment - 1));
}
public override int GetRecordSize(ref Key key, ref Value value)
{
var actualSize = RecordInfo.GetLength() +
KeyLength.GetLength(ref key) +
ValueLength.GetLength(ref value);
return (actualSize + kRecordAlignment - 1) & (~(kRecordAlignment - 1));
}
public override void ShallowCopy(ref Key src, ref Key dst)
{
Buffer.MemoryCopy(
Unsafe.AsPointer(ref src),
Unsafe.AsPointer(ref dst),
KeyLength.GetLength(ref src),
KeyLength.GetLength(ref src));
}
public override void ShallowCopy(ref Value src, ref Value dst)
{
Buffer.MemoryCopy(
Unsafe.AsPointer(ref src),
Unsafe.AsPointer(ref dst),
ValueLength.GetLength(ref src),
ValueLength.GetLength(ref src));
}
/// <summary>
/// Dispose memory allocator
/// </summary>
public override void Dispose()
{
if (values != null)
{
for (int i = 0; i < values.Length; i++)
{
if (handles[i].IsAllocated)
handles[i].Free();
values[i] = null;
}
}
handles = null;
pointers = null;
values = null;
base.Dispose();
}
public override AddressInfo* GetKeyAddressInfo(long physicalAddress)
{
throw new NotSupportedException();
}
public override AddressInfo* GetValueAddressInfo(long physicalAddress)
{
throw new NotSupportedException();
}
/// <summary>
/// Allocate a memory page, pinned in memory and sector-aligned, where possible
/// </summary>
/// <param name="index"></param>
internal override void AllocatePage(int index)
{
var adjustedSize = PageSize + 2 * sectorSize;
byte[] tmp = new byte[adjustedSize];
Array.Clear(tmp, 0, adjustedSize);
handles[index] = GCHandle.Alloc(tmp, GCHandleType.Pinned);
long p = (long)handles[index].AddrOfPinnedObject();
pointers[index] = (p + (sectorSize - 1)) & ~(sectorSize - 1);
values[index] = tmp;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override long GetPhysicalAddress(long logicalAddress)
{
// Offset within page
int offset = (int)(logicalAddress & ((1L << LogPageSizeBits) - 1));
// Index of page within the circular buffer
int pageIndex = (int)((logicalAddress >> LogPageSizeBits) & (BufferSize - 1));
return *(nativePointers + pageIndex) + offset;
}
protected override bool IsAllocated(int pageIndex)
{
return values[pageIndex] != null;
}
protected override void WriteAsync<TContext>(long flushPage, IOCompletionCallback callback, PageAsyncFlushResult<TContext> asyncResult)
{
WriteAsync((IntPtr)pointers[flushPage % BufferSize],
(ulong)(AlignedPageSizeBytes * flushPage),
(uint)AlignedPageSizeBytes,
callback,
asyncResult, device);
}
protected override void WriteAsyncToDevice<TContext>
(long startPage, long flushPage, int pageSize, IOCompletionCallback callback,
PageAsyncFlushResult<TContext> asyncResult, IDevice device, IDevice objectLogDevice)
{
var alignedPageSize = (pageSize + (sectorSize - 1)) & ~(sectorSize - 1);
WriteAsync((IntPtr)pointers[flushPage % BufferSize],
(ulong)(AlignedPageSizeBytes * (flushPage - startPage)),
(uint)alignedPageSize, callback, asyncResult,
device);
}
/// <summary>
/// Get start logical address
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public override long GetStartLogicalAddress(long page)
{
return page << LogPageSizeBits;
}
/// <summary>
/// Get first valid logical address
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public override long GetFirstValidLogicalAddress(long page)
{
if (page == 0)
return (page << LogPageSizeBits) + Constants.kFirstValidAddress;
return page << LogPageSizeBits;
}
protected override void ClearPage(long page, int offset)
{
if (offset == 0)
Array.Clear(values[page % BufferSize], offset, values[page % BufferSize].Length - offset);
else
{
// Adjust array offset for cache alignment
offset += (int)(pointers[page % BufferSize] - (long)handles[page % BufferSize].AddrOfPinnedObject());
Array.Clear(values[page % BufferSize], offset, values[page % BufferSize].Length - offset);
}
}
/// <summary>
/// Delete in-memory portion of the log
/// </summary>
internal override void DeleteFromMemory()
{
for (int i = 0; i < values.Length; i++)
{
if (handles[i].IsAllocated)
handles[i].Free();
values[i] = null;
}
handles = null;
pointers = null;
values = null;
}
private void WriteAsync<TContext>(IntPtr alignedSourceAddress, ulong alignedDestinationAddress, uint numBytesToWrite,
IOCompletionCallback callback, PageAsyncFlushResult<TContext> asyncResult,
IDevice device)
{
if (asyncResult.partial)
{
// Write only required bytes within the page
int aligned_start = (int)((asyncResult.fromAddress - (asyncResult.page << LogPageSizeBits)));
aligned_start = (aligned_start / sectorSize) * sectorSize;
int aligned_end = (int)((asyncResult.untilAddress - (asyncResult.page << LogPageSizeBits)));
aligned_end = ((aligned_end + (sectorSize - 1)) & ~(sectorSize - 1));
numBytesToWrite = (uint)(aligned_end - aligned_start);
device.WriteAsync(alignedSourceAddress + aligned_start, alignedDestinationAddress + (ulong)aligned_start, numBytesToWrite, callback, asyncResult);
}
else
{
device.WriteAsync(alignedSourceAddress, alignedDestinationAddress,
numBytesToWrite, callback, asyncResult);
}
}
protected override void ReadAsync<TContext>(
ulong alignedSourceAddress, int destinationPageIndex, uint aligned_read_length,
IOCompletionCallback callback, PageAsyncReadResult<TContext> asyncResult, IDevice device, IDevice objlogDevice)
{
device.ReadAsync(alignedSourceAddress, (IntPtr)pointers[destinationPageIndex],
aligned_read_length, callback, asyncResult);
}
/// <summary>
/// Invoked by users to obtain a record from disk. It uses sector aligned memory to read
/// the record efficiently into memory.
/// </summary>
/// <param name="fromLogical"></param>
/// <param name="numBytes"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="result"></param>
protected override void AsyncReadRecordObjectsToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext<Key, Value> context, SectorAlignedMemory result = default(SectorAlignedMemory))
{
throw new InvalidOperationException("AsyncReadRecordObjectsToMemory invalid for BlittableAllocator");
}
/// <summary>
/// Retrieve objects from object log
/// </summary>
/// <param name="record"></param>
/// <param name="ctx"></param>
/// <returns></returns>
protected override bool RetrievedFullRecord(byte* record, ref AsyncIOContext<Key, Value> ctx)
{
return true;
}
public override ref Key GetContextRecordKey(ref AsyncIOContext<Key, Value> ctx)
{
return ref GetKey((long)ctx.record.GetValidPointer());
}
public override ref Value GetContextRecordValue(ref AsyncIOContext<Key, Value> ctx)
{
return ref GetValue((long)ctx.record.GetValidPointer());
}
public override IHeapContainer<Key> GetKeyContainer(ref Key key)
{
if (fixedSizeKey) return new StandardHeapContainer<Key>(ref key);
else return new VarLenHeapContainer<Key>(ref key, KeyLength, bufferPool);
}
public override IHeapContainer<Value> GetValueContainer(ref Value value)
{
if (fixedSizeValue) return new StandardHeapContainer<Value>(ref value);
else return new VarLenHeapContainer<Value>(ref value, ValueLength, bufferPool);
}
/// <summary>
/// Whether KVS has keys to serialize/deserialize
/// </summary>
/// <returns></returns>
public override bool KeyHasObjects()
{
return false;
}
/// <summary>
/// Whether KVS has values to serialize/deserialize
/// </summary>
/// <returns></returns>
public override bool ValueHasObjects()
{
return false;
}
public override long[] GetSegmentOffsets()
{
return null;
}
internal override void PopulatePage(byte* src, int required_bytes, long destinationPage)
{
throw new Exception("BlittableAllocator memory pages are sector aligned - use direct copy");
// Buffer.MemoryCopy(src, (void*)pointers[destinationPage % BufferSize], required_bytes, required_bytes);
}
/// <summary>
/// Iterator interface for scanning FASTER log
/// </summary>
/// <param name="beginAddress"></param>
/// <param name="endAddress"></param>
/// <param name="scanBufferingMode"></param>
/// <returns></returns>
public override IFasterScanIterator<Key, Value> Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode)
{
return new VariableLengthBlittableScanIterator<Key, Value>(this, beginAddress, endAddress, scanBufferingMode);
}
/// <summary>
/// Read pages from specified device
/// </summary>
/// <typeparam name="TContext"></typeparam>
/// <param name="readPageStart"></param>
/// <param name="numPages"></param>
/// <param name="untilAddress"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="frame"></param>
/// <param name="completed"></param>
/// <param name="devicePageOffset"></param>
/// <param name="device"></param>
/// <param name="objectLogDevice"></param>
internal void AsyncReadPagesFromDeviceToFrame<TContext>(
long readPageStart,
int numPages,
long untilAddress,
IOCompletionCallback callback,
TContext context,
BlittableFrame frame,
out CountdownEvent completed,
long devicePageOffset = 0,
IDevice device = null, IDevice objectLogDevice = null)
{
var usedDevice = device;
IDevice usedObjlogDevice = objectLogDevice;
if (device == null)
{
usedDevice = this.device;
}
completed = new CountdownEvent(numPages);
for (long readPage = readPageStart; readPage < (readPageStart + numPages); readPage++)
{
int pageIndex = (int)(readPage % frame.frameSize);
if (frame.frame[pageIndex] == null)
{
frame.Allocate(pageIndex);
}
else
{
frame.Clear(pageIndex);
}
var asyncResult = new PageAsyncReadResult<TContext>()
{
page = readPage,
context = context,
handle = completed,
frame = frame
};
ulong offsetInFile = (ulong)(AlignedPageSizeBytes * readPage);
uint readLength = (uint)AlignedPageSizeBytes;
long adjustedUntilAddress = (AlignedPageSizeBytes * (untilAddress >> LogPageSizeBits) + (untilAddress & PageSizeMask));
if (adjustedUntilAddress > 0 && ((adjustedUntilAddress - (long)offsetInFile) < PageSize))
{
readLength = (uint)(adjustedUntilAddress - (long)offsetInFile);
readLength = (uint)((readLength + (sectorSize - 1)) & ~(sectorSize - 1));
}
if (device != null)
offsetInFile = (ulong)(AlignedPageSizeBytes * (readPage - devicePageOffset));
usedDevice.ReadAsync(offsetInFile, (IntPtr)frame.pointers[pageIndex], readLength, callback, asyncResult);
}
}
}
}
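// Worked example: the 8-byte alignment used by GetRecordSize above. A record
// is [RecordInfo][key][value], rounded up to kRecordAlignment so the next
// record's RecordInfo (which holds a long) stays 8-byte aligned. Assuming an
// 8-byte RecordInfo, a 13-byte key and a 10-byte value:
//   raw  = 8 + 13 + 10             = 31
//   size = (31 + 8 - 1) & ~(8 - 1) = 32
internal static class RecordAlignmentMath
{
    internal static int Align8(int size) => (size + 7) & ~7;
    // Align8(31) == 32, Align8(32) == 32, Align8(33) == 40
}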

@ -0,0 +1,228 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Threading;
using System.Diagnostics;
namespace FASTER.core
{
/// <summary>
/// Scan iterator for hybrid log
/// </summary>
public class VariableLengthBlittableScanIterator<Key, Value> : IFasterScanIterator<Key, Value>
where Key : new()
where Value : new()
{
private readonly int frameSize;
private readonly VariableLengthBlittableAllocator<Key, Value> hlog;
private readonly long beginAddress, endAddress;
private readonly BlittableFrame frame;
private readonly CountdownEvent[] loaded;
private bool first = true;
private long currentAddress, nextAddress;
private long currentPhysicalAddress;
/// <summary>
/// Current address
/// </summary>
public long CurrentAddress => currentAddress;
/// <summary>
/// Constructor
/// </summary>
/// <param name="hlog"></param>
/// <param name="beginAddress"></param>
/// <param name="endAddress"></param>
/// <param name="scanBufferingMode"></param>
public unsafe VariableLengthBlittableScanIterator(VariableLengthBlittableAllocator<Key, Value> hlog, long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode)
{
this.hlog = hlog;
if (beginAddress == 0)
beginAddress = hlog.GetFirstValidLogicalAddress(0);
this.beginAddress = beginAddress;
this.endAddress = endAddress;
currentAddress = -1;
nextAddress = beginAddress;
if (scanBufferingMode == ScanBufferingMode.SinglePageBuffering)
frameSize = 1;
else if (scanBufferingMode == ScanBufferingMode.DoublePageBuffering)
frameSize = 2;
else if (scanBufferingMode == ScanBufferingMode.NoBuffering)
{
frameSize = 0;
return;
}
frame = new BlittableFrame(frameSize, hlog.PageSize, hlog.GetDeviceSectorSize());
loaded = new CountdownEvent[frameSize];
// Only load addresses flushed to disk
if (nextAddress < hlog.HeadAddress)
{
var frameNumber = (nextAddress >> hlog.LogPageSizeBits) % frameSize;
hlog.AsyncReadPagesFromDeviceToFrame
(nextAddress >> hlog.LogPageSizeBits,
1, endAddress, AsyncReadPagesCallback, Empty.Default,
frame, out loaded[frameNumber]);
}
}
/// <summary>
/// Gets reference to current key
/// </summary>
/// <returns></returns>
public ref Key GetKey()
{
return ref hlog.GetKey(currentPhysicalAddress);
}
/// <summary>
/// Gets reference to current value
/// </summary>
/// <returns></returns>
public ref Value GetValue()
{
return ref hlog.GetValue(currentPhysicalAddress);
}
/// <summary>
/// Get next record in iterator
/// </summary>
/// <param name="recordInfo"></param>
/// <returns></returns>
public bool GetNext(out RecordInfo recordInfo)
{
recordInfo = default(RecordInfo);
currentAddress = nextAddress;
while (true)
{
// Check for boundary conditions
if (currentAddress >= endAddress)
{
return false;
}
if (currentAddress < hlog.BeginAddress)
{
throw new Exception("Iterator address is less than log BeginAddress " + hlog.BeginAddress);
}
if (frameSize == 0 && currentAddress < hlog.HeadAddress)
{
throw new Exception("Iterator address is less than log HeadAddress in memory-scan mode");
}
var currentPage = currentAddress >> hlog.LogPageSizeBits;
var offset = currentAddress & hlog.PageSizeMask;
if (currentAddress < hlog.HeadAddress)
BufferAndLoad(currentAddress, currentPage, currentPage % frameSize);
var physicalAddress = default(long);
if (currentAddress >= hlog.HeadAddress)
physicalAddress = hlog.GetPhysicalAddress(currentAddress);
else
physicalAddress = frame.GetPhysicalAddress(currentPage % frameSize, offset);
// Check if record fits on page, if not skip to next page
var recordSize = hlog.GetRecordSize(physicalAddress);
if ((currentAddress & hlog.PageSizeMask) + recordSize > hlog.PageSize)
{
currentAddress = (1 + (currentAddress >> hlog.LogPageSizeBits)) << hlog.LogPageSizeBits;
continue;
}
ref var info = ref hlog.GetInfo(physicalAddress);
if (info.Invalid || info.IsNull())
{
currentAddress += recordSize;
continue;
}
currentPhysicalAddress = physicalAddress;
recordInfo = info;
nextAddress = currentAddress + recordSize;
return true;
}
}
/// <summary>
/// Get next record in iterator
/// </summary>
/// <param name="recordInfo"></param>
/// <param name="key"></param>
/// <param name="value"></param>
/// <returns></returns>
public bool GetNext(out RecordInfo recordInfo, out Key key, out Value value)
{
throw new NotSupportedException("Use GetNext(out RecordInfo) to retrieve references to key/value");
}
/// <summary>
/// Dispose the iterator
/// </summary>
public void Dispose()
{
frame?.Dispose();
}
private unsafe void BufferAndLoad(long currentAddress, long currentPage, long currentFrame)
{
if (first || (currentAddress & hlog.PageSizeMask) == 0)
{
// Prefetch pages based on buffering mode
if (frameSize == 1)
{
if (!first)
{
hlog.AsyncReadPagesFromDeviceToFrame(currentAddress >> hlog.LogPageSizeBits, 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[currentFrame]);
}
}
else
{
var endPage = endAddress >> hlog.LogPageSizeBits;
if ((endPage > currentPage) &&
((endPage > currentPage + 1) || ((endAddress & hlog.PageSizeMask) != 0)))
{
hlog.AsyncReadPagesFromDeviceToFrame(1 + (currentAddress >> hlog.LogPageSizeBits), 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[(currentPage + 1) % frameSize]);
}
}
first = false;
}
loaded[currentFrame].Wait();
}
private unsafe void AsyncReadPagesCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
var result = (PageAsyncReadResult<Empty>)Overlapped.Unpack(overlap).AsyncResult;
if (result.freeBuffer1 != null)
{
hlog.PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, result.page);
result.freeBuffer1.Return();
result.freeBuffer1 = null;
}
if (result.handle != null)
{
result.handle.Signal();
}
Interlocked.MemoryBarrier();
Overlapped.Free(overlap);
}
}
}
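// Consumption sketch (hypothetical caller): with variable-length records the
// three-argument GetNext throws (see above); the supported pattern is
// GetNext(out info) followed by GetKey()/GetValue(). The returned references
// point into page or frame memory, so copy values out if they must outlive
// the iteration (an assumption consistent with the buffering above).
internal static class VarLenScanUsage
{
    internal static void ScanAll<Key, Value>(VariableLengthBlittableScanIterator<Key, Value> iter)
        where Key : new()
        where Value : new()
    {
        while (iter.GetNext(out RecordInfo info))
        {
            ref var key = ref iter.GetKey();
            ref var value = ref iter.GetValue();
            // ... process by reference here ...
        }
        iter.Dispose();
    }
}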

@ -0,0 +1,52 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.IO;
using System.Linq.Expressions;
using System.Runtime.InteropServices;
namespace FASTER.core
{
/// <summary>
/// Factory to create FASTER objects
/// </summary>
public static class Devices
{
/// <summary>
/// This value is supplied for capacity when the device does not have a specified limit.
/// </summary>
public const long CAPACITY_UNSPECIFIED = -1;
private const string EMULATED_STORAGE_STRING = "UseDevelopmentStorage=true;";
private const string TEST_CONTAINER = "test";
/// <summary>
/// Create a storage device for the log
/// </summary>
/// <param name="logPath">Path to file that will store the log (empty for null device)</param>
/// <param name="preallocateFile">Whether we try to preallocate the file on creation</param>
/// <param name="deleteOnClose">Delete files on close</param>
/// <param name="capacity">The maximal number of bytes this storage device can accommondate, or CAPACITY_UNSPECIFIED if there is no such limit</param>
/// <param name="recoverDevice">Whether to recover device metadata from existing files</param>
/// <returns>Device instance</returns>
public static IDevice CreateLogDevice(string logPath, bool preallocateFile = true, bool deleteOnClose = false, long capacity = CAPACITY_UNSPECIFIED, bool recoverDevice = false)
{
if (string.IsNullOrWhiteSpace(logPath))
return new NullDevice();
IDevice logDevice;
#if DOTNETCORE
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
logDevice = new ManagedLocalStorageDevice(logPath, preallocateFile, deleteOnClose, capacity, recoverDevice);
}
else
#endif
{
logDevice = new LocalStorageDevice(logPath, preallocateFile, deleteOnClose, true, capacity, recoverDevice);
}
return logDevice;
}
}
}
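// Usage sketch: "hlog.log" is an illustrative path. On non-Windows .NET Core
// builds this transparently returns a ManagedLocalStorageDevice, per the
// #if DOTNETCORE branch above.
internal static class DevicesUsage
{
    internal static void Demo()
    {
        IDevice log = Devices.CreateLogDevice("hlog.log", deleteOnClose: true);
        // ... hand 'log' to the allocator via LogSettings, run the store ...
        log.Close();
    }
}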

@ -0,0 +1,161 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Interface for devices
/// </summary>
public interface IDevice
{
/// <summary>
/// Size of sector
/// </summary>
uint SectorSize { get; }
/// <summary>
/// Name of device
/// </summary>
string FileName { get; }
/// <summary>
/// Returns the maximum capacity of the storage device, in number of bytes.
/// If CAPACITY_UNSPECIFIED is returned, the storage device has no specified capacity limit.
/// </summary>
long Capacity { get; }
/// <summary>
/// A device breaks up each logical log into multiple self-contained segments that are of the same size.
/// Each segment is an atomic unit of data that cannot be partially present on a device (i.e. either the entire segment
/// is present or no data from the segment is present). Examples of this include files or named blobs. This
/// property returns the size of each segment.
/// </summary>
long SegmentSize { get; }
/// <summary>
/// The index of the first segment present on this device
/// </summary>
int StartSegment { get; }
/// <summary>
/// The index of the last segment present on this device
/// </summary>
int EndSegment { get; }
/// <summary>
/// Initialize device. This function is used to pass optional information that may only be known after
/// FASTER initialization (whose constructor takes in IDevice upfront). Implementations are free to
/// ignore the supplied information if they do not need it.
///
/// This is a bit of a hack.
/// </summary>
/// <param name="segmentSize"></param>
/// <param name="epoch">
/// The instance of the epoch protection framework to use, if needed
/// </param>
void Initialize(long segmentSize, LightEpoch epoch = null);
/* Segmented addressing API */
/// <summary>
/// Write
/// </summary>
/// <param name="sourceAddress"></param>
/// <param name="segmentId"></param>
/// <param name="destinationAddress"></param>
/// <param name="numBytesToWrite"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
void WriteAsync(IntPtr sourceAddress, int segmentId, ulong destinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult);
/// <summary>
/// Read
/// </summary>
/// <param name="segmentId"></param>
/// <param name="sourceAddress"></param>
/// <param name="destinationAddress"></param>
/// <param name="readLength"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
void ReadAsync(int segmentId, ulong sourceAddress, IntPtr destinationAddress, uint readLength, IOCompletionCallback callback, IAsyncResult asyncResult);
/* Direct addressing API */
/// <summary>
/// Write
/// </summary>
/// <param name="alignedSourceAddress"></param>
/// <param name="alignedDestinationAddress"></param>
/// <param name="numBytesToWrite"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
void WriteAsync(IntPtr alignedSourceAddress, ulong alignedDestinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult);
/// <summary>
/// Read
/// </summary>
/// <param name="alignedSourceAddress"></param>
/// <param name="alignedDestinationAddress"></param>
/// <param name="aligned_read_length"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
void ReadAsync(ulong alignedSourceAddress, IntPtr alignedDestinationAddress, uint aligned_read_length, IOCompletionCallback callback, IAsyncResult asyncResult);
/// <summary>
/// Truncates the log until the given address. The truncated portion should no longer be accessed as the device is no longer responsible for
/// its maintenance, but physical deletion may not happen immediately.
/// </summary>
/// <param name="toAddress">upper bound of truncated address</param>
/// <param name="callback">callback to invoke when truncation is complete</param>
/// <param name="result">result to be passed to the callback</param>
void TruncateUntilAddressAsync(long toAddress, AsyncCallback callback, IAsyncResult result);
/// <summary>
/// Truncates the log until the given address. The truncated portion should no longer be accessed as the device is no longer responsible for
/// its maintenance, but physical deletion may not happen immediately. This version of the function can block.
/// </summary>
/// <param name="toAddress">upper bound of truncated address</param>
void TruncateUntilAddress(long toAddress);
/// <summary>
/// Truncates the log until the given segment. Physical deletion of the given segments is guaranteed to have happened when the callback is invoked.
/// </summary>
/// <param name="toSegment">the largest (in index) segment to truncate</param>
/// <param name="callback">callback to invoke when truncation is complete</param>
/// <param name="result">result to be passed to the callback</param>
void TruncateUntilSegmentAsync(int toSegment, AsyncCallback callback, IAsyncResult result);
/// <summary>
/// Truncates the log until the given segment. Physical deletion of the given segments is guaranteed to have happened when the function returns.
/// This version of the function can block.
/// </summary>
/// <param name="toSegment">the largest (in index) segment to truncate</param>
void TruncateUntilSegment(int toSegment);
/// <summary>
/// Removes a single segment from the device. This function should not normally be called.
/// Instead, use <see cref="TruncateUntilAddressAsync(long, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segment">index of the segment to remov</param>
/// <param name="callback">callback to invoke when removal is complete</param>
/// <param name="result">result to be passed to the callback</param>
void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result);
/// <summary>
/// Removes a single segment from the device. This function should not normally be called.
/// Instead, use <see cref="TruncateUntilAddressAsync(long, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segment">index of the segment to remov</param>
void RemoveSegment(int segment);
/* Close */
/// <summary>
/// Close
/// </summary>
void Close();
}
}
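// Direct-addressing write sketch (hypothetical buffer and caller; only the
// IDevice members used here are defined above). Callers must supply
// sector-aligned buffers, offsets and lengths, which is why rounding against
// SectorSize appears throughout the allocators in this diff.
internal static class DeviceWriteUsage
{
    internal static uint AlignUp(uint length, uint sectorSize)
        => (length + (sectorSize - 1)) & ~(sectorSize - 1);

    internal static void WriteOnePage(IDevice device, System.IntPtr alignedBuffer, ulong destAddress, uint bytes)
    {
        device.WriteAsync(alignedBuffer, destAddress,
            AlignUp(bytes, device.SectorSize), OnWriteComplete, null);
    }

    private static unsafe void OnWriteComplete(uint errorCode, uint numBytes,
        System.Threading.NativeOverlapped* overlap)
    {
        if (errorCode != 0)
            System.Diagnostics.Trace.TraceError("Write failed: {0}", errorCode);
        System.Threading.Overlapped.Free(overlap);   // as the concrete devices do
    }
}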

@ -0,0 +1,303 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using Microsoft.Win32.SafeHandles;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.InteropServices;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Local storage device
/// </summary>
public class LocalStorageDevice : StorageDeviceBase
{
private readonly bool preallocateFile;
private readonly bool deleteOnClose;
private readonly bool disableFileBuffering;
private readonly SafeConcurrentDictionary<int, SafeFileHandle> logHandles;
/// <summary>
/// Constructor
/// </summary>
/// <param name="filename">File name (or prefix) with path</param>
/// <param name="preallocateFile"></param>
/// <param name="deleteOnClose"></param>
/// <param name="disableFileBuffering"></param>
/// <param name="capacity">The maximum number of bytes this storage device can accommondate, or CAPACITY_UNSPECIFIED if there is no such limit </param>
/// <param name="recoverDevice">Whether to recover device metadata from existing files</param>
public LocalStorageDevice(string filename,
bool preallocateFile = false,
bool deleteOnClose = false,
bool disableFileBuffering = true,
long capacity = Devices.CAPACITY_UNSPECIFIED,
bool recoverDevice = false)
: base(filename, GetSectorSize(filename), capacity)
{
Native32.EnableProcessPrivileges();
this.preallocateFile = preallocateFile;
this.deleteOnClose = deleteOnClose;
this.disableFileBuffering = disableFileBuffering;
logHandles = new SafeConcurrentDictionary<int, SafeFileHandle>();
if (recoverDevice)
RecoverFiles();
}
private void RecoverFiles()
{
FileInfo fi = new FileInfo(FileName); // may not exist
DirectoryInfo di = fi.Directory;
if (!di.Exists) return;
string bareName = fi.Name;
List<int> segids = new List<int>();
foreach (FileInfo item in di.GetFiles(bareName + "*"))
{
segids.Add(Int32.Parse(item.Name.Replace(bareName, "").Replace(".", "")));
}
segids.Sort();
int prevSegmentId = -1;
foreach (int segmentId in segids)
{
if (segmentId != prevSegmentId + 1)
{
startSegment = segmentId;
}
else
{
endSegment = segmentId;
}
prevSegmentId = segmentId;
}
// No need to populate the map because handles are created with FileMode.OpenOrCreate on demand.
}
/// <summary>
///
/// </summary>
/// <param name="segmentId"></param>
/// <param name="sourceAddress"></param>
/// <param name="destinationAddress"></param>
/// <param name="readLength"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public override unsafe void ReadAsync(int segmentId, ulong sourceAddress,
IntPtr destinationAddress,
uint readLength,
IOCompletionCallback callback,
IAsyncResult asyncResult)
{
var logHandle = GetOrAddHandle(segmentId);
Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult);
NativeOverlapped* ovNative = ov.UnsafePack(callback, IntPtr.Zero);
ovNative->OffsetLow = unchecked((int)((ulong)sourceAddress & 0xFFFFFFFF));
ovNative->OffsetHigh = unchecked((int)(((ulong)sourceAddress >> 32) & 0xFFFFFFFF));
bool result = Native32.ReadFile(logHandle,
destinationAddress,
readLength,
out uint bytesRead,
ovNative);
if (!result)
{
int error = Marshal.GetLastWin32Error();
if (error != Native32.ERROR_IO_PENDING)
{
Overlapped.Unpack(ovNative);
Overlapped.Free(ovNative);
throw new Exception("Error reading from log file: " + error);
}
}
}
/// <summary>
///
/// </summary>
/// <param name="sourceAddress"></param>
/// <param name="segmentId"></param>
/// <param name="destinationAddress"></param>
/// <param name="numBytesToWrite"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public override unsafe void WriteAsync(IntPtr sourceAddress,
int segmentId,
ulong destinationAddress,
uint numBytesToWrite,
IOCompletionCallback callback,
IAsyncResult asyncResult)
{
var logHandle = GetOrAddHandle(segmentId);
Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult);
NativeOverlapped* ovNative = ov.UnsafePack(callback, IntPtr.Zero);
ovNative->OffsetLow = unchecked((int)(destinationAddress & 0xFFFFFFFF));
ovNative->OffsetHigh = unchecked((int)((destinationAddress >> 32) & 0xFFFFFFFF));
bool result = Native32.WriteFile(logHandle,
sourceAddress,
numBytesToWrite,
out uint bytesWritten,
ovNative);
if (!result)
{
int error = Marshal.GetLastWin32Error();
if (error != Native32.ERROR_IO_PENDING)
{
Overlapped.Unpack(ovNative);
Overlapped.Free(ovNative);
throw new Exception("Error writing to log file: " + error);
}
}
}
/// <summary>
/// <see cref="IDevice.RemoveSegment(int)"/>
/// </summary>
/// <param name="segment"></param>
public override void RemoveSegment(int segment)
{
if (logHandles.TryRemove(segment, out SafeFileHandle logHandle))
{
logHandle.Dispose();
Native32.DeleteFileW(GetSegmentName(segment));
}
}
/// <summary>
/// <see cref="IDevice.RemoveSegmentAsync(int, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segment"></param>
/// <param name="callback"></param>
/// <param name="result"></param>
public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result)
{
RemoveSegment(segment);
callback(result);
}
// It may be somewhat inefficient to use the default async calls from the base class when the underlying
// method is inherently synchronous. But just for delete (which is called infrequently and off the
// critical path) such inefficiency is probably negligible.
/// <summary>
/// Close device
/// </summary>
public override void Close()
{
foreach (var logHandle in logHandles.Values)
logHandle.Dispose();
}
/// <summary>
///
/// </summary>
/// <param name="segmentId"></param>
/// <returns></returns>
protected string GetSegmentName(int segmentId)
{
return FileName + "." + segmentId;
}
/// <summary>
///
/// </summary>
/// <param name="_segmentId"></param>
/// <returns></returns>
// Can be used to pre-load handles, e.g., after a checkpoint
protected SafeFileHandle GetOrAddHandle(int _segmentId)
{
return logHandles.GetOrAdd(_segmentId, segmentId => CreateHandle(segmentId));
}
private static uint GetSectorSize(string filename)
{
if (!Native32.GetDiskFreeSpace(filename.Substring(0, 3),
out uint lpSectorsPerCluster,
out uint _sectorSize,
out uint lpNumberOfFreeClusters,
out uint lpTotalNumberOfClusters))
{
Debug.WriteLine("Unable to retrieve information for disk " + filename.Substring(0, 3) + " - check if the disk is available and you have specified the full path with drive name. Assuming sector size of 512 bytes.");
_sectorSize = 512;
}
return _sectorSize;
}
private SafeFileHandle CreateHandle(int segmentId)
{
uint fileAccess = Native32.GENERIC_READ | Native32.GENERIC_WRITE;
uint fileShare = unchecked(((uint)FileShare.ReadWrite & ~(uint)FileShare.Inheritable));
uint fileCreation = unchecked((uint)FileMode.OpenOrCreate);
uint fileFlags = Native32.FILE_FLAG_OVERLAPPED;
if (this.disableFileBuffering)
{
fileFlags = fileFlags | Native32.FILE_FLAG_NO_BUFFERING;
}
if (deleteOnClose)
{
fileFlags = fileFlags | Native32.FILE_FLAG_DELETE_ON_CLOSE;
// FILE_SHARE_DELETE allows multiple FASTER instances to share a single log directory and each can specify deleteOnClose.
// This will allow the files to persist until all handles across all instances have been closed.
fileShare = fileShare | Native32.FILE_SHARE_DELETE;
}
var logHandle = Native32.CreateFileW(
GetSegmentName(segmentId),
fileAccess, fileShare,
IntPtr.Zero, fileCreation,
fileFlags, IntPtr.Zero);
if (logHandle.IsInvalid)
{
var error = Marshal.GetLastWin32Error();
throw new IOException($"Error creating log file for {GetSegmentName(segmentId)}, error: {error}", Native32.MakeHRFromErrorCode(error));
}
if (preallocateFile)
SetFileSize(FileName, logHandle, segmentSize);
try
{
ThreadPool.BindHandle(logHandle);
}
catch (Exception e)
{
throw new Exception("Error binding log handle for " + GetSegmentName(segmentId) + ": " + e.ToString());
}
return logHandle;
}
/// <summary>
/// Sets file size to the specified value.
/// Does not reset file seek pointer to original location.
/// </summary>
private bool SetFileSize(string filename, SafeFileHandle logHandle, long size)
{
if (segmentSize <= 0)
return false;
if (Native32.EnableVolumePrivileges(filename, logHandle))
{
return Native32.SetFileSize(logHandle, size);
}
int lodist = (int)size;
int hidist = (int)(size >> 32);
Native32.SetFilePointer(logHandle, lodist, ref hidist, Native32.EMoveMethod.Begin);
if (!Native32.SetEndOfFile(logHandle)) return false;
return true;
}
}
}

@ -0,0 +1,302 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using Microsoft.Win32.SafeHandles;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.InteropServices;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Managed device using .NET streams
/// </summary>
public class ManagedLocalStorageDevice : StorageDeviceBase
{
private readonly bool preallocateFile;
private readonly bool deleteOnClose;
private readonly ConcurrentDictionary<int, Stream> logHandles;
private SectorAlignedBufferPool pool;
/// <summary>
/// Constructs a new ManagedLocalStorageDevice backed by .NET file streams
/// </summary>
/// <param name="filename">File name (or prefix) with path</param>
/// <param name="preallocateFile"></param>
/// <param name="deleteOnClose"></param>
/// <param name="capacity">The maximal number of bytes this storage device can accommondate, or CAPACITY_UNSPECIFIED if there is no such limit</param>
/// <param name="recoverDevice">Whether to recover device metadata from existing files</param>
public ManagedLocalStorageDevice(string filename, bool preallocateFile = false, bool deleteOnClose = false, long capacity = Devices.CAPACITY_UNSPECIFIED, bool recoverDevice = false)
: base(filename, GetSectorSize(filename), capacity)
{
pool = new SectorAlignedBufferPool(1, 1);
this.preallocateFile = preallocateFile;
this.deleteOnClose = deleteOnClose;
logHandles = new ConcurrentDictionary<int, Stream>();
if (recoverDevice)
RecoverFiles();
}
private void RecoverFiles()
{
FileInfo fi = new FileInfo(FileName); // may not exist
DirectoryInfo di = fi.Directory;
if (!di.Exists) return;
string bareName = fi.Name;
List<int> segids = new List<int>();
foreach (FileInfo item in di.GetFiles(bareName + "*"))
{
segids.Add(Int32.Parse(item.Name.Replace(bareName, "").Replace(".", "")));
}
segids.Sort();
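// Recovered segments are expected to be contiguous; a gap in the sorted ids means
// everything before the gap was truncated, so the recovered range restarts there.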
int prevSegmentId = -1;
foreach (int segmentId in segids)
{
if (segmentId != prevSegmentId + 1)
{
startSegment = segmentId;
}
else
{
endSegment = segmentId;
}
prevSegmentId = segmentId;
}
// No need to populate the map, because logHandles creates streams with FileMode.OpenOrCreate on demand.
}
class ReadCallbackWrapper
{
readonly Stream logHandle;
readonly IOCompletionCallback callback;
readonly IAsyncResult asyncResult;
SectorAlignedMemory memory;
readonly IntPtr destinationAddress;
readonly uint readLength;
public ReadCallbackWrapper(Stream logHandle, IOCompletionCallback callback, IAsyncResult asyncResult, SectorAlignedMemory memory, IntPtr destinationAddress, uint readLength)
{
this.logHandle = logHandle;
this.callback = callback;
this.asyncResult = asyncResult;
this.memory = memory;
this.destinationAddress = destinationAddress;
this.readLength = readLength;
}
public unsafe void Callback(IAsyncResult result)
{
uint errorCode = 0;
try
{
logHandle.EndRead(result);
fixed (void* source = memory.buffer)
{
Buffer.MemoryCopy(source, (void*)destinationAddress, readLength, readLength);
}
}
catch
{
errorCode = 1;
}
memory.Return();
Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult);
callback(errorCode, 0, ov.UnsafePack(callback, IntPtr.Zero));
}
}
class WriteCallbackWrapper
{
readonly Stream logHandle;
readonly IOCompletionCallback callback;
readonly IAsyncResult asyncResult;
SectorAlignedMemory memory;
public WriteCallbackWrapper(Stream logHandle, IOCompletionCallback callback, IAsyncResult asyncResult, SectorAlignedMemory memory)
{
this.callback = callback;
this.asyncResult = asyncResult;
this.memory = memory;
this.logHandle = logHandle;
}
public unsafe void Callback(IAsyncResult result)
{
uint errorCode = 0;
try
{
logHandle.EndWrite(result);
}
catch
{
errorCode = 1;
}
memory.Return();
Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult);
callback(errorCode, 0, ov.UnsafePack(callback, IntPtr.Zero));
}
}
/// <summary>
/// Read from the device asynchronously
/// </summary>
/// <param name="segmentId"></param>
/// <param name="sourceAddress"></param>
/// <param name="destinationAddress"></param>
/// <param name="readLength"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public override unsafe void ReadAsync(int segmentId, ulong sourceAddress,
IntPtr destinationAddress,
uint readLength,
IOCompletionCallback callback,
IAsyncResult asyncResult)
{
var logHandle = GetOrAddHandle(segmentId);
var memory = pool.Get((int)readLength);
logHandle.Seek((long)sourceAddress, SeekOrigin.Begin);
logHandle.BeginRead(memory.buffer, 0, (int)readLength,
new ReadCallbackWrapper(logHandle, callback, asyncResult, memory, destinationAddress, readLength).Callback, null);
}
/// <summary>
/// Write to the device asynchronously
/// </summary>
/// <param name="sourceAddress"></param>
/// <param name="segmentId"></param>
/// <param name="destinationAddress"></param>
/// <param name="numBytesToWrite"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public override unsafe void WriteAsync(IntPtr sourceAddress,
int segmentId,
ulong destinationAddress,
uint numBytesToWrite,
IOCompletionCallback callback,
IAsyncResult asyncResult)
{
var logHandle = GetOrAddHandle(segmentId);
var memory = pool.Get((int)numBytesToWrite);
fixed (void* destination = memory.buffer)
{
Buffer.MemoryCopy((void*)sourceAddress, destination, numBytesToWrite, numBytesToWrite);
}
logHandle.Seek((long)destinationAddress, SeekOrigin.Begin);
logHandle.BeginWrite(memory.buffer, 0, (int)numBytesToWrite,
new WriteCallbackWrapper(logHandle, callback, asyncResult, memory).Callback, null);
}
/// <summary>
/// <see cref="IDevice.RemoveSegment(int)"/>
/// </summary>
/// <param name="segment"></param>
public override void RemoveSegment(int segment)
{
if (logHandles.TryRemove(segment, out Stream logHandle))
{
logHandle.Dispose();
File.Delete(GetSegmentName(segment));
}
}
/// <summary>
/// <see cref="IDevice.RemoveSegmentAsync(int, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segment"></param>
/// <param name="callback"></param>
/// <param name="result"></param>
public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result)
{
RemoveSegment(segment);
callback(result);
}
/// <summary>
/// Close device
/// </summary>
public override void Close()
{
foreach (var logHandle in logHandles.Values)
logHandle.Dispose();
pool.Free();
}
private string GetSegmentName(int segmentId)
{
return FileName + "." + segmentId;
}
private static uint GetSectorSize(string filename)
{
#if DOTNETCORE
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
Debug.WriteLine("Assuming 512 byte sector alignment for disk with file " + filename);
return 512;
}
#endif
if (!Native32.GetDiskFreeSpace(filename.Substring(0, 3),
out uint lpSectorsPerCluster,
out uint _sectorSize,
out uint lpNumberOfFreeClusters,
out uint lpTotalNumberOfClusters))
{
Debug.WriteLine("Unable to retrieve information for disk " + filename.Substring(0, 3) + " - check if the disk is available and you have specified the full path with drive name. Assuming sector size of 512 bytes.");
_sectorSize = 512;
}
return _sectorSize;
}
private Stream CreateHandle(int segmentId)
{
FileOptions fo = FileOptions.WriteThrough;
fo |= FileOptions.Asynchronous;
if (deleteOnClose)
fo |= FileOptions.DeleteOnClose;
var logHandle = new FileStream(
GetSegmentName(segmentId), FileMode.OpenOrCreate,
FileAccess.ReadWrite, FileShare.ReadWrite, 4096, fo);
if (preallocateFile && segmentSize != -1)
SetFileSize(FileName, logHandle, segmentSize);
return logHandle;
}
private Stream GetOrAddHandle(int _segmentId)
{
return logHandles.GetOrAdd(_segmentId, segmentId => CreateHandle(segmentId));
}
/// <summary>
/// Sets file size to the specified value.
/// Does not reset file seek pointer to original location.
/// </summary>
/// <param name="filename"></param>
/// <param name="logHandle"></param>
/// <param name="size"></param>
/// <returns></returns>
private bool SetFileSize(string filename, Stream logHandle, long size)
{
logHandle.SetLength(size);
return true;
}
}
}

@ -0,0 +1,88 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Null device: completes reads and writes immediately without performing any I/O
/// </summary>
public class NullDevice : StorageDeviceBase
{
/// <summary>
/// Create a null device
/// </summary>
public NullDevice() : base("null", 512, Devices.CAPACITY_UNSPECIFIED)
{
}
/// <summary>
/// <see cref="IDevice.ReadAsync(int, ulong, IntPtr, uint, IOCompletionCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segmentId"></param>
/// <param name="alignedSourceAddress"></param>
/// <param name="alignedDestinationAddress"></param>
/// <param name="aligned_read_length"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public override unsafe void ReadAsync(int segmentId, ulong alignedSourceAddress, IntPtr alignedDestinationAddress, uint aligned_read_length, IOCompletionCallback callback, IAsyncResult asyncResult)
{
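// Fold the segment id into the bits above the 30-bit in-segment offset
// before packing the combined address into the overlapped structure below.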
alignedSourceAddress = ((ulong)segmentId << 30) | alignedSourceAddress;
Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult);
NativeOverlapped* ov_native = ov.UnsafePack(callback, IntPtr.Zero);
ov_native->OffsetLow = unchecked((int)(alignedSourceAddress & 0xFFFFFFFF));
ov_native->OffsetHigh = unchecked((int)((alignedSourceAddress >> 32) & 0xFFFFFFFF));
callback(0, aligned_read_length, ov_native);
}
/// <summary>
/// <see cref="IDevice.WriteAsync(IntPtr, int, ulong, uint, IOCompletionCallback, IAsyncResult)"/>
/// </summary>
/// <param name="alignedSourceAddress"></param>
/// <param name="segmentId"></param>
/// <param name="alignedDestinationAddress"></param>
/// <param name="numBytesToWrite"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public override unsafe void WriteAsync(IntPtr alignedSourceAddress, int segmentId, ulong alignedDestinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult)
{
alignedDestinationAddress = ((ulong)segmentId << 30) | alignedDestinationAddress;
Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult);
NativeOverlapped* ov_native = ov.UnsafePack(callback, IntPtr.Zero);
ov_native->OffsetLow = unchecked((int)(alignedDestinationAddress & 0xFFFFFFFF));
ov_native->OffsetHigh = unchecked((int)((alignedDestinationAddress >> 32) & 0xFFFFFFFF));
callback(0, numBytesToWrite, ov_native);
}
/// <summary>
/// <see cref="IDevice.RemoveSegment(int)"/>
/// </summary>
/// <param name="segment"></param>
public override void RemoveSegment(int segment)
{
// No-op
}
/// <summary>
/// <see cref="IDevice.RemoveSegmentAsync(int, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segment"></param>
/// <param name="callback"></param>
/// <param name="result"></param>
public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result) => callback(result);
/// <summary>
/// <see cref="IDevice.Close"/>
/// </summary>
public override void Close()
{
}
}
}

@ -0,0 +1,312 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Interface that encapsulates a sharding strategy that is used by <see cref="ShardedStorageDevice"/>. This
/// allows users to customize their sharding behaviors. Some default implementations are supplied for common
/// partitioning schemes.
/// </summary>
interface IPartitionScheme
{
/// <summary>
/// A list of <see cref="IDevice"/> that represents the shards. Indexes into this list will be
/// used as unique identifiers for the shards.
/// </summary>
IList<IDevice> Devices { get; }
/// <summary>
/// Maps a range in the unified logical address space into a contiguous physical chunk on a shard's address space.
/// Because the given range may be sharded across multiple devices, only the largest contiguous chunk starting from
/// start address but smaller than end address is returned in shard, shardStartAddress, and shardEndAddress.
/// </summary>
/// <param name="startAddress">start address of the range to map in the logical address space</param>
/// <param name="endAddress">end address of the range to map in the logical address space</param>
/// <param name="shard"> the shard (potentially part of) the given range resides in, given as index into <see cref="Devices"/></param>
/// <param name="shardStartAddress"> start address translated into physical start address on the returned shard </param>
/// <param name="shardEndAddress">
/// physical address of the end of the part of the range on the returned shard. This is not necessarily a translation of the end address
/// given, as the tail of the range may be on (a) different device(s).
/// </param>
/// <returns>
/// the logical address translated from the returned shardEndAddress. If this is not equal to the given end address, the caller is
/// expected to repeatedly call this method using the returned value as the new startAddress until the entire original range is
/// covered.
/// </returns>
long MapRange(long startAddress, long endAddress, out int shard, out long shardStartAddress, out long shardEndAddress);
/// <summary>
/// Maps the sector size of a composed device into sector sizes for each shard
/// </summary>
/// <param name="sectorSize">sector size of the composed device</param>
/// <param name="shard">the shard</param>
/// <returns>sector size on shard</returns>
long MapSectorSize(long sectorSize, int shard);
}
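// Illustrative sketch (not part of the original sources): how a caller is expected
// to consume MapRange, per the contract documented above. The helper class below
// and the 'issueIo' delegate are hypothetical names introduced only for this example.
internal static class PartitionSchemeExample
{
    // Walks a logical range shard by shard, invoking issueIo once per
    // contiguous physical chunk until the whole range is covered.
    internal static void ForEachChunk(IPartitionScheme scheme, long start, long end,
        Action<int, long, long> issueIo)
    {
        long current = start;
        while (current < end)
        {
            current = scheme.MapRange(current, end, out int shard,
                out long shardStart, out long shardEnd);
            issueIo(shard, shardStart, shardEnd);
        }
    }
}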
/// <summary>
/// Uniformly shards data across given devices.
/// </summary>
class UniformPartitionScheme : IPartitionScheme
{
public IList<IDevice> Devices { get; }
private readonly long chunkSize;
/// <summary>
/// Constructs a UniformPartitionScheme to shard data uniformly across given devices. Suppose we have 3 devices and the following logical write:
/// [chunk 1][chunk 2][chunk 3][chunk 4]...
/// chunk 1 is written on device 0, 2 on device 1, 3 on device 2, 4 on device 0, etc.
/// </summary>
/// <param name="chunkSize">size of each chunk</param>
/// <param name="devices">the devices to compose from</param>
public UniformPartitionScheme(long chunkSize, IList<IDevice> devices)
{
Debug.Assert(devices.Count != 0, "There cannot be zero shards");
Debug.Assert(chunkSize > 0, "chunk size must be positive");
Debug.Assert((chunkSize & (chunkSize - 1)) == 0, "Chunk size must be a power of 2");
this.Devices = devices;
this.chunkSize = chunkSize;
foreach (IDevice device in Devices)
{
Debug.Assert(chunkSize % device.SectorSize == 0, "A single device sector cannot be partitioned");
}
}
/// <summary>
/// vararg version of <see cref="UniformPartitionScheme(long, IList{IDevice})"/>
/// </summary>
/// <param name="chunkSize"></param>
/// <param name="devices"></param>
public UniformPartitionScheme(long chunkSize, params IDevice[] devices) : this(chunkSize, (IList<IDevice>)devices)
{
}
/// <summary>
/// <see cref="IPartitionScheme.MapRange(long, long, out int, out long, out long)"/>
/// </summary>
/// <param name="startAddress"></param>
/// <param name="endAddress"></param>
/// <param name="shard"></param>
/// <param name="shardStartAddress"></param>
/// <param name="shardEndAddress"></param>
/// <returns></returns>
public long MapRange(long startAddress, long endAddress, out int shard, out long shardStartAddress, out long shardEndAddress)
{
long chunkId = startAddress / chunkSize;
shard = (int)(chunkId % Devices.Count);
shardStartAddress = chunkId / Devices.Count * chunkSize + startAddress % chunkSize;
long chunkEndAddress = (chunkId + 1) * chunkSize;
if (endAddress > chunkEndAddress)
{
shardEndAddress = shardStartAddress + chunkSize;
return chunkEndAddress;
}
else
{
shardEndAddress = endAddress - startAddress + shardStartAddress;
return endAddress;
}
}
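// Worked example (illustrative): with chunkSize = 1024 and 3 devices, mapping the
// chunk-aligned range [4096, 9000) gives chunkId = 4, shard = 4 % 3 = 1, and
// shardStartAddress = 4 / 3 * 1024 = 1024. The chunk ends at logical address
// 5 * 1024 = 5120 < 9000, so shardEndAddress = 2048 and 5120 is returned as the
// startAddress for the caller's next MapRange call.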
/// <summary>
/// <see cref="IPartitionScheme.MapSectorSize(long, int)"/>
/// </summary>
/// <param name="sectorSize"></param>
/// <param name="shard"></param>
/// <returns></returns>
public long MapSectorSize(long sectorSize, int shard)
{
var numChunks = sectorSize / chunkSize;
// ceiling of (a div b) is (a + b - 1) / b where div is mathematical division and / is integer division
return (numChunks + Devices.Count - 1) / Devices.Count * chunkSize;
}
}
/// <summary>
/// A <see cref="ShardedStorageDevice"/> logically composes multiple <see cref="IDevice"/> into a single storage device
/// by sharding writes into different devices according to a supplied <see cref="IPartitionScheme"/>. The goal is to be
/// able to issue large reads and writes in parallel into multiple devices and improve throughput. Beware that this
/// code does not contain error detection or correction mechanism to cope with increased failure from more devices.
/// </summary>
class ShardedStorageDevice : StorageDeviceBase
{
private readonly IPartitionScheme partitions;
/// <summary>
/// Constructs a new ShardedStorageDevice with the given partition scheme
/// </summary>
/// <param name="partitions"> The parition scheme to use </param>
public ShardedStorageDevice(IPartitionScheme partitions) : base("", 512, -1)
{
this.partitions = partitions;
}
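// Usage sketch (illustrative; file names are hypothetical):
//   var d0 = new ManagedLocalStorageDevice("shard0.log");
//   var d1 = new ManagedLocalStorageDevice("shard1.log");
//   var sharded = new ShardedStorageDevice(new UniformPartitionScheme(1 << 20, d0, d1));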
/// <summary>
/// <see cref="IDevice.Close"/>
/// </summary>
public override void Close()
{
foreach (IDevice device in partitions.Devices)
{
device.Close();
}
}
/// <summary>
/// <see cref="IDevice.Initialize(long, LightEpoch)"/>
/// </summary>
/// <param name="segmentSize"></param>
/// <param name="epoch"></param>
public override void Initialize(long segmentSize, LightEpoch epoch)
{
base.Initialize(segmentSize, epoch);
for (int i = 0; i < partitions.Devices.Count; i++)
{
partitions.Devices[i].Initialize(partitions.MapSectorSize(segmentSize, 0), epoch);
}
}
/// <summary>
/// <see cref="IDevice.RemoveSegmentAsync(int, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segment"></param>
/// <param name="callback"></param>
/// <param name="result"></param>
public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result)
{
var countdown = new CountdownEvent(partitions.Devices.Count);
foreach (IDevice shard in partitions.Devices)
{
shard.RemoveSegmentAsync(segment, ar =>
{
if (countdown.Signal())
{
callback(ar);
countdown.Dispose();
}
}, result);
}
}
/// <summary>
/// <see cref="IDevice.WriteAsync(IntPtr, int, ulong, uint, IOCompletionCallback, IAsyncResult)"/>
/// </summary>
/// <param name="sourceAddress"></param>
/// <param name="segmentId"></param>
/// <param name="destinationAddress"></param>
/// <param name="numBytesToWrite"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public unsafe override void WriteAsync(IntPtr sourceAddress, int segmentId, ulong destinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult)
{
// Starts at one, to prevent writes that complete early from invoking the callback before all parallel writes have been issued.
var countdown = new CountdownEvent(1);
long currentWriteStart = (long)destinationAddress;
long writeEnd = currentWriteStart + (long)numBytesToWrite;
uint aggregateErrorCode = 0;
while (currentWriteStart < writeEnd)
{
long newStart = partitions.MapRange(currentWriteStart, writeEnd, out int shard, out long shardStartAddress, out long shardEndAddress);
ulong writeOffset = (ulong)currentWriteStart - destinationAddress;
// Indicate that there is one more task to wait for
countdown.AddCount();
// Because more than one device can return with an error, it is important that we remember the most recent error code we saw. (It is okay to only
// report one error out of many. It will be as if we failed on that error and cancelled all other writes, even though we issue writes in parallel and
// wait until all of them are complete in the implementation)
// Can there be races on async result as we issue writes or reads in parallel?
partitions.Devices[shard].WriteAsync(IntPtr.Add(sourceAddress, (int)writeOffset),
segmentId,
(ulong)shardStartAddress,
(uint)(shardEndAddress - shardStartAddress),
(e, n, o) =>
{
// TODO: Check if it is incorrect to ignore o
if (e != 0) aggregateErrorCode = e;
if (countdown.Signal())
{
callback(aggregateErrorCode, n, o);
countdown.Dispose();
}
else
{
Overlapped.Free(o);
}
},
asyncResult);
currentWriteStart = newStart;
}
// TODO: Check if overlapped wrapper is handled correctly
if (countdown.Signal())
{
Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult);
NativeOverlapped* ovNative = ov.UnsafePack(callback, IntPtr.Zero);
callback(aggregateErrorCode, numBytesToWrite, ovNative);
countdown.Dispose();
}
}
/// <summary>
/// <see cref="IDevice.ReadAsync(int, ulong, IntPtr, uint, IOCompletionCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segmentId"></param>
/// <param name="sourceAddress"></param>
/// <param name="destinationAddress"></param>
/// <param name="readLength"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public unsafe override void ReadAsync(int segmentId, ulong sourceAddress, IntPtr destinationAddress, uint readLength, IOCompletionCallback callback, IAsyncResult asyncResult)
{
// Starts at one, to prevent reads that complete early from invoking the callback before all parallel reads have been issued.
var countdown = new CountdownEvent(1);
long currentReadStart = (long)sourceAddress;
long readEnd = currentReadStart + readLength;
uint aggregateErrorCode = 0;
while (currentReadStart < readEnd)
{
long newStart = partitions.MapRange(currentReadStart, readEnd, out int shard, out long shardStartAddress, out long shardEndAddress);
ulong readOffset = (ulong)currentReadStart - sourceAddress;
// Because more than one device can return with an error, it is important that we remember the most recent error code we saw. (It is okay to only
// report one error out of many. It will be as if we failed on that error and cancelled all other reads, even though we issue reads in parallel and
// wait until all of them are complete in the implementation)
countdown.AddCount();
partitions.Devices[shard].ReadAsync(segmentId,
(ulong)shardStartAddress,
IntPtr.Add(destinationAddress, (int)readOffset),
(uint)(shardEndAddress - shardStartAddress),
(e, n, o) =>
{
// TODO: this is incorrect if returned "bytes" written is allowed to be less than requested like POSIX.
if (e != 0) aggregateErrorCode = e;
if (countdown.Signal())
{
callback(aggregateErrorCode, n, o);
countdown.Dispose();
}
else
{
Overlapped.Free(o);
}
},
asyncResult);
currentReadStart = newStart;
}
// TODO: Check handling of overlapped wrapper
if (countdown.Signal())
{
Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult);
NativeOverlapped* ovNative = ov.UnsafePack(callback, IntPtr.Zero);
callback(aggregateErrorCode, readLength, ovNative);
countdown.Dispose();
}
}
}
}

@ -0,0 +1,279 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using Microsoft.Win32.SafeHandles;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;
namespace FASTER.core
{
/// <summary>
/// Base class for storage devices; handles segment bookkeeping and logical-address-to-segment mapping
/// </summary>
public abstract class StorageDeviceBase : IDevice
{
/// <summary>
/// <see cref="IDevice.SectorSize"/>
/// </summary>
public uint SectorSize { get; }
/// <summary>
/// <see cref="IDevice.FileName"/>
/// </summary>
public string FileName { get; }
/// <summary>
/// <see cref="IDevice.Capacity"/>
/// </summary>
public long Capacity { get; }
/// <summary>
/// <see cref="IDevice.StartSegment"/>
/// </summary>
public int StartSegment { get { return startSegment; } }
/// <summary>
/// <see cref="IDevice.EndSegment"/>
/// </summary>
public int EndSegment { get { return endSegment; } }
/// <summary>
/// <see cref="IDevice.SegmentSize"/>
/// </summary>
public long SegmentSize { get { return segmentSize; } }
/// <summary>
/// Segment size
/// </summary>
protected long segmentSize;
private int segmentSizeBits;
private ulong segmentSizeMask;
/// <summary>
/// Instance of the epoch protection framework in the current system.
/// A device may have internal in-memory data structure that requires epoch protection under concurrent access.
/// </summary>
protected LightEpoch epoch;
/// <summary>
/// start and end segment corresponding to <see cref="StartSegment"/> and <see cref="EndSegment"/>. Subclasses are
/// allowed to modify these as needed.
/// </summary>
protected int startSegment = 0, endSegment = -1;
/// <summary>
/// Initializes a new StorageDeviceBase
/// </summary>
/// <param name="filename">Name of the file to use</param>
/// <param name="sectorSize">The smallest unit of write of the underlying storage device (e.g. 512 bytes for a disk) </param>
/// <param name="capacity">The maximal number of bytes this storage device can accommondate, or CAPAPCITY_UNSPECIFIED if there is no such limit </param>
public StorageDeviceBase(string filename, uint sectorSize, long capacity)
{
FileName = filename;
SectorSize = sectorSize;
segmentSize = -1;
segmentSizeBits = 64;
segmentSizeMask = ~0UL;
Capacity = capacity;
}
/// <summary>
/// Initialize device
/// </summary>
/// <param name="segmentSize"></param>
/// <param name="epoch"></param>
public virtual void Initialize(long segmentSize, LightEpoch epoch = null)
{
Debug.Assert(Capacity == -1 || Capacity % segmentSize == 0, "capacity must be a multiple of the segment size");
this.segmentSize = segmentSize;
this.epoch = epoch;
if (!Utility.IsPowerOfTwo(segmentSize))
{
if (segmentSize != -1)
throw new Exception("Invalid segment size: " + segmentSize);
segmentSizeBits = 64;
segmentSizeMask = ~0UL;
}
else
{
segmentSizeBits = Utility.GetLogBase2((ulong)segmentSize);
segmentSizeMask = (ulong)segmentSize - 1;
}
}
/// <summary>
/// Write to the device at a logical address; the segment is derived from the address
/// </summary>
/// <param name="alignedSourceAddress"></param>
/// <param name="alignedDestinationAddress"></param>
/// <param name="numBytesToWrite"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public void WriteAsync(IntPtr alignedSourceAddress, ulong alignedDestinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult)
{
int segment = (int)(segmentSizeBits < 64 ? alignedDestinationAddress >> segmentSizeBits : 0);
// If the device has bounded space, and we are writing a new segment, need to check whether an existing segment needs to be evicted.
if (Capacity != Devices.CAPACITY_UNSPECIFIED && Utility.MonotonicUpdate(ref endSegment, segment, out int oldEnd))
{
// Attempt to update the stored range until there is enough space on the tier to accommodate the current logTail
int newStartSegment = endSegment - (int)(Capacity >> segmentSizeBits);
// Assuming that we still have enough physical capacity to write another segment, even if delete does not immediately free up space.
TruncateUntilSegmentAsync(newStartSegment, r => { }, null);
}
WriteAsync(
alignedSourceAddress,
segment,
alignedDestinationAddress & segmentSizeMask,
numBytesToWrite, callback, asyncResult);
}
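// Worked example (illustrative): with 1 GB segments (segmentSizeBits = 30,
// segmentSizeMask = 2^30 - 1), alignedDestinationAddress = 0x90000000 maps to
// segment 2 at in-segment offset 0x10000000.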
/// <summary>
/// Read from the device at a logical address; the segment is derived from the address
/// </summary>
/// <param name="alignedSourceAddress"></param>
/// <param name="alignedDestinationAddress"></param>
/// <param name="aligned_read_length"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public void ReadAsync(ulong alignedSourceAddress, IntPtr alignedDestinationAddress, uint aligned_read_length, IOCompletionCallback callback, IAsyncResult asyncResult)
{
var segment = segmentSizeBits < 64 ? alignedSourceAddress >> segmentSizeBits : 0;
ReadAsync(
(int)segment,
alignedSourceAddress & segmentSizeMask,
alignedDestinationAddress,
aligned_read_length, callback, asyncResult);
}
/// <summary>
/// <see cref="IDevice.RemoveSegmentAsync(int, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segment"></param>
/// <param name="callback"></param>
/// <param name="result"></param>
public abstract void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result);
/// <summary>
/// <see cref="IDevice.RemoveSegment(int)"/>
/// By default the implementation calls into <see cref="RemoveSegmentAsync(int, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="segment"></param>
public virtual void RemoveSegment(int segment)
{
ManualResetEventSlim completionEvent = new ManualResetEventSlim(false);
RemoveSegmentAsync(segment, r => completionEvent.Set(), null);
completionEvent.Wait();
}
/// <summary>
/// <see cref="IDevice.TruncateUntilSegmentAsync(int, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="toSegment"></param>
/// <param name="callback"></param>
/// <param name="result"></param>
public void TruncateUntilSegmentAsync(int toSegment, AsyncCallback callback, IAsyncResult result)
{
// Advance the start segment to at least toSegment
if (!Utility.MonotonicUpdate(ref startSegment, toSegment, out int oldStart))
{
// If no-op, invoke callback and return immediately
callback(result);
return;
}
CountdownEvent countdown = new CountdownEvent(toSegment - oldStart);
// This action needs to be epoch-protected because readers may be issuing reads to the deleted segment, unaware of the delete.
// Because of earlier compare-and-swap, the caller has exclusive access to the range [oldStartSegment, newStartSegment), and there will
// be no double deletes.
epoch.BumpCurrentEpoch(() =>
{
for (int i = oldStart; i < toSegment; i++)
{
RemoveSegmentAsync(i, r => {
if (countdown.Signal())
{
callback(r);
countdown.Dispose();
}
}, result);
}
});
}
/// <summary>
/// <see cref="IDevice.TruncateUntilSegment(int)"/>
/// </summary>
/// <param name="toSegment"></param>
public void TruncateUntilSegment(int toSegment)
{
using (ManualResetEventSlim completionEvent = new ManualResetEventSlim(false))
{
TruncateUntilSegmentAsync(toSegment, r => completionEvent.Set(), null);
completionEvent.Wait();
}
}
/// <summary>
/// <see cref="IDevice.TruncateUntilAddressAsync(long, AsyncCallback, IAsyncResult)"/>
/// </summary>
/// <param name="toAddress"></param>
/// <param name="callback"></param>
/// <param name="result"></param>
public virtual void TruncateUntilAddressAsync(long toAddress, AsyncCallback callback, IAsyncResult result)
{
// Truncate only up to segment boundary if address is not aligned
TruncateUntilSegmentAsync((int)(toAddress >> segmentSizeBits), callback, result);
}
/// <summary>
/// <see cref="IDevice.TruncateUntilAddress(long)"/>
/// </summary>
/// <param name="toAddress"></param>
public virtual void TruncateUntilAddress(long toAddress)
{
using (ManualResetEventSlim completionEvent = new ManualResetEventSlim(false))
{
TruncateUntilAddressAsync(toAddress, r => completionEvent.Set(), null);
completionEvent.Wait();
}
}
/// <summary>
/// Write to a specific segment of the device
/// </summary>
/// <param name="sourceAddress"></param>
/// <param name="segmentId"></param>
/// <param name="destinationAddress"></param>
/// <param name="numBytesToWrite"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public abstract void WriteAsync(IntPtr sourceAddress, int segmentId, ulong destinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult);
/// <summary>
/// Read from a specific segment of the device
/// </summary>
/// <param name="segmentId"></param>
/// <param name="sourceAddress"></param>
/// <param name="destinationAddress"></param>
/// <param name="readLength"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
public abstract void ReadAsync(int segmentId, ulong sourceAddress, IntPtr destinationAddress, uint readLength, IOCompletionCallback callback, IAsyncResult asyncResult);
/// <summary>
/// Close the device
/// </summary>
public abstract void Close();
}
}

@ -0,0 +1,176 @@
using System;
using System.Collections.Generic;
using System.Text;
using System.Diagnostics;
using System.Threading;
using System.ComponentModel;
using System.Collections.Concurrent;
namespace FASTER.core
{
/// <summary>
/// A <see cref="TieredStorageDevice"/> logically composes multiple <see cref="IDevice"/> into a single storage device. It is assumed
/// that some <see cref="IDevice"/> are used as caches while there is one that is considered the commit point, i.e. when a write is completed
/// on the device, it is considered persistent. Reads are served from the closest device with available data. Writes are issued in parallel to
/// all devices.
/// </summary>
class TieredStorageDevice : StorageDeviceBase
{
private readonly IList<IDevice> devices;
private readonly int commitPoint;
/// <summary>
/// Constructs a new TieredStorageDevice composed of the given devices.
/// </summary>
/// <param name="commitPoint">
/// The index of an <see cref="IDevice">IDevice</see> in <see cref="devices"/>. When a write has been completed on the device,
/// the write is considered persistent. It is guaranteed that the callback in <see cref="WriteAsync(IntPtr, int, ulong, uint, IOCompletionCallback, IAsyncResult)"/>
/// will not be called until the write is completed on the commit point device.
/// </param>
/// <param name="devices">
/// List of devices to be used. The list should be given in order of hot to cold. Read is served from the
/// device with smallest index in the list that has the requested data
/// </param>
public TieredStorageDevice(int commitPoint, IList<IDevice> devices) : base(ComputeFileString(devices, commitPoint), 512, ComputeCapacity(devices))
{
Debug.Assert(commitPoint >= 0 && commitPoint < devices.Count, "commit point is out of range");
this.devices = devices;
this.commitPoint = commitPoint;
}
/// <summary>
/// Constructs a new TieredStorageDevice composed of the given devices.
/// </summary>
/// <param name="commitPoint">
/// The index of an <see cref="IDevice">IDevice</see> in <see cref="devices">devices</see>. When a write has been completed on the device,
/// the write is considered persistent. It is guaranteed that the callback in <see cref="WriteAsync(IntPtr, int, ulong, uint, IOCompletionCallback, IAsyncResult)"/>
/// will not be called until the write is completed on commit point device and all previous tiers.
/// </param>
/// <param name="devices">
/// List of devices to be used. The list should be given in order of hot to cold. Read is served from the
/// device with smallest index in the list that has the requested data
/// </param>
public TieredStorageDevice(int commitPoint, params IDevice[] devices) : this(commitPoint, (IList<IDevice>)devices)
{
}
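// Usage sketch (illustrative; device names are hypothetical): a fast cache tier
// in front of a persistent tier, where tier 1 is the commit point:
//   var hot = new ManagedLocalStorageDevice("hot.log", deleteOnClose: true);
//   var cold = new ManagedLocalStorageDevice("cold.log");
//   var tiered = new TieredStorageDevice(1, hot, cold);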
public override void Initialize(long segmentSize, LightEpoch epoch)
{
base.Initialize(segmentSize, epoch);
foreach (IDevice device in devices)
{
device.Initialize(segmentSize, epoch);
}
}
public override void Close()
{
foreach (IDevice device in devices)
{
device.Close();
}
}
public override void ReadAsync(int segmentId, ulong sourceAddress, IntPtr destinationAddress, uint readLength, IOCompletionCallback callback, IAsyncResult asyncResult)
{
// This device is epoch-protected and cannot be stale while the operation is in flight
IDevice closestDevice = devices[FindClosestDeviceContaining(segmentId)];
// We can directly forward the address, because assuming an inclusive policy, all devices agree on the same address space. The only difference is that some segments may not
// be present for certain devices.
closestDevice.ReadAsync(segmentId, sourceAddress, destinationAddress, readLength, callback, asyncResult);
}
public override unsafe void WriteAsync(IntPtr sourceAddress, int segmentId, ulong destinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult)
{
int startTier = FindClosestDeviceContaining(segmentId);
Debug.Assert(startTier <= commitPoint, "Write should not elide the commit point");
var countdown = new CountdownEvent(commitPoint + 1); // number of devices to wait on
// Issue writes to all tiers in parallel
for (int i = startTier; i < devices.Count; i++)
{
if (i <= commitPoint)
{
// All tiers up to and including the commit point need to be persistent before the callback is invoked.
devices[i].WriteAsync(sourceAddress, segmentId, destinationAddress, numBytesToWrite, (e, n, o) =>
{
// The last tier to finish invokes the callback
if (countdown.Signal())
{
callback(e, n, o);
countdown.Dispose();
}
}, asyncResult);
}
else
{
// Otherwise, simply issue the write without caring about callbacks
devices[i].WriteAsync(sourceAddress, segmentId, destinationAddress, numBytesToWrite, (e, n, o) => { }, null);
}
}
}
public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result)
{
int startTier = FindClosestDeviceContaining(segment);
var countdown = new CountdownEvent(devices.Count);
for(int i = startTier; i < devices.Count; i++)
{
devices[i].RemoveSegmentAsync(segment, r =>
{
if (countdown.Signal())
{
callback(r);
countdown.Dispose();
}
}, result);
}
}
private static long ComputeCapacity(IList<IDevice> devices)
{
long result = 0;
// The capacity of a tiered storage device is the largest capacity among its tiers
foreach (IDevice device in devices)
{
// Unless the last tier device has unspecified storage capacity, in which case the tiered storage also has unspecified capacity
if (device.Capacity == Devices.CAPACITY_UNSPECIFIED)
{
Debug.Assert(device == devices[devices.Count - 1], "Only the last tier storage of a tiered storage device can have unspecified capacity");
return Devices.CAPACITY_UNSPECIFIED;
}
result = Math.Max(result, device.Capacity);
}
return result;
}
private static string ComputeFileString(IList<IDevice> devices, int commitPoint)
{
StringBuilder result = new StringBuilder();
foreach (IDevice device in devices)
{
string formatString = "{0}, file name {1}, capacity {2} bytes;";
string capacity = device.Capacity == Devices.CAPACITY_UNSPECIFIED ? "unspecified" : device.Capacity.ToString();
result.AppendFormat(formatString, device.GetType().Name, device.FileName, capacity);
}
result.AppendFormat("commit point: {0} at tier {1}", devices[commitPoint].GetType().Name, commitPoint);
return result.ToString();
}
private int FindClosestDeviceContaining(int segment)
{
// Could use binary search, but (1) it might not be faster than a linear scan on an array assumed to be small, and (2) C#'s built-in binary search does not guarantee that the first of several duplicate elements is returned.
// Therefore we stick with the simpler linear scan for now.
for (int i = 0; i < devices.Count; i++)
{
if (devices[i].StartSegment <= segment) return i;
}
throw new ArgumentException("No such address exists");
}
}
}

@ -0,0 +1,81 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Net;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Fast implementation of instance-thread-local variables
/// </summary>
/// <typeparam name="T"></typeparam>
internal class FastThreadLocal<T>
{
// Max instances supported
private const int kMaxInstances = 128;
[ThreadStatic]
private static T[] tl_values;
[ThreadStatic]
private static int[] tl_iid;
private readonly int offset;
private readonly int iid;
private static readonly int[] instances = new int[kMaxInstances];
private static int instanceId = 0;
public FastThreadLocal()
{
iid = Interlocked.Increment(ref instanceId);
for (int i = 0; i < kMaxInstances; i++)
{
if (0 == Interlocked.CompareExchange(ref instances[i], iid, 0))
{
offset = i;
return;
}
}
throw new Exception("Unsupported number of simultaneous instances");
}
public void InitializeThread()
{
if (tl_values == null)
{
tl_values = new T[kMaxInstances];
tl_iid = new int[kMaxInstances];
}
if (tl_iid[offset] != iid)
{
tl_iid[offset] = iid;
tl_values[offset] = default(T);
}
}
public void DisposeThread()
{
tl_values[offset] = default(T);
tl_iid[offset] = 0;
}
/// <summary>
/// Dispose instance for all threads
/// </summary>
public void Dispose()
{
instances[offset] = 0;
}
public T Value
{
get => tl_values[offset];
set => tl_values[offset] = value;
}
public bool IsInitializedForThread => (tl_values != null) && (iid == tl_iid[offset]);
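// Usage sketch (illustrative): each thread must call InitializeThread() before
// first touching Value, and DisposeThread() before it exits:
//   var local = new FastThreadLocal<int>();
//   local.InitializeThread();
//   local.Value++;
//   local.DisposeThread();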
}
}

@ -0,0 +1,450 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Threading;
using System.Runtime.InteropServices;
using System.Runtime.CompilerServices;
using System.Diagnostics;
namespace FASTER.core
{
/// <summary>
/// Epoch protection
/// </summary>
public unsafe class LightEpoch
{
/// <summary>
/// Default invalid index entry.
/// </summary>
private const int kInvalidIndex = 0;
/// <summary>
/// Default number of entries in the entries table
/// </summary>
public const int kTableSize = 128;
/// <summary>
/// Default drainlist size
/// </summary>
private const int kDrainListSize = 16;
/// <summary>
/// Thread protection status entries.
/// </summary>
private Entry[] tableRaw;
private GCHandle tableHandle;
private Entry* tableAligned;
private static Entry[] threadIndex;
private static GCHandle threadIndexHandle;
private static Entry* threadIndexAligned;
/// <summary>
/// List of (action, epoch) pairs containing actions to be performed
/// when an epoch becomes safe to reclaim.
/// </summary>
private int drainCount = 0;
private readonly EpochActionPair[] drainList = new EpochActionPair[kDrainListSize];
/// <summary>
/// A thread's entry in the epoch table.
/// </summary>
[ThreadStatic]
private static int threadEntryIndex;
/// <summary>
/// Number of instances using this entry
/// </summary>
[ThreadStatic]
private static int threadEntryIndexCount;
[ThreadStatic]
static int threadId;
/// <summary>
/// Global current epoch value
/// </summary>
public int CurrentEpoch;
/// <summary>
/// Cached value of latest epoch that is safe to reclaim
/// </summary>
public int SafeToReclaimEpoch;
/// <summary>
/// Static constructor to setup shared cache-aligned space
/// to store per-entry count of instances using that entry
/// </summary>
static LightEpoch()
{
// Over-allocate to do cache-line alignment
threadIndex = new Entry[kTableSize + 2];
threadIndexHandle = GCHandle.Alloc(threadIndex, GCHandleType.Pinned);
long p = (long)threadIndexHandle.AddrOfPinnedObject();
// Force the pointer to align to 64-byte boundaries
long p2 = (p + (Constants.kCacheLineBytes - 1)) & ~(Constants.kCacheLineBytes - 1);
threadIndexAligned = (Entry*)p2;
}
/// <summary>
/// Instantiate the epoch table
/// </summary>
public LightEpoch()
{
// Over-allocate to do cache-line alignment
tableRaw = new Entry[kTableSize + 2];
tableHandle = GCHandle.Alloc(tableRaw, GCHandleType.Pinned);
long p = (long)tableHandle.AddrOfPinnedObject();
// Force the pointer to align to 64-byte boundaries
long p2 = (p + (Constants.kCacheLineBytes - 1)) & ~(Constants.kCacheLineBytes - 1);
tableAligned = (Entry*)p2;
CurrentEpoch = 1;
SafeToReclaimEpoch = 0;
for (int i = 0; i < kDrainListSize; i++)
drainList[i].epoch = int.MaxValue;
drainCount = 0;
}
/// <summary>
/// Clean up epoch table
/// </summary>
public void Dispose()
{
tableHandle.Free();
tableAligned = null;
tableRaw = null;
CurrentEpoch = 1;
SafeToReclaimEpoch = 0;
}
/// <summary>
/// Check whether current thread is protected
/// </summary>
/// <returns>Result of the check</returns>
public bool IsProtected()
{
return kInvalidIndex != threadEntryIndex;
}
/// <summary>
/// Enter the thread into the protected code region
/// </summary>
/// <returns>Current epoch</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int ProtectAndDrain()
{
int entry = threadEntryIndex;
(*(tableAligned + entry)).threadId = threadEntryIndex;
(*(tableAligned + entry)).localCurrentEpoch = CurrentEpoch;
if (drainCount > 0)
{
Drain((*(tableAligned + entry)).localCurrentEpoch);
}
return (*(tableAligned + entry)).localCurrentEpoch;
}
/// <summary>
/// Check and invoke trigger actions that are ready
/// </summary>
/// <param name="nextEpoch">Next epoch</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void Drain(int nextEpoch)
{
ComputeNewSafeToReclaimEpoch(nextEpoch);
for (int i = 0; i < kDrainListSize; i++)
{
var trigger_epoch = drainList[i].epoch;
if (trigger_epoch <= SafeToReclaimEpoch)
{
if (Interlocked.CompareExchange(ref drainList[i].epoch, int.MaxValue - 1, trigger_epoch) == trigger_epoch)
{
var trigger_action = drainList[i].action;
drainList[i].action = null;
drainList[i].epoch = int.MaxValue;
trigger_action();
if (Interlocked.Decrement(ref drainCount) == 0) break;
}
}
}
}
/// <summary>
/// Thread acquires its epoch entry
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Acquire()
{
if (threadEntryIndex == kInvalidIndex)
threadEntryIndex = ReserveEntryForThread();
threadEntryIndexCount++;
}
/// <summary>
/// Thread releases its epoch entry
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Release()
{
int entry = threadEntryIndex;
(*(tableAligned + entry)).localCurrentEpoch = 0;
(*(tableAligned + entry)).threadId = 0;
threadEntryIndexCount--;
if (threadEntryIndexCount == 0)
{
(threadIndexAligned + threadEntryIndex)->threadId = 0;
threadEntryIndex = kInvalidIndex;
}
}
/// <summary>
/// Thread suspends its epoch entry
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Suspend()
{
Release();
}
/// <summary>
/// Thread resumes its epoch entry
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Resume()
{
Acquire();
ProtectAndDrain();
}
/// <summary>
/// Increment global current epoch
/// </summary>
/// <returns></returns>
public int BumpCurrentEpoch()
{
int nextEpoch = Interlocked.Add(ref CurrentEpoch, 1);
if (drainCount > 0)
Drain(nextEpoch);
return nextEpoch;
}
/// <summary>
/// Increment current epoch and associate trigger action
/// with the prior epoch
/// </summary>
/// <param name="onDrain">Trigger action</param>
/// <returns></returns>
public int BumpCurrentEpoch(Action onDrain)
{
int PriorEpoch = BumpCurrentEpoch() - 1;
int i = 0, j = 0;
while (true)
{
if (drainList[i].epoch == int.MaxValue)
{
if (Interlocked.CompareExchange(ref drainList[i].epoch, int.MaxValue - 1, int.MaxValue) == int.MaxValue)
{
drainList[i].action = onDrain;
drainList[i].epoch = PriorEpoch;
Interlocked.Increment(ref drainCount);
break;
}
}
else
{
var triggerEpoch = drainList[i].epoch;
if (triggerEpoch <= SafeToReclaimEpoch)
{
if (Interlocked.CompareExchange(ref drainList[i].epoch, int.MaxValue - 1, triggerEpoch) == triggerEpoch)
{
var triggerAction = drainList[i].action;
drainList[i].action = onDrain;
drainList[i].epoch = PriorEpoch;
triggerAction();
break;
}
}
}
if (++i == kDrainListSize)
{
ProtectAndDrain();
i = 0;
if (++j == 500)
{
j = 0;
Debug.WriteLine("Delay finding a free entry in the drain list");
}
}
}
ProtectAndDrain();
return PriorEpoch + 1;
}
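// Usage sketch (illustrative): a thread acquires an entry, refreshes its epoch
// around protected operations, and schedules a cleanup action for when all
// threads have advanced past the current epoch:
//   var epoch = new LightEpoch();
//   epoch.Acquire();
//   epoch.ProtectAndDrain();            // enter protected region
//   epoch.BumpCurrentEpoch(() => { /* safe to reclaim resources here */ });
//   epoch.Release();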
/// <summary>
/// Looks at all threads and returns the latest safe epoch
/// </summary>
/// <param name="currentEpoch">Current epoch</param>
/// <returns>Safe epoch</returns>
private int ComputeNewSafeToReclaimEpoch(int currentEpoch)
{
int oldestOngoingCall = currentEpoch;
for (int index = 1; index <= kTableSize; ++index)
{
int entry_epoch = (*(tableAligned + index)).localCurrentEpoch;
if (0 != entry_epoch)
{
if (entry_epoch < oldestOngoingCall)
{
oldestOngoingCall = entry_epoch;
}
}
}
// The latest safe epoch is the one just before
// the earliest unsafe epoch.
SafeToReclaimEpoch = oldestOngoingCall - 1;
return SafeToReclaimEpoch;
}
/// <summary>
/// Reserve entry for thread. This method relies on the fact that no
/// thread will ever have ID 0.
/// </summary>
/// <param name="startIndex">Start index</param>
/// <param name="threadId">Thread id</param>
/// <returns>Reserved entry</returns>
private static int ReserveEntry(int startIndex, int threadId)
{
int current_iteration = 0;
for (; ; )
{
// Reserve an entry in the table.
for (int i = 0; i < kTableSize; ++i)
{
int index_to_test = 1 + ((startIndex + i) & (kTableSize - 1));
if (0 == (threadIndexAligned + index_to_test)->threadId)
{
bool success =
(0 == Interlocked.CompareExchange(
ref (threadIndexAligned+index_to_test)->threadId,
threadId, 0));
if (success)
{
return (int)index_to_test;
}
}
++current_iteration;
}
if (current_iteration > (kTableSize * 10))
{
throw new Exception("Unable to reserve an epoch entry, try increasing the epoch table size (kTableSize)");
}
}
}
/// <summary>
/// Allocate a new entry in epoch table. This is called
/// once for a thread.
/// </summary>
/// <returns>Reserved entry</returns>
private static int ReserveEntryForThread()
{
if (threadId == 0) // run once per thread for performance
{
// For portability (to run on non-Windows platforms)
threadId = Environment.OSVersion.Platform == PlatformID.Win32NT ? (int)Native32.GetCurrentThreadId() : Thread.CurrentThread.ManagedThreadId;
}
int startIndex = Utility.Murmur3(threadId);
return ReserveEntry(startIndex, threadId);
}
/// <summary>
/// Epoch table entry (cache line size).
/// </summary>
[StructLayout(LayoutKind.Explicit, Size = Constants.kCacheLineBytes)]
private struct Entry
{
/// <summary>
/// Thread-local value of epoch
/// </summary>
[FieldOffset(0)]
public int localCurrentEpoch;
/// <summary>
/// ID of thread associated with this entry.
/// </summary>
[FieldOffset(4)]
public int threadId;
[FieldOffset(8)]
public int reentrant;
[FieldOffset(12)]
public fixed int markers[13];
};
private struct EpochActionPair
{
public long epoch;
public Action action;
}
/// <summary>
/// Mechanism for a thread to mark some activity as completed up to
/// some version, and to check whether all active threads have
/// completed the same activity up to that version.
/// </summary>
/// <param name="markerIdx">ID of activity</param>
/// <param name="version">Version</param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool MarkAndCheckIsComplete(int markerIdx, int version)
{
int entry = threadEntryIndex;
if (kInvalidIndex == entry)
{
Debug.WriteLine("New Thread entered during CPR");
Debug.Assert(false);
}
(*(tableAligned + entry)).markers[markerIdx] = version;
// check if all threads have reported complete
for (int index = 1; index <= kTableSize; ++index)
{
int entry_epoch = (*(tableAligned + index)).localCurrentEpoch;
int fc_version = (*(tableAligned + index)).markers[markerIdx];
if (0 != entry_epoch)
{
if (fc_version != version && entry_epoch < int.MaxValue)
{
return false;
}
}
}
return true;
}
}
}

@ -0,0 +1,95 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 1591
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace FASTER.core
{
/// <summary>
/// AddressInfo struct
/// </summary>
[StructLayout(LayoutKind.Explicit, Size = 8)]
public unsafe struct AddressInfo
{
private const int kMultiplierBits = 1;
private static readonly int kTotalBits = sizeof(IntPtr) * 8;
private static readonly int kAddressBits = 42*kTotalBits/64;
private static readonly int kSizeBits = kTotalBits - kAddressBits - kMultiplierBits;
private static readonly long kSizeMaskInWord = ((1L << kSizeBits) - 1) << kAddressBits;
private static readonly long kSizeMaskInInteger = (1L << kSizeBits) - 1;
private static readonly long kMultiplierMaskInWord = ((1L << kMultiplierBits) - 1) << (kAddressBits + kSizeBits);
private const long kMultiplierMaskInInteger = (1L << kMultiplierBits) - 1;
private static readonly long kAddressMask = (1L << kAddressBits) - 1;
[FieldOffset(0)]
private IntPtr word;
public static void WriteInfo(AddressInfo* info, long address, long size)
{
info->word = default(IntPtr);
info->Address = address;
info->Size = size;
}
public static string ToString(AddressInfo* info)
{
return "RecordHeader Word = " + info->word;
}
public long Size
{
get
{
int multiplier = (int)((((long)word & kMultiplierMaskInWord) >> (kAddressBits + kSizeBits)) & kMultiplierMaskInInteger);
return (multiplier == 0 ? 512 : 1<<20)*((((long)word & kSizeMaskInWord) >> kAddressBits) & kSizeMaskInInteger);
}
set
{
int multiplier = 0;
int val = (int)(value >> 9);
if ((value & ((1<<9)-1)) != 0) val++;
if (val >= (1 << kSizeBits))
{
val = (int)(value >> 20);
if ((value & ((1<<20) - 1)) != 0) val++;
multiplier = 1;
if (val >= (1 << kSizeBits))
{
throw new Exception("Unsupported object size: " + value);
}
}
var _word = (long)word;
_word &= ~kSizeMaskInWord;
_word &= ~kMultiplierMaskInWord;
_word |= (val & kSizeMaskInInteger) << kAddressBits;
_word |= (multiplier & kMultiplierMaskInInteger) << (kAddressBits + kSizeBits);
word = (IntPtr)_word;
}
}
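// Worked example (illustrative): on 64-bit, kSizeBits = 21, so sizes below
// 2^21 * 512 bytes are stored at 512-byte granularity. Setting Size = 1000
// stores val = 2 (1000 rounded up to two sectors) with multiplier 0, and the
// getter decodes it as 512 * 2 = 1024. Larger values switch to multiplier 1
// and 1 MB granularity.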
public long Address
{
get
{
return (long)word & kAddressMask;
}
set
{
var _word = (long)word;
_word &= ~kAddressMask;
_word |= (value & kAddressMask);
word = (IntPtr)_word;
if (value != Address)
{
throw new Exception("Overflow in AddressInfo" + ((kAddressBits < 64) ? " - consider running the program in x64 mode for larger address space support" : ""));
}
}
}
}
}

@ -0,0 +1,48 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
namespace FASTER.core
{
/// <summary>
/// Checkpoint type
/// </summary>
public enum CheckpointType
{
/// <summary>
/// Take separate snapshot of in-memory portion of log (default)
/// </summary>
Snapshot,
/// <summary>
/// Flush current log (move read-only to tail)
/// (enables incremental checkpointing, but log grows faster)
/// </summary>
FoldOver
}
/// <summary>
/// Checkpoint-related settings
/// </summary>
public class CheckpointSettings
{
/// <summary>
/// Checkpoint manager
/// </summary>
public ICheckpointManager CheckpointManager = null;
/// <summary>
/// Type of checkpoint
/// </summary>
public CheckpointType CheckPointType = CheckpointType.Snapshot;
/// <summary>
/// Use specified directory for storing and retrieving checkpoints
/// This is a shortcut to providing the following:
/// CheckpointSettings.CheckpointManager = new LocalCheckpointManager(CheckpointDir)
/// </summary>
public string CheckpointDir = null;
}
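// Usage sketch (illustrative; directory name is hypothetical):
//   var settings = new CheckpointSettings
//   {
//       CheckpointDir = "checkpoints",
//       CheckPointType = CheckpointType.FoldOver
//   };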
}

@ -0,0 +1,479 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
using System.Threading;
namespace FASTER.core
{
internal enum OperationType
{
READ,
RMW,
UPSERT,
INSERT,
DELETE
}
internal enum OperationStatus
{
SUCCESS,
NOTFOUND,
RETRY_NOW,
RETRY_LATER,
RECORD_ON_DISK,
SUCCESS_UNMARK,
CPR_SHIFT_DETECTED,
CPR_PENDING_DETECTED
}
internal class SerializedFasterExecutionContext
{
public int version;
public long serialNum;
public Guid guid;
public void Write(StreamWriter writer)
{
writer.WriteLine(version);
writer.WriteLine(guid);
writer.WriteLine(serialNum);
}
public void Load(StreamReader reader)
{
string value = reader.ReadLine();
version = int.Parse(value);
value = reader.ReadLine();
guid = Guid.Parse(value);
value = reader.ReadLine();
serialNum = long.Parse(value);
}
}
public unsafe partial class FasterKV<Key, Value, Input, Output, Context, Functions> : FasterBase, IFasterKV<Key, Value, Input, Output, Context>
where Key : new()
where Value : new()
where Functions : IFunctions<Key, Value, Input, Output, Context>
{
internal struct PendingContext
{
// User provided information
public OperationType type;
public IHeapContainer<Key> key;
public IHeapContainer<Value> value;
public Input input;
public Output output;
public Context userContext;
// Some additional information about the previous attempt
public long id;
public int version;
public long logicalAddress;
public long serialNum;
public HashBucketEntry entry;
public void Dispose()
{
key?.Dispose();
value?.Dispose();
}
}
internal class FasterExecutionContext : SerializedFasterExecutionContext
{
public Phase phase;
public bool[] markers;
public long totalPending;
public Queue<PendingContext> retryRequests;
public Dictionary<long, PendingContext> ioPendingRequests;
public BlockingCollection<AsyncIOContext<Key, Value>> readyResponses;
}
}
/// <summary>
/// Recovery info for hybrid log
/// </summary>
public struct HybridLogRecoveryInfo
{
/// <summary>
/// Guid
/// </summary>
public Guid guid;
/// <summary>
/// Use snapshot file
/// </summary>
public int useSnapshotFile;
/// <summary>
/// Version
/// </summary>
public int version;
/// <summary>
/// Number of threads
/// </summary>
public int numThreads;
/// <summary>
/// Flushed logical address
/// </summary>
public long flushedLogicalAddress;
/// <summary>
/// Start logical address
/// </summary>
public long startLogicalAddress;
/// <summary>
/// Final logical address
/// </summary>
public long finalLogicalAddress;
/// <summary>
/// Head address
/// </summary>
public long headAddress;
/// <summary>
/// Begin address
/// </summary>
public long beginAddress;
/// <summary>
/// Guid array
/// </summary>
public Guid[] guids;
/// <summary>
/// Tokens per guid restored during Continue
/// </summary>
public ConcurrentDictionary<Guid, long> continueTokens;
/// <summary>
/// Tokens per guid created during Checkpoint
/// </summary>
public ConcurrentDictionary<Guid, long> checkpointTokens;
/// <summary>
/// Object log segment offsets
/// </summary>
public long[] objectLogSegmentOffsets;
/// <summary>
/// Initialize
/// </summary>
/// <param name="token"></param>
/// <param name="_version"></param>
public void Initialize(Guid token, int _version)
{
guid = token;
useSnapshotFile = 0;
version = _version;
numThreads = 0;
flushedLogicalAddress = 0;
startLogicalAddress = 0;
finalLogicalAddress = 0;
headAddress = 0;
guids = new Guid[LightEpoch.kTableSize + 1];
continueTokens = new ConcurrentDictionary<Guid, long>();
checkpointTokens = new ConcurrentDictionary<Guid, long>();
objectLogSegmentOffsets = null;
}
/// <summary>
/// Initialize from stream
/// </summary>
/// <param name="reader"></param>
public void Initialize(StreamReader reader)
{
guids = new Guid[LightEpoch.kTableSize + 1];
continueTokens = new ConcurrentDictionary<Guid, long>();
string value = reader.ReadLine();
guid = Guid.Parse(value);
value = reader.ReadLine();
useSnapshotFile = int.Parse(value);
value = reader.ReadLine();
version = int.Parse(value);
value = reader.ReadLine();
flushedLogicalAddress = long.Parse(value);
value = reader.ReadLine();
startLogicalAddress = long.Parse(value);
value = reader.ReadLine();
finalLogicalAddress = long.Parse(value);
value = reader.ReadLine();
headAddress = long.Parse(value);
value = reader.ReadLine();
beginAddress = long.Parse(value);
value = reader.ReadLine();
numThreads = int.Parse(value);
for (int i = 0; i < numThreads; i++)
{
value = reader.ReadLine();
guids[i] = Guid.Parse(value);
value = reader.ReadLine();
var serialno = long.Parse(value);
continueTokens.TryAdd(guids[i], serialno);
}
// Read object log segment offsets
value = reader.ReadLine();
var numSegments = int.Parse(value);
if (numSegments > 0)
{
objectLogSegmentOffsets = new long[numSegments];
for (int i = 0; i < numSegments; i++)
{
value = reader.ReadLine();
objectLogSegmentOffsets[i] = long.Parse(value);
}
}
}
/// <summary>
/// Recover info from token
/// </summary>
/// <param name="token"></param>
/// <param name="checkpointManager"></param>
/// <returns></returns>
internal void Recover(Guid token, ICheckpointManager checkpointManager)
{
var metadata = checkpointManager.GetLogCommitMetadata(token);
if (metadata == null)
throw new Exception("Invalid log commit metadata for ID " + token.ToString());
Initialize(new StreamReader(new MemoryStream(metadata)));
}
/// <summary>
/// Reset
/// </summary>
public void Reset()
{
Initialize(default(Guid), -1);
}
/// <summary>
/// Write info to byte array
/// </summary>
public byte[] ToByteArray()
{
using (var ms = new MemoryStream())
{
using (StreamWriter writer = new StreamWriter(ms))
{
writer.WriteLine(guid);
writer.WriteLine(useSnapshotFile);
writer.WriteLine(version);
writer.WriteLine(flushedLogicalAddress);
writer.WriteLine(startLogicalAddress);
writer.WriteLine(finalLogicalAddress);
writer.WriteLine(headAddress);
writer.WriteLine(beginAddress);
writer.WriteLine(numThreads);
for (int i = 0; i < numThreads; i++)
{
writer.WriteLine(guids[i]);
writer.WriteLine(checkpointTokens[guids[i]]);
}
// Write object log segment offsets
writer.WriteLine(objectLogSegmentOffsets == null ? 0 : objectLogSegmentOffsets.Length);
if (objectLogSegmentOffsets != null)
{
for (int i = 0; i < objectLogSegmentOffsets.Length; i++)
{
writer.WriteLine(objectLogSegmentOffsets[i]);
}
}
}
return ms.ToArray();
}
}
/// <summary>
/// Print checkpoint info for debugging purposes
/// </summary>
public void DebugPrint()
{
Debug.WriteLine("******** HybridLog Checkpoint Info for {0} ********", guid);
Debug.WriteLine("Version: {0}", version);
Debug.WriteLine("Is Snapshot?: {0}", useSnapshotFile == 1);
Debug.WriteLine("Flushed LogicalAddress: {0}", flushedLogicalAddress);
Debug.WriteLine("Start Logical Address: {0}", startLogicalAddress);
Debug.WriteLine("Final Logical Address: {0}", finalLogicalAddress);
Debug.WriteLine("Head Address: {0}", headAddress);
Debug.WriteLine("Begin Address: {0}", beginAddress);
Debug.WriteLine("Num sessions recovered: {0}", numThreads);
Debug.WriteLine("Recovered sessions: ");
foreach (var sessionInfo in continueTokens)
{
Debug.WriteLine("{0}: {1}", sessionInfo.Key, sessionInfo.Value);
}
}
}
internal struct HybridLogCheckpointInfo
{
public HybridLogRecoveryInfo info;
public IDevice snapshotFileDevice;
public IDevice snapshotFileObjectLogDevice;
public CountdownEvent flushed;
public long started;
public void Initialize(Guid token, int _version, ICheckpointManager checkpointManager)
{
info.Initialize(token, _version);
started = 0;
checkpointManager.InitializeLogCheckpoint(token);
}
public void Recover(Guid token, ICheckpointManager checkpointManager)
{
info.Recover(token, checkpointManager);
started = 0;
}
public void Reset()
{
started = 0;
flushed = null;
info.Reset();
if (snapshotFileDevice != null) snapshotFileDevice.Close();
if (snapshotFileObjectLogDevice != null) snapshotFileObjectLogDevice.Close();
}
}
internal struct IndexRecoveryInfo
{
public Guid token;
public long table_size;
public ulong num_ht_bytes;
public ulong num_ofb_bytes;
public int num_buckets;
public long startLogicalAddress;
public long finalLogicalAddress;
public void Initialize(Guid token, long _size)
{
this.token = token;
table_size = _size;
num_ht_bytes = 0;
num_ofb_bytes = 0;
startLogicalAddress = 0;
finalLogicalAddress = 0;
num_buckets = 0;
}
public void Initialize(StreamReader reader)
{
string value = reader.ReadLine();
token = Guid.Parse(value);
value = reader.ReadLine();
table_size = long.Parse(value);
value = reader.ReadLine();
num_ht_bytes = ulong.Parse(value);
value = reader.ReadLine();
num_ofb_bytes = ulong.Parse(value);
value = reader.ReadLine();
num_buckets = int.Parse(value);
value = reader.ReadLine();
startLogicalAddress = long.Parse(value);
value = reader.ReadLine();
finalLogicalAddress = long.Parse(value);
}
public void Recover(Guid guid, ICheckpointManager checkpointManager)
{
var metadata = checkpointManager.GetIndexCommitMetadata(guid);
if (metadata == null)
throw new Exception("Invalid index commit metadata for ID " + guid.ToString());
Initialize(new StreamReader(new MemoryStream(metadata)));
}
public byte[] ToByteArray()
{
using (var ms = new MemoryStream())
{
using (var writer = new StreamWriter(ms))
{
writer.WriteLine(token);
writer.WriteLine(table_size);
writer.WriteLine(num_ht_bytes);
writer.WriteLine(num_ofb_bytes);
writer.WriteLine(num_buckets);
writer.WriteLine(startLogicalAddress);
writer.WriteLine(finalLogicalAddress);
}
return ms.ToArray();
}
}
public void DebugPrint()
{
Debug.WriteLine("******** Index Checkpoint Info for {0} ********", token);
Debug.WriteLine("Table Size: {0}", table_size);
Debug.WriteLine("Main Table Size (in GB): {0}", ((double)num_ht_bytes) / 1000.0 / 1000.0 / 1000.0);
Debug.WriteLine("Overflow Table Size (in GB): {0}", ((double)num_ofb_bytes) / 1000.0 / 1000.0 / 1000.0);
Debug.WriteLine("Num Buckets: {0}", num_buckets);
Debug.WriteLine("Start Logical Address: {0}", startLogicalAddress);
Debug.WriteLine("Final Logical Address: {0}", finalLogicalAddress);
}
public void Reset()
{
token = default(Guid);
table_size = 0;
num_ht_bytes = 0;
num_ofb_bytes = 0;
num_buckets = 0;
startLogicalAddress = 0;
finalLogicalAddress = 0;
}
}
internal struct IndexCheckpointInfo
{
public IndexRecoveryInfo info;
public IDevice main_ht_device;
public void Initialize(Guid token, long _size, ICheckpointManager checkpointManager)
{
info.Initialize(token, _size);
checkpointManager.InitializeIndexCheckpoint(token);
main_ht_device = checkpointManager.GetIndexDevice(token);
}
public void Recover(Guid token, ICheckpointManager checkpointManager)
{
info.Recover(token, checkpointManager);
}
public void Reset()
{
info.Reset();
main_ht_device.Close();
}
}
}
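A round-trip sketch (illustrative, not part of this commit): the recovery metadata above is plain line-oriented text, so it can be serialized and parsed back through a MemoryStream. Assumes using System.IO in addition to the imports already in this file.

var info = default(HybridLogRecoveryInfo);
info.Initialize(Guid.NewGuid(), _version: 1);
byte[] metadata = info.ToByteArray();

var restored = default(HybridLogRecoveryInfo);
restored.Initialize(new StreamReader(new MemoryStream(metadata)));
// restored.guid == info.guid and restored.version == 1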

@ -0,0 +1,72 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Runtime.CompilerServices;
namespace FASTER.core
{
/// <summary>
/// Heap container to store keys and values when they go pending
/// </summary>
/// <typeparam name="T"></typeparam>
public interface IHeapContainer<T>
{
/// <summary>
/// Get object
/// </summary>
/// <returns></returns>
ref T Get();
/// <summary>
/// Dispose container
/// </summary>
void Dispose();
}
/// <summary>
/// Heap container for standard C# objects (non-variable-length)
/// </summary>
/// <typeparam name="T"></typeparam>
internal class StandardHeapContainer<T> : IHeapContainer<T>
{
private T obj;
public StandardHeapContainer(ref T obj)
{
this.obj = obj;
}
public ref T Get() => ref obj;
public void Dispose() { }
}
/// <summary>
/// Heap container for variable length structs
/// </summary>
/// <typeparam name="T"></typeparam>
internal class VarLenHeapContainer<T> : IHeapContainer<T>
{
private SectorAlignedMemory mem;
public unsafe VarLenHeapContainer(ref T obj, IVariableLengthStruct<T> varLenStruct, SectorAlignedBufferPool pool)
{
var len = varLenStruct.GetLength(ref obj);
mem = pool.Get(len);
Buffer.MemoryCopy(Unsafe.AsPointer(ref obj), mem.GetValidPointer(), len, len);
}
public unsafe ref T Get()
{
return ref Unsafe.AsRef<T>(mem.GetValidPointer());
}
public void Dispose()
{
mem.Return();
}
}
}

@ -0,0 +1,185 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
namespace FASTER.core
{
/// <summary>
/// Configuration settings for serializing objects
/// </summary>
/// <typeparam name="Key"></typeparam>
/// <typeparam name="Value"></typeparam>
public class SerializerSettings<Key, Value>
{
/// <summary>
/// Key serializer
/// </summary>
public Func<IObjectSerializer<Key>> keySerializer;
/// <summary>
/// Value serializer
/// </summary>
public Func<IObjectSerializer<Value>> valueSerializer;
}
/// <summary>
/// Interface for variable length in-place objects
/// modeled as structs, in FASTER
/// </summary>
/// <typeparam name="T"></typeparam>
public interface IVariableLengthStruct<T>
{
/// <summary>
/// Actual length of object
/// </summary>
/// <param name="t"></param>
/// <returns></returns>
int GetLength(ref T t);
/// <summary>
/// Average length of objects, make sure this includes the object
/// header needed to compute the actual object length
/// </summary>
/// <returns></returns>
int GetAverageLength();
/// <summary>
/// Initial length, when populating for RMW from given input
/// </summary>
/// <typeparam name="Input"></typeparam>
/// <param name="input"></param>
/// <returns></returns>
int GetInitialLength<Input>(ref Input input);
}
/// <summary>
/// Length specification for fixed size (normal) structs
/// </summary>
/// <typeparam name="T"></typeparam>
public struct FixedLengthStruct<T> : IVariableLengthStruct<T>
{
private static readonly int size = Utility.GetSize(default(T));
/// <summary>
/// Get average length
/// </summary>
/// <returns></returns>
public int GetAverageLength()
{
return size;
}
/// <summary>
/// Get initial length
/// </summary>
/// <typeparam name="Input"></typeparam>
/// <param name="input"></param>
/// <returns></returns>
public int GetInitialLength<Input>(ref Input input)
{
return size;
}
/// <summary>
/// Get length
/// </summary>
/// <param name="t"></param>
/// <returns></returns>
public int GetLength(ref T t)
{
return size;
}
}
/// <summary>
/// Settings for variable length keys and values
/// </summary>
/// <typeparam name="Key"></typeparam>
/// <typeparam name="Value"></typeparam>
public class VariableLengthStructSettings<Key, Value>
{
/// <summary>
/// Key length
/// </summary>
public IVariableLengthStruct<Key> keyLength;
/// <summary>
/// Value length
/// </summary>
public IVariableLengthStruct<Value> valueLength;
}
/// <summary>
/// Configuration settings for hybrid log
/// </summary>
public class LogSettings
{
/// <summary>
/// Device used for main hybrid log
/// </summary>
public IDevice LogDevice = new NullDevice();
/// <summary>
/// Device used for serialized heap objects in hybrid log
/// </summary>
public IDevice ObjectLogDevice = new NullDevice();
/// <summary>
/// Size of a page, in bits
/// </summary>
public int PageSizeBits = 25;
/// <summary>
/// Size of a segment (group of pages), in bits
/// </summary>
public int SegmentSizeBits = 30;
/// <summary>
/// Total size of in-memory part of log, in bits
/// </summary>
public int MemorySizeBits = 34;
/// <summary>
/// Fraction of log marked as mutable (in-place updates)
/// </summary>
public double MutableFraction = 0.9;
/// <summary>
/// Copy reads to tail of log
/// </summary>
public bool CopyReadsToTail = false;
/// <summary>
/// Settings for optional read cache
/// Overrides the "copy reads to tail" setting
/// </summary>
public ReadCacheSettings ReadCacheSettings = null;
}
/// <summary>
/// Configuration settings for the read cache
/// </summary>
public class ReadCacheSettings
{
/// <summary>
/// Size of a segment (group of pages), in bits
/// </summary>
public int PageSizeBits = 25;
/// <summary>
/// Total size of in-memory part of log, in bits
/// </summary>
public int MemorySizeBits = 34;
/// <summary>
/// Fraction of log head (in memory) used for second chance
/// copy to tail. This is (1 - MutableFraction) for the
/// underlying log
/// </summary>
public double SecondChanceFraction = 0.1;
}
}
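A configuration sketch (illustrative, not part of this commit): wiring the settings above to a real device. Devices.CreateLogDevice is assumed available in FASTER.core as the helper for opening a log file; the path is a made-up assumption.

var logSettings = new LogSettings
{
    LogDevice = Devices.CreateLogDevice("hlog.log"),
    PageSizeBits = 25,        // 32 MB pages
    SegmentSizeBits = 30,     // 1 GB segments
    MemorySizeBits = 34,      // 16 GB in-memory region
    MutableFraction = 0.9,    // 90% of memory allows in-place updates
    ReadCacheSettings = new ReadCacheSettings { MemorySizeBits = 30 }
};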

@ -0,0 +1,243 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 1591
//#define RECORD_INFO_WITH_PIN_COUNT
using System;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
namespace FASTER.core
{
#if RECORD_INFO_WITH_PIN_COUNT
[StructLayout(LayoutKind.Explicit, Size = 12)]
#else
[StructLayout(LayoutKind.Explicit, Size = 8)]
#endif
public unsafe struct RecordInfo
{
public const int kFinalBitOffset = 48;
public const int kTombstoneBitOffset = 49;
public const int kInvalidBitOffset = 50;
public const int kVersionBits = 13;
public const int kVersionShiftInWord = 51;
public const long kVersionMaskInWord = ((1L << kVersionBits) - 1) << kVersionShiftInWord;
public const long kVersionMaskInInteger = (1L << kVersionBits) - 1;
public const long kPreviousAddressMask = (1L << 48) - 1;
public const long kFinalBitMask = (1L << kFinalBitOffset);
public const long kTombstoneMask = (1L << kTombstoneBitOffset);
public const long kInvalidBitMask = (1L << kInvalidBitOffset);
#if RECORD_INFO_WITH_PIN_COUNT
public const int kTotalSizeInBytes = sizeof(long) + sizeof(int);
public const int kTotalBits = kTotalSizeInBytes * 8;
[FieldOffset(0)]
private long word;
[FieldOffset(sizeof(long))]
private int access_data;
public static void WriteInfo(RecordInfo* info, int checkpointVersion, bool final, bool tombstone, bool invalidBit, long previousAddress)
{
info->word = default(long);
info->Final = final;
info->Tombstone = tombstone;
info->Invalid = invalidBit;
info->PreviousAddress = previousAddress;
info->Version = checkpointVersion;
info->access_data = 0;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryPin()
{
return Interlocked.Increment(ref access_data) > 0;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryMarkReadOnly()
{
return Interlocked.CompareExchange(ref access_data, int.MinValue, 0) == 0;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void MarkReadOnly()
{
var found_value = Interlocked.CompareExchange(ref access_data, int.MinValue, 0);
if (found_value != 0)
{
int num_iterations = 1000;
Thread.SpinWait(num_iterations);
while (Interlocked.CompareExchange(ref access_data, int.MinValue, 0) != 0)
{
Thread.SpinWait(num_iterations);
num_iterations <<= 1;
}
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Unpin()
{
Interlocked.Decrement(ref access_data);
}
#else
public const int kTotalSizeInBytes = sizeof(long);
public const int kTotalBits = kTotalSizeInBytes * 8;
[FieldOffset(0)]
private long word;
public static void WriteInfo(ref RecordInfo info, int checkpointVersion, bool final, bool tombstone, bool invalidBit, long previousAddress)
{
info.word = default(long);
info.Final = final;
info.Tombstone = tombstone;
info.Invalid = invalidBit;
info.PreviousAddress = previousAddress;
info.Version = checkpointVersion;
}
public static string ToString(RecordInfo* info)
{
return "RecordHeader Word = " + info->word;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryPin()
{
throw new InvalidOperationException();
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryMarkReadOnly()
{
throw new InvalidOperationException();
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void MarkReadOnly()
{
throw new InvalidOperationException();
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Unpin()
{
throw new InvalidOperationException();
}
#endif
public bool IsNull()
{
return word == 0;
}
public bool Tombstone
{
get
{
return (word & kTombstoneMask) > 0;
}
set
{
if (value)
{
word |= kTombstoneMask;
}
else
{
word &= ~kTombstoneMask;
}
}
}
public bool Final
{
get
{
return (word & kFinalBitMask) > 0;
}
set
{
if (value)
{
word |= kFinalBitMask;
}
else
{
word &= ~kFinalBitMask;
}
}
}
public bool Invalid
{
get
{
return !((word & kInvalidBitMask) > 0);
}
set
{
if (value)
{
word &= ~kInvalidBitMask;
}
else
{
word |= kInvalidBitMask;
}
}
}
public int Version
{
get
{
return (int)(((word & kVersionMaskInWord) >> kVersionShiftInWord) & kVersionMaskInInteger);
}
set
{
word &= ~kVersionMaskInWord;
word |= ((value & kVersionMaskInInteger) << kVersionShiftInWord);
}
}
public long PreviousAddress
{
get
{
return (word & kPreviousAddressMask);
}
set
{
word &= ~kPreviousAddressMask;
word |= (value & kPreviousAddressMask);
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetLength()
{
return kTotalSizeInBytes;
}
}
}
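A worked packing sketch (illustrative, not part of this commit): the 64-bit word stores the previous address in bits 0-47, the final/tombstone/invalid flags in bits 48-50, and a 13-bit version in bits 51-63. Note the invalid flag is stored inverted, so invalidBit: false sets bit 50.

RecordInfo header = default(RecordInfo);
RecordInfo.WriteInfo(ref header, checkpointVersion: 3, final: false,
                     tombstone: false, invalidBit: false, previousAddress: 0x1000);
// header.Version == 3, header.PreviousAddress == 0x1000,
// header.Tombstone == false, header.Invalid == false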

@ -0,0 +1,72 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
using System;
namespace FASTER.core
{
/// <summary>
/// Log subscription extensions
/// </summary>
public static class Extensions
{
/// <summary>
/// Create observable of log records
/// </summary>
/// <typeparam name="Key"></typeparam>
/// <typeparam name="Value"></typeparam>
/// <param name="source"></param>
/// <returns></returns>
public static IObservable<Record<Key, Value>> ToRecordObservable<Key, Value>(this IObservable<IFasterScanIterator<Key, Value>> source)
{
return new RecordObservable<Key, Value>(source);
}
internal class RecordObservable<Key, Value> : IObservable<Record<Key, Value>>
{
IObservable<IFasterScanIterator<Key, Value>> o;
public RecordObservable(IObservable<IFasterScanIterator<Key, Value>> o)
{
this.o = o;
}
public IDisposable Subscribe(IObserver<Record<Key, Value>> observer)
{
return o.Subscribe(new RecordObserver<Key, Value>(observer));
}
}
internal class RecordObserver<Key, Value> : IObserver<IFasterScanIterator<Key, Value>>
{
private IObserver<Record<Key, Value>> observer;
public RecordObserver(IObserver<Record<Key, Value>> observer)
{
this.observer = observer;
}
public void OnCompleted()
{
observer.OnCompleted();
}
public void OnError(Exception error)
{
observer.OnError(error);
}
public void OnNext(IFasterScanIterator<Key, Value> v)
{
while (v.GetNext(out RecordInfo info, out Key key, out Value value))
{
observer.OnNext(new Record<Key, Value> { info = info, key = key, value = value });
}
v.Dispose();
}
}
}
}
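A subscription sketch (illustrative, not part of this commit): a hand-rolled observer that prints every record, assuming a long/long store exposing an IObservable&lt;IFasterScanIterator&lt;long, long&gt;&gt; named scanSource, and assuming Record's public info/key/value fields as used above. No Rx dependency is needed.

class PrintObserver : IObserver<Record<long, long>>
{
    public void OnCompleted() { }
    public void OnError(Exception error) { }
    public void OnNext(Record<long, long> r)
        => Console.WriteLine($"{r.key} -> {r.value}");
}

// Usage: scanSource.ToRecordObservable().Subscribe(new PrintObserver());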

@ -0,0 +1,488 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
using System;
using System.Collections.Concurrent;
using System.Runtime.CompilerServices;
namespace FASTER.core
{
public unsafe partial class FasterKV<Key, Value, Input, Output, Context, Functions> : FasterBase, IFasterKV<Key, Value, Input, Output, Context>
where Key : new()
where Value : new()
where Functions : IFunctions<Key, Value, Input, Output, Context>
{
private readonly Functions functions;
private readonly AllocatorBase<Key, Value> hlog;
private readonly AllocatorBase<Key, Value> readcache;
private readonly IFasterEqualityComparer<Key> comparer;
private readonly bool UseReadCache = false;
private readonly bool CopyReadsToTail = false;
private readonly bool FoldOverSnapshot = false;
private readonly int sectorSize;
private readonly bool WriteDefaultOnDelete = false;
/// <summary>
/// Number of used entries in hash index
/// </summary>
public long EntryCount => GetEntryCount();
/// <summary>
/// Size of index in #cache lines (64 bytes each)
/// </summary>
public long IndexSize => state[resizeInfo.version].size;
/// <summary>
/// Comparer used by FASTER
/// </summary>
public IFasterEqualityComparer<Key> Comparer => comparer;
/// <summary>
/// Hybrid log used by this FASTER instance
/// </summary>
public LogAccessor<Key, Value, Input, Output, Context> Log { get; }
/// <summary>
/// Read cache used by this FASTER instance
/// </summary>
public LogAccessor<Key, Value, Input, Output, Context> ReadCache { get; }
private enum CheckpointType
{
INDEX_ONLY,
HYBRID_LOG_ONLY,
FULL,
NONE
}
private CheckpointType _checkpointType;
private Guid _indexCheckpointToken;
private Guid _hybridLogCheckpointToken;
private SystemState _systemState;
private HybridLogCheckpointInfo _hybridLogCheckpoint;
private ConcurrentDictionary<Guid, long> _recoveredSessions;
private FastThreadLocal<FasterExecutionContext> prevThreadCtx;
private FastThreadLocal<FasterExecutionContext> threadCtx;
/// <summary>
/// Create FASTER instance
/// </summary>
/// <param name="size">Size of core index (#cache lines)</param>
/// <param name="comparer">FASTER equality comparer for key</param>
/// <param name="variableLengthStructSettings"></param>
/// <param name="functions">Callback functions</param>
/// <param name="logSettings">Log settings</param>
/// <param name="checkpointSettings">Checkpoint settings</param>
/// <param name="serializerSettings">Serializer settings</param>
public FasterKV(long size, Functions functions, LogSettings logSettings, CheckpointSettings checkpointSettings = null, SerializerSettings<Key, Value> serializerSettings = null, IFasterEqualityComparer<Key> comparer = null, VariableLengthStructSettings<Key, Value> variableLengthStructSettings = null)
{
threadCtx = new FastThreadLocal<FasterExecutionContext>();
prevThreadCtx = new FastThreadLocal<FasterExecutionContext>();
if (comparer != null)
this.comparer = comparer;
else
{
if (typeof(IFasterEqualityComparer<Key>).IsAssignableFrom(typeof(Key)))
{
this.comparer = new Key() as IFasterEqualityComparer<Key>;
}
else
{
Console.WriteLine("***WARNING*** Creating default FASTER key equality comparer based on potentially slow EqualityComparer<Key>.Default. To avoid this, provide a comparer (IFasterEqualityComparer<Key>) as an argument to FASTER's constructor, or make Key implement the interface IFasterEqualityComparer<Key>");
this.comparer = FasterEqualityComparer<Key>.Default;
}
}
if (checkpointSettings == null)
checkpointSettings = new CheckpointSettings();
if (checkpointSettings.CheckpointDir != null && checkpointSettings.CheckpointManager != null)
throw new Exception("Specify either CheckpointManager or CheckpointDir for CheckpointSettings, not both");
checkpointManager = checkpointSettings.CheckpointManager ?? new LocalCheckpointManager(checkpointSettings.CheckpointDir ?? "");
FoldOverSnapshot = checkpointSettings.CheckPointType == core.CheckpointType.FoldOver;
CopyReadsToTail = logSettings.CopyReadsToTail;
this.functions = functions;
if (logSettings.ReadCacheSettings != null)
{
CopyReadsToTail = false;
UseReadCache = true;
}
if (Utility.IsBlittable<Key>() && Utility.IsBlittable<Value>())
{
if (variableLengthStructSettings != null)
{
hlog = new VariableLengthBlittableAllocator<Key, Value>(logSettings, variableLengthStructSettings, this.comparer, null, epoch);
Log = new LogAccessor<Key, Value, Input, Output, Context>(this, hlog);
if (UseReadCache)
{
readcache = new VariableLengthBlittableAllocator<Key, Value>(
new LogSettings
{
PageSizeBits = logSettings.ReadCacheSettings.PageSizeBits,
MemorySizeBits = logSettings.ReadCacheSettings.MemorySizeBits,
SegmentSizeBits = logSettings.ReadCacheSettings.MemorySizeBits,
MutableFraction = 1 - logSettings.ReadCacheSettings.SecondChanceFraction
}, variableLengthStructSettings, this.comparer, ReadCacheEvict, epoch);
readcache.Initialize();
ReadCache = new LogAccessor<Key, Value, Input, Output, Context>(this, readcache);
}
}
else
{
hlog = new BlittableAllocator<Key, Value>(logSettings, this.comparer, null, epoch);
Log = new LogAccessor<Key, Value, Input, Output, Context>(this, hlog);
if (UseReadCache)
{
readcache = new BlittableAllocator<Key, Value>(
new LogSettings
{
PageSizeBits = logSettings.ReadCacheSettings.PageSizeBits,
MemorySizeBits = logSettings.ReadCacheSettings.MemorySizeBits,
SegmentSizeBits = logSettings.ReadCacheSettings.MemorySizeBits,
MutableFraction = 1 - logSettings.ReadCacheSettings.SecondChanceFraction
}, this.comparer, ReadCacheEvict, epoch);
readcache.Initialize();
ReadCache = new LogAccessor<Key, Value, Input, Output, Context>(this, readcache);
}
}
}
else
{
WriteDefaultOnDelete = true;
hlog = new GenericAllocator<Key, Value>(logSettings, serializerSettings, this.comparer, null, epoch);
Log = new LogAccessor<Key, Value, Input, Output, Context>(this, hlog);
if (UseReadCache)
{
readcache = new GenericAllocator<Key, Value>(
new LogSettings
{
PageSizeBits = logSettings.ReadCacheSettings.PageSizeBits,
MemorySizeBits = logSettings.ReadCacheSettings.MemorySizeBits,
SegmentSizeBits = logSettings.ReadCacheSettings.MemorySizeBits,
MutableFraction = 1 - logSettings.ReadCacheSettings.SecondChanceFraction
}, serializerSettings, this.comparer, ReadCacheEvict, epoch);
readcache.Initialize();
ReadCache = new LogAccessor<Key, Value, Input, Output, Context>(this, readcache);
}
}
hlog.Initialize();
sectorSize = (int)logSettings.LogDevice.SectorSize;
Initialize(size, sectorSize);
_systemState = default(SystemState);
_systemState.phase = Phase.REST;
_systemState.version = 1;
_checkpointType = CheckpointType.HYBRID_LOG_ONLY;
}
/// <summary>
/// Take full checkpoint
/// </summary>
/// <param name="token"></param>
/// <returns></returns>
public bool TakeFullCheckpoint(out Guid token)
{
var success = InternalTakeCheckpoint(CheckpointType.FULL);
if (success)
{
token = _indexCheckpointToken;
}
else
{
token = default(Guid);
}
return success;
}
/// <summary>
/// Take index checkpoint
/// </summary>
/// <param name="token"></param>
/// <returns></returns>
public bool TakeIndexCheckpoint(out Guid token)
{
var success = InternalTakeCheckpoint(CheckpointType.INDEX_ONLY);
if (success)
{
token = _indexCheckpointToken;
}
else
{
token = default(Guid);
}
return success;
}
/// <summary>
/// Take hybrid log checkpoint
/// </summary>
/// <param name="token"></param>
/// <returns></returns>
public bool TakeHybridLogCheckpoint(out Guid token)
{
var success = InternalTakeCheckpoint(CheckpointType.HYBRID_LOG_ONLY);
if (success)
{
token = _hybridLogCheckpointToken;
}
else
{
token = default(Guid);
}
return success;
}
/// <summary>
/// Recover from the latest checkpoints
/// </summary>
public void Recover()
{
InternalRecoverFromLatestCheckpoints();
}
/// <summary>
/// Recover
/// </summary>
/// <param name="fullCheckpointToken"></param>
public void Recover(Guid fullCheckpointToken)
{
InternalRecover(fullCheckpointToken, fullCheckpointToken);
}
/// <summary>
/// Recover
/// </summary>
/// <param name="indexCheckpointToken"></param>
/// <param name="hybridLogCheckpointToken"></param>
public void Recover(Guid indexCheckpointToken, Guid hybridLogCheckpointToken)
{
InternalRecover(indexCheckpointToken, hybridLogCheckpointToken);
}
/// <summary>
/// Start session with FASTER - call once per thread before using FASTER
/// </summary>
/// <returns></returns>
public Guid StartSession()
{
return InternalAcquire();
}
/// <summary>
/// Continue session with FASTER
/// </summary>
/// <param name="guid"></param>
/// <returns></returns>
public long ContinueSession(Guid guid)
{
return InternalContinue(guid);
}
/// <summary>
/// Stop session with FASTER
/// </summary>
public void StopSession()
{
InternalRelease();
}
/// <summary>
/// Refresh epoch (release memory pins)
/// </summary>
public void Refresh()
{
InternalRefresh();
}
/// <summary>
/// Complete outstanding pending operations
/// </summary>
/// <param name="wait"></param>
/// <returns></returns>
public bool CompletePending(bool wait = false)
{
return InternalCompletePending(wait);
}
/// <summary>
/// Complete the ongoing checkpoint (if any)
/// </summary>
/// <param name="wait"></param>
/// <returns></returns>
public bool CompleteCheckpoint(bool wait = false)
{
if (threadCtx == null)
{
// the thread does not have an active session,
// so we can wait until the system state becomes REST
do
{
if (_systemState.phase == Phase.REST)
{
return true;
}
} while (wait);
}
else
{
// the thread has an active session, so we need to
// repeatedly complete pending operations and refresh
// (done inside CompletePending) for the checkpoint to proceed
do
{
CompletePending();
if (_systemState.phase == Phase.REST)
{
CompletePending();
return true;
}
} while (wait);
}
return false;
}
/// <summary>
/// Read
/// </summary>
/// <param name="key"></param>
/// <param name="input"></param>
/// <param name="output"></param>
/// <param name="userContext"></param>
/// <param name="monotonicSerialNum"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public Status Read(ref Key key, ref Input input, ref Output output, Context userContext, long monotonicSerialNum)
{
var context = default(PendingContext);
var internalStatus = InternalRead(ref key, ref input, ref output, ref userContext, ref context);
var status = default(Status);
if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND)
{
status = (Status)internalStatus;
}
else
{
status = HandleOperationStatus(threadCtx.Value, context, internalStatus);
}
threadCtx.Value.serialNum = monotonicSerialNum;
return status;
}
/// <summary>
/// Upsert
/// </summary>
/// <param name="key"></param>
/// <param name="desiredValue"></param>
/// <param name="userContext"></param>
/// <param name="monotonicSerialNum"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public Status Upsert(ref Key key, ref Value desiredValue, Context userContext, long monotonicSerialNum)
{
var context = default(PendingContext);
var internalStatus = InternalUpsert(ref key, ref desiredValue, ref userContext, ref context);
var status = default(Status);
if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND)
{
status = (Status)internalStatus;
}
else
{
status = HandleOperationStatus(threadCtx.Value, context, internalStatus);
}
threadCtx.Value.serialNum = monotonicSerialNum;
return status;
}
/// <summary>
/// Read-modify-write
/// </summary>
/// <param name="key"></param>
/// <param name="input"></param>
/// <param name="userContext"></param>
/// <param name="monotonicSerialNum"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public Status RMW(ref Key key, ref Input input, Context userContext, long monotonicSerialNum)
{
var context = default(PendingContext);
var internalStatus = InternalRMW(ref key, ref input, ref userContext, ref context);
var status = default(Status);
if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND)
{
status = (Status)internalStatus;
}
else
{
status = HandleOperationStatus(threadCtx.Value, context, internalStatus);
}
threadCtx.Value.serialNum = monotonicSerialNum;
return status;
}
/// <summary>
/// Delete entry (use tombstone if necessary).
/// The hash entry is removed as a best effort (if the key is in memory and at
/// the head of the hash chain).
/// The value is set to null (using ConcurrentWrite) if it is in the mutable region.
/// </summary>
/// <param name="key"></param>
/// <param name="userContext"></param>
/// <param name="monotonicSerialNum"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public Status Delete(ref Key key, Context userContext, long monotonicSerialNum)
{
var context = default(PendingContext);
var internalStatus = InternalDelete(ref key, ref userContext, ref context);
var status = default(Status);
if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND)
{
status = (Status)internalStatus;
}
threadCtx.Value.serialNum = monotonicSerialNum;
return status;
}
/// <summary>
/// Grow the hash index
/// </summary>
/// <returns></returns>
public bool GrowIndex()
{
return InternalGrowIndex();
}
/// <summary>
/// Dispose FASTER instance
/// </summary>
public void Dispose()
{
base.Free();
threadCtx.Dispose();
prevThreadCtx.Dispose();
hlog.Dispose();
}
}
}
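An end-to-end sketch (illustrative, not part of this commit): creating a store, running one upsert and one read, then taking a checkpoint. MyFunctions is an assumed IFunctions&lt;long, long, long, long, Empty&gt; implementation, Empty is assumed to be FASTER's built-in empty context struct, and Devices.CreateLogDevice and the file path are likewise assumptions.

var store = new FasterKV<long, long, long, long, Empty, MyFunctions>(
    size: 1L << 20,
    functions: new MyFunctions(),
    logSettings: new LogSettings { LogDevice = Devices.CreateLogDevice("hlog.log") });

store.StartSession();   // once per thread, before any operation
long key = 1, value = 42, input = 0, output = 0;
store.Upsert(ref key, ref value, Empty.Default, monotonicSerialNum: 1);
store.Read(ref key, ref input, ref output, Empty.Default, monotonicSerialNum: 2);
store.CompletePending(wait: true);

if (store.TakeFullCheckpoint(out Guid token))
    store.CompleteCheckpoint(wait: true);

store.StopSession();
store.Dispose();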

@ -0,0 +1,794 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
namespace FASTER.core
{
internal static class Constants
{
/// Size of cache line in bytes
public const int kCacheLineBytes = 64;
public const bool kFineGrainedHandoverRecord = false;
public const bool kFineGrainedHandoverBucket = true;
/// Number of bits per bucket (assuming 8-byte entries to fill a cacheline)
public const int kBitsPerBucket = 3;
/// Number of entries per bucket
public const int kEntriesPerBucket = 1 << kBitsPerBucket;
// Position of fields in hash-table entry
public const int kTentativeBitShift = 63;
public const long kTentativeBitMask = (1L << kTentativeBitShift);
public const int kPendingBitShift = 62;
public const long kPendingBitMask = (1L << kPendingBitShift);
public const int kReadCacheBitShift = 47;
public const long kReadCacheBitMask = (1L << kReadCacheBitShift);
public const int kTagSize = 14;
public const int kTagShift = 62 - kTagSize;
public const long kTagMask = (1L << kTagSize) - 1;
public const long kTagPositionMask = (kTagMask << kTagShift);
public const long kAddressMask = (1L << 48) - 1;
// Position of tag in hash value (offset is always in the least significant bits)
public const int kHashTagShift = 64 - kTagSize;
/// Invalid entry slot
public const int kInvalidEntrySlot = kEntriesPerBucket;
/// Location of the special bucket entry
public const long kOverflowBucketIndex = kEntriesPerBucket - 1;
/// Invalid value in the hash table
public const long kInvalidEntry = 0;
/// Number of times to retry a compare-and-swap before failure
public const long kRetryThreshold = 1000000;
/// Number of merge/split chunks.
public const int kNumMergeChunkBits = 8;
public const int kNumMergeChunks = 1 << kNumMergeChunkBits;
// Size of chunks for garbage collection
public const int kSizeofChunkBits = 14;
public const int kSizeofChunk = 1 << kSizeofChunkBits;
public const long kInvalidAddress = 0;
public const long kTempInvalidAddress = 1;
public const int kFirstValidAddress = 64;
}
[StructLayout(LayoutKind.Explicit, Size = Constants.kEntriesPerBucket * 8)]
internal unsafe struct HashBucket
{
public const long kPinConstant = (1L << 48);
public const long kExclusiveLatchBitMask = (1L << 63);
[FieldOffset(0)]
public fixed long bucket_entries[Constants.kEntriesPerBucket];
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TryAcquireSharedLatch(HashBucket* bucket)
{
return Interlocked.Add(ref bucket->bucket_entries[Constants.kOverflowBucketIndex],
kPinConstant) > 0;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void ReleaseSharedLatch(HashBucket* bucket)
{
Interlocked.Add(ref bucket->bucket_entries[Constants.kOverflowBucketIndex],
-kPinConstant);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool TryAcquireExclusiveLatch(HashBucket* bucket)
{
long expected_word = bucket->bucket_entries[Constants.kOverflowBucketIndex];
if ((expected_word & ~Constants.kAddressMask) == 0)
{
long desired_word = expected_word | kExclusiveLatchBitMask;
var found_word = Interlocked.CompareExchange(
ref bucket->bucket_entries[Constants.kOverflowBucketIndex],
desired_word,
expected_word);
return found_word == expected_word;
}
return false;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void ReleaseExclusiveLatch(HashBucket* bucket)
{
long expected_word = bucket->bucket_entries[Constants.kOverflowBucketIndex];
long desired_word = expected_word & Constants.kAddressMask;
var found_word = Interlocked.Exchange(
ref bucket->bucket_entries[Constants.kOverflowBucketIndex],
desired_word);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool NoSharedLatches(HashBucket* bucket)
{
long word = bucket->bucket_entries[Constants.kOverflowBucketIndex];
return (word & ~Constants.kAddressMask) == 0;
}
}
// Long value layout: [1-bit tentative][1-bit pending][14-bit tag][48-bit address]
// Physical little endian memory layout: [48-bit address][14-bit tag][1-bit pending][1-bit tentative]
[StructLayout(LayoutKind.Explicit, Size = 8)]
internal struct HashBucketEntry
{
[FieldOffset(0)]
public long word;
public long Address
{
get
{
return word & Constants.kAddressMask;
}
set
{
word &= ~Constants.kAddressMask;
word |= (value & Constants.kAddressMask);
}
}
public ushort Tag
{
get
{
return (ushort)((word & Constants.kTagPositionMask) >> Constants.kTagShift);
}
set
{
word &= ~Constants.kTagPositionMask;
word |= ((long)value << Constants.kTagShift);
}
}
public bool Pending
{
get
{
return (word & Constants.kPendingBitMask) != 0;
}
set
{
if (value)
{
word |= Constants.kPendingBitMask;
}
else
{
word &= ~Constants.kPendingBitMask;
}
}
}
public bool Tentative
{
get
{
return (word & Constants.kTentativeBitMask) != 0;
}
set
{
if (value)
{
word |= Constants.kTentativeBitMask;
}
else
{
word &= ~Constants.kTentativeBitMask;
}
}
}
public bool ReadCache
{
get
{
return (word & Constants.kReadCacheBitMask) != 0;
}
set
{
if (value)
{
word |= Constants.kReadCacheBitMask;
}
else
{
word &= ~Constants.kReadCacheBitMask;
}
}
}
}
internal unsafe struct InternalHashTable
{
public long size;
public long size_mask;
public int size_bits;
public HashBucket[] tableRaw;
public GCHandle tableHandle;
public HashBucket* tableAligned;
}
public unsafe partial class FasterBase
{
// Initial size of the table
internal long minTableSize = 16;
// Allocator for the hash buckets
internal readonly MallocFixedPageSize<HashBucket> overflowBucketsAllocator;
// An array of size two, that contains the old and new versions of the hash-table
internal InternalHashTable[] state = new InternalHashTable[2];
// Array used to denote if a specific chunk is merged or not
internal long[] splitStatus;
// Used as an atomic counter to check if resizing is complete
internal long numPendingChunksToBeSplit;
// Epoch set for resizing
internal int resizeEpoch;
internal LightEpoch epoch;
internal ResizeInfo resizeInfo;
/// <summary>
/// Constructor
/// </summary>
public FasterBase()
{
epoch = new LightEpoch();
overflowBucketsAllocator = new MallocFixedPageSize<HashBucket>(false, epoch);
}
internal Status Free()
{
Free(0);
Free(1);
epoch.Dispose();
overflowBucketsAllocator.Dispose();
return Status.OK;
}
private Status Free(int version)
{
if (state[version].tableHandle.IsAllocated)
state[version].tableHandle.Free();
state[version].tableRaw = null;
state[version].tableAligned = null;
return Status.OK;
}
/// <summary>
/// Initialize
/// </summary>
/// <param name="size"></param>
/// <param name="sector_size"></param>
public void Initialize(long size, int sector_size)
{
if (!Utility.IsPowerOfTwo(size))
{
throw new ArgumentException("Size {0} is not a power of 2");
}
if (!Utility.Is32Bit(size))
{
throw new ArgumentException("Size {0} is not 32-bit");
}
minTableSize = size;
resizeInfo = default(ResizeInfo);
resizeInfo.status = ResizeOperationStatus.DONE;
resizeInfo.version = 0;
Initialize(resizeInfo.version, size, sector_size);
}
/// <summary>
/// Initialize
/// </summary>
/// <param name="version"></param>
/// <param name="size"></param>
/// <param name="sector_size"></param>
protected void Initialize(int version, long size, int sector_size)
{
long size_bytes = size * sizeof(HashBucket);
long aligned_size_bytes = sector_size +
((size_bytes + (sector_size - 1)) & ~(sector_size - 1));
// Over-allocate and align the table to the cache line
state[version].size = size;
state[version].size_mask = size - 1;
state[version].size_bits = Utility.GetLogBase2((int)size);
state[version].tableRaw = new HashBucket[aligned_size_bytes / Constants.kCacheLineBytes];
state[version].tableHandle = GCHandle.Alloc(state[version].tableRaw, GCHandleType.Pinned);
long sectorAlignedPointer = ((long)state[version].tableHandle.AddrOfPinnedObject() + (sector_size - 1)) & ~(sector_size - 1);
state[version].tableAligned = (HashBucket*)sectorAlignedPointer;
}
/// <summary>
/// A helper function that is used to find the slot corresponding to a
/// key in the specified version of the hash table
/// </summary>
/// <param name="hash"></param>
/// <param name="tag"></param>
/// <param name="bucket"></param>
/// <param name="slot"></param>
/// <param name="entry"></param>
/// <returns>true if such a slot exists, false otherwise</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool FindTag(long hash, ushort tag, ref HashBucket* bucket, ref int slot, ref HashBucketEntry entry)
{
var target_entry_word = default(long);
var entry_slot_bucket = default(HashBucket*);
var version = resizeInfo.version;
var masked_entry_word = hash & state[version].size_mask;
bucket = state[version].tableAligned + masked_entry_word;
slot = Constants.kInvalidEntrySlot;
do
{
// Search through the bucket looking for our key. Last entry is reserved
// for the overflow pointer.
for (int index = 0; index < Constants.kOverflowBucketIndex; ++index)
{
target_entry_word = *(((long*)bucket) + index);
if (0 == target_entry_word)
{
continue;
}
entry.word = target_entry_word;
if (tag == entry.Tag)
{
slot = index;
if (!entry.Tentative)
return true;
}
}
target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex) & Constants.kAddressMask;
// Go to next bucket in the chain
if (target_entry_word == 0)
{
entry = default(HashBucketEntry);
return false;
}
bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word);
} while (true);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void FindOrCreateTag(long hash, ushort tag, ref HashBucket* bucket, ref int slot, ref HashBucketEntry entry, long BeginAddress)
{
var version = resizeInfo.version;
var masked_entry_word = hash & state[version].size_mask;
while (true)
{
bucket = state[version].tableAligned + masked_entry_word;
slot = Constants.kInvalidEntrySlot;
if (FindTagOrFreeInternal(hash, tag, ref bucket, ref slot, ref entry, BeginAddress))
return;
// Install tentative tag in free slot
entry = default(HashBucketEntry);
entry.Tag = tag;
entry.Address = Constants.kTempInvalidAddress;
entry.Pending = false;
entry.Tentative = true;
if (0 == Interlocked.CompareExchange(ref bucket->bucket_entries[slot], entry.word, 0))
{
var orig_bucket = state[version].tableAligned + masked_entry_word;
var orig_slot = Constants.kInvalidEntrySlot;
if (FindOtherTagMaybeTentativeInternal(hash, tag, ref orig_bucket, ref orig_slot, bucket, slot))
{
bucket->bucket_entries[slot] = 0;
}
else
{
entry.Tentative = false;
*((long*)bucket + slot) = entry.word;
break;
}
}
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool FindTagInternal(long hash, ushort tag, ref HashBucket* bucket, ref int slot)
{
var target_entry_word = default(long);
var entry_slot_bucket = default(HashBucket*);
do
{
// Search through the bucket looking for our key. Last entry is reserved
// for the overflow pointer.
for (int index = 0; index < Constants.kOverflowBucketIndex; ++index)
{
target_entry_word = *(((long*)bucket) + index);
if (0 == target_entry_word)
{
continue;
}
HashBucketEntry entry = default(HashBucketEntry);
entry.word = target_entry_word;
if (tag == entry.Tag)
{
slot = index;
if (!entry.Tentative)
return true;
}
}
target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex) & Constants.kAddressMask;
// Go to next bucket in the chain
if (target_entry_word == 0)
{
return false;
}
bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word);
} while (true);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool FindTagMaybeTentativeInternal(long hash, ushort tag, ref HashBucket* bucket, ref int slot)
{
var target_entry_word = default(long);
var entry_slot_bucket = default(HashBucket*);
do
{
// Search through the bucket looking for our key. Last entry is reserved
// for the overflow pointer.
for (int index = 0; index < Constants.kOverflowBucketIndex; ++index)
{
target_entry_word = *(((long*)bucket) + index);
if (0 == target_entry_word)
{
continue;
}
HashBucketEntry entry = default(HashBucketEntry);
entry.word = target_entry_word;
if (tag == entry.Tag)
{
slot = index;
return true;
}
}
target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex) & Constants.kAddressMask;
// Go to next bucket in the chain
if (target_entry_word == 0)
{
return false;
}
bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word);
} while (true);
}
/// <summary>
/// Find an existing entry (non-tentative).
/// If not found, return a pointer to some empty slot.
/// </summary>
/// <param name="hash"></param>
/// <param name="tag"></param>
/// <param name="bucket"></param>
/// <param name="slot"></param>
/// <param name="entry"></param>
/// <param name="BeginAddress"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool FindTagOrFreeInternal(long hash, ushort tag, ref HashBucket* bucket, ref int slot, ref HashBucketEntry entry, long BeginAddress = 0)
{
var target_entry_word = default(long);
var recordExists = false;
var entry_slot_bucket = default(HashBucket*);
do
{
// Search through the bucket looking for our key. Last entry is reserved
// for the overflow pointer.
for (int index = 0; index < Constants.kOverflowBucketIndex; ++index)
{
target_entry_word = *(((long*)bucket) + index);
if (0 == target_entry_word)
{
if (slot == Constants.kInvalidEntrySlot)
{
slot = index;
entry_slot_bucket = bucket;
}
continue;
}
entry.word = target_entry_word;
if (entry.Address < BeginAddress && entry.Address != Constants.kTempInvalidAddress)
{
if (entry.word == Interlocked.CompareExchange(ref bucket->bucket_entries[index], Constants.kInvalidAddress, target_entry_word))
{
if (slot == Constants.kInvalidEntrySlot)
{
slot = index;
entry_slot_bucket = bucket;
}
continue;
}
}
if (tag == entry.Tag && !entry.Tentative)
{
slot = index;
recordExists = true;
return recordExists;
}
}
target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex);
// Go to next bucket in the chain
if ((target_entry_word & Constants.kAddressMask) == 0)
{
if (slot == Constants.kInvalidEntrySlot)
{
// Allocate new bucket
var logicalBucketAddress = overflowBucketsAllocator.Allocate();
var physicalBucketAddress = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(logicalBucketAddress);
long compare_word = target_entry_word;
target_entry_word = logicalBucketAddress;
target_entry_word |= (compare_word & ~Constants.kAddressMask);
long result_word = Interlocked.CompareExchange(
ref bucket->bucket_entries[Constants.kOverflowBucketIndex],
target_entry_word,
compare_word);
if (compare_word != result_word)
{
// Install failed, undo allocation; use the winner's entry
overflowBucketsAllocator.FreeAtEpoch(logicalBucketAddress, 0);
target_entry_word = result_word;
}
else
{
// Install succeeded
bucket = physicalBucketAddress;
slot = 0;
entry = default(HashBucketEntry);
return recordExists;
}
}
else
{
if (!recordExists)
{
bucket = entry_slot_bucket;
}
entry = default(HashBucketEntry);
break;
}
}
bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word & Constants.kAddressMask);
} while (true);
return recordExists;
}
/// <summary>
/// Find an existing entry (tentative or otherwise) other than the specified "exception" slot.
/// If not found, return false. Does not return a free slot.
/// </summary>
/// <param name="hash"></param>
/// <param name="tag"></param>
/// <param name="bucket"></param>
/// <param name="slot"></param>
/// <param name="except_bucket"></param>
/// <param name="except_entry_slot"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool FindOtherTagMaybeTentativeInternal(long hash, ushort tag, ref HashBucket* bucket, ref int slot, HashBucket* except_bucket, int except_entry_slot)
{
var target_entry_word = default(long);
var entry_slot_bucket = default(HashBucket*);
do
{
// Search through the bucket looking for our key. Last entry is reserved
// for the overflow pointer.
for (int index = 0; index < Constants.kOverflowBucketIndex; ++index)
{
target_entry_word = *(((long*)bucket) + index);
if (0 == target_entry_word)
{
continue;
}
HashBucketEntry entry = default(HashBucketEntry);
entry.word = target_entry_word;
if (tag == entry.Tag)
{
if ((except_entry_slot == index) && (except_bucket == bucket))
continue;
slot = index;
return true;
}
}
target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex) & Constants.kAddressMask;
// Go to next bucket in the chain
if (target_entry_word == 0)
{
return false;
}
bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word);
} while (true);
}
/// <summary>
/// Helper function used to update the slot atomically with the
/// new offset value using the CAS operation
/// </summary>
/// <param name="bucket"></param>
/// <param name="entrySlot"></param>
/// <param name="expected"></param>
/// <param name="desired"></param>
/// <param name="found"></param>
/// <returns>If atomic update was successful</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal bool UpdateSlot(HashBucket* bucket, int entrySlot, long expected, long desired, out long found)
{
found = Interlocked.CompareExchange(
ref bucket->bucket_entries[entrySlot],
desired,
expected);
return (found == expected);
}
/// <summary>
/// Get the total number of valid entries in the hash table
/// </summary>
/// <returns>Entry count</returns>
protected virtual long GetEntryCount()
{
var version = resizeInfo.version;
var table_size_ = state[version].size;
var ptable_ = state[version].tableAligned;
long total_entry_count = 0;
for (long bucket = 0; bucket < table_size_; ++bucket)
{
HashBucket b = *(ptable_ + bucket);
while (true)
{
for (int bucket_entry = 0; bucket_entry < Constants.kOverflowBucketIndex; ++bucket_entry)
if (0 != b.bucket_entries[bucket_entry])
++total_entry_count;
if (b.bucket_entries[Constants.kOverflowBucketIndex] == 0) break;
b = *((HashBucket*)overflowBucketsAllocator.GetPhysicalAddress((b.bucket_entries[Constants.kOverflowBucketIndex])));
}
}
return total_entry_count;
}
/// <summary>
/// Build a per-bucket entry-count histogram for the given table version
/// </summary>
/// <param name="version"></param>
protected virtual string _DumpDistribution(int version)
{
var table_size_ = state[version].size;
var ptable_ = state[version].tableAligned;
long total_record_count = 0;
Dictionary<int, long> histogram = new Dictionary<int, long>();
for (long bucket = 0; bucket < table_size_; ++bucket)
{
List<int> tags = new List<int>();
int cnt = 0;
HashBucket b = *(ptable_ + bucket);
while (true)
{
for (int bucket_entry = 0; bucket_entry < Constants.kOverflowBucketIndex; ++bucket_entry)
{
if (0 != b.bucket_entries[bucket_entry])
{
var x = default(HashBucketEntry);
x.word = b.bucket_entries[bucket_entry];
if (tags.Contains(x.Tag) && !x.Tentative)
throw new Exception("Duplicate tag found in index");
tags.Add(x.Tag);
++cnt;
++total_record_count;
}
}
if (b.bucket_entries[Constants.kOverflowBucketIndex] == 0) break;
b = *((HashBucket*)overflowBucketsAllocator.GetPhysicalAddress((b.bucket_entries[Constants.kOverflowBucketIndex])));
}
if (!histogram.ContainsKey(cnt)) histogram[cnt] = 0;
histogram[cnt]++;
}
var distribution =
$"Number of hash buckets: {{{table_size_}}}\n" +
$"Total distinct hash-table entry count: {{{total_record_count}}}\n" +
$"Average #entries per hash bucket: {{{total_record_count / (double)table_size_:0.00}}}\n" +
$"Histogram of #entries per bucket:\n";
foreach (var kvp in histogram.OrderBy(e => e.Key))
{
distribution += $" {kvp.Key} : {kvp.Value}\n";
}
return distribution;
}
/// <summary>
/// Dumps the distribution of each non-empty bucket in the hash table.
/// </summary>
public string DumpDistribution()
{
return _DumpDistribution(resizeInfo.version);
}
}
}
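A packing sketch (illustrative, not part of this commit): how a hash-bucket entry word interleaves its fields, per the Constants above. HashBucketEntry and Constants are internal, so this only compiles inside FASTER.core; the address and tag values are arbitrary.

var entry = default(HashBucketEntry);
entry.Address = 0x123456;   // low 48 bits
entry.Tag = 0x2ABC;         // 14-bit tag, bits 48-61
entry.Tentative = true;     // bit 63
// entry.word == (1L << 63) | (0x2ABCL << 48) | 0x123456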

File diff suppressed because it is too large

@ -0,0 +1,364 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
namespace FASTER.core
{
public unsafe partial class FasterKV<Key, Value, Input, Output, Context, Functions> : FasterBase, IFasterKV<Key, Value, Input, Output, Context>
where Key : new()
where Value : new()
where Functions : IFunctions<Key, Value, Input, Output, Context>
{
internal Guid InternalAcquire()
{
epoch.Acquire();
overflowBucketsAllocator.Acquire();
threadCtx.InitializeThread();
prevThreadCtx.InitializeThread();
Phase phase = _systemState.phase;
if (phase != Phase.REST)
{
throw new Exception("Can acquire only in REST phase!");
}
Guid guid = Guid.NewGuid();
InitLocalContext(guid);
InternalRefresh();
return threadCtx.Value.guid;
}
internal long InternalContinue(Guid guid)
{
epoch.Acquire();
overflowBucketsAllocator.Acquire();
threadCtx.InitializeThread();
prevThreadCtx.InitializeThread();
if (_recoveredSessions != null)
{
if (_recoveredSessions.TryGetValue(guid, out long serialNum))
{
// We have recovered the corresponding session.
// Now obtain the session by first locking the rest phase
var currentState = SystemState.Copy(ref _systemState);
if(currentState.phase == Phase.REST)
{
var intermediateState = SystemState.Make(Phase.INTERMEDIATE, currentState.version);
if(MakeTransition(currentState,intermediateState))
{
// No one can change from REST phase
if(_recoveredSessions.TryRemove(guid, out serialNum))
{
// We have atomically removed session details.
// No one else can continue this session
InitLocalContext(guid);
threadCtx.Value.serialNum = serialNum;
InternalRefresh();
}
else
{
// Someone else continued this session
serialNum = -1;
Debug.WriteLine("Session already continued by another thread!");
}
MakeTransition(intermediateState, currentState);
return serialNum;
}
}
// Need to try again when in REST
Debug.WriteLine("Can continue only in REST phase");
return -1;
}
}
Debug.WriteLine("No recovered sessions!");
return -1;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void InternalRefresh()
{
epoch.ProtectAndDrain();
// We check if we are in normal mode
var newPhaseInfo = SystemState.Copy(ref _systemState);
if (threadCtx.Value.phase == Phase.REST && newPhaseInfo.phase == Phase.REST)
{
return;
}
// Moving to non-checkpointing phases
if (newPhaseInfo.phase == Phase.GC || newPhaseInfo.phase == Phase.PREPARE_GROW || newPhaseInfo.phase == Phase.IN_PROGRESS_GROW)
{
threadCtx.Value.phase = newPhaseInfo.phase;
return;
}
HandleCheckpointingPhases();
}
internal void InternalRelease()
{
Debug.Assert(threadCtx.Value.retryRequests.Count == 0 &&
threadCtx.Value.ioPendingRequests.Count == 0);
if (prevThreadCtx.Value != default(FasterExecutionContext))
{
Debug.Assert(prevThreadCtx.Value.retryRequests.Count == 0 &&
prevThreadCtx.Value.ioPendingRequests.Count == 0);
}
Debug.Assert(threadCtx.Value.phase == Phase.REST);
threadCtx.DisposeThread();
prevThreadCtx.DisposeThread();
epoch.Release();
overflowBucketsAllocator.Release();
}
internal void InitLocalContext(Guid token)
{
var ctx =
new FasterExecutionContext
{
phase = Phase.REST,
version = _systemState.version,
markers = new bool[8],
serialNum = 0,
totalPending = 0,
guid = token,
retryRequests = new Queue<PendingContext>(),
readyResponses = new BlockingCollection<AsyncIOContext<Key, Value>>(),
ioPendingRequests = new Dictionary<long, PendingContext>()
};
for(int i = 0; i < 8; i++)
{
ctx.markers[i] = false;
}
threadCtx.Value = ctx;
}
internal bool InternalCompletePending(bool wait = false)
{
do
{
bool done = true;
#region Previous pending requests
if (threadCtx.Value.phase == Phase.IN_PROGRESS
||
threadCtx.Value.phase == Phase.WAIT_PENDING)
{
CompleteIOPendingRequests(prevThreadCtx.Value);
Refresh();
CompleteRetryRequests(prevThreadCtx.Value);
done &= (prevThreadCtx.Value.ioPendingRequests.Count == 0);
done &= (prevThreadCtx.Value.retryRequests.Count == 0);
}
#endregion
if (!(threadCtx.Value.phase == Phase.IN_PROGRESS
||
threadCtx.Value.phase == Phase.WAIT_PENDING))
{
CompleteIOPendingRequests(threadCtx.Value);
}
InternalRefresh();
CompleteRetryRequests(threadCtx.Value);
done &= (threadCtx.Value.ioPendingRequests.Count == 0);
done &= (threadCtx.Value.retryRequests.Count == 0);
if (done)
{
return true;
}
if (wait)
{
// Yield before checking again
Thread.Yield();
}
} while (wait);
return false;
}
internal void CompleteRetryRequests(FasterExecutionContext context)
{
int count = context.retryRequests.Count;
for (int i = 0; i < count; i++)
{
var pendingContext = context.retryRequests.Dequeue();
InternalRetryRequestAndCallback(context, pendingContext);
}
}
internal void CompleteIOPendingRequests(FasterExecutionContext context)
{
if (context.readyResponses.Count == 0) return;
while (context.readyResponses.TryTake(out AsyncIOContext<Key, Value> request))
{
InternalContinuePendingRequestAndCallback(context, request);
}
}
internal void InternalRetryRequestAndCallback(
FasterExecutionContext ctx,
PendingContext pendingContext)
{
var status = default(Status);
var internalStatus = default(OperationStatus);
ref Key key = ref pendingContext.key.Get();
ref Value value = ref pendingContext.value.Get();
#region Entry latch operation
var handleLatches = false;
if ((ctx.version < threadCtx.Value.version) // Thread has already shifted to (v+1)
||
(threadCtx.Value.phase == Phase.PREPARE)) // Thread still in version v, but acquired shared-latch
{
handleLatches = true;
}
#endregion
// Issue retry command
switch (pendingContext.type)
{
case OperationType.RMW:
internalStatus = InternalRetryPendingRMW(ctx, ref pendingContext);
break;
case OperationType.UPSERT:
internalStatus = InternalUpsert(ref key,
ref value,
ref pendingContext.userContext,
ref pendingContext);
break;
case OperationType.DELETE:
internalStatus = InternalDelete(ref key,
ref pendingContext.userContext,
ref pendingContext);
break;
case OperationType.READ:
throw new Exception("Cannot happen!");
}
// Handle operation status
if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND)
{
status = (Status)internalStatus;
}
else
{
status = HandleOperationStatus(ctx, pendingContext, internalStatus);
}
// If done, callback user code.
if (status == Status.OK || status == Status.NOTFOUND)
{
if (handleLatches)
ReleaseSharedLatch(key);
switch (pendingContext.type)
{
case OperationType.RMW:
functions.RMWCompletionCallback(ref key,
ref pendingContext.input,
pendingContext.userContext, status);
break;
case OperationType.UPSERT:
functions.UpsertCompletionCallback(ref key,
ref value,
pendingContext.userContext);
break;
case OperationType.DELETE:
functions.DeleteCompletionCallback(ref key,
pendingContext.userContext);
break;
default:
throw new Exception("Operation type not allowed for retry");
}
}
}
internal void InternalContinuePendingRequestAndCallback(
FasterExecutionContext ctx,
AsyncIOContext<Key, Value> request)
{
var handleLatches = false;
if ((ctx.version < threadCtx.Value.version) // Thread has already shifted to (v+1)
||
(threadCtx.Value.phase == Phase.PREPARE)) // Thread still in version v, but acquired shared-latch
{
handleLatches = true;
}
if (ctx.ioPendingRequests.TryGetValue(request.id, out PendingContext pendingContext))
{
var status = default(Status);
var internalStatus = default(OperationStatus);
ref Key key = ref pendingContext.key.Get();
// Remove from pending dictionary
ctx.ioPendingRequests.Remove(request.id);
// Issue the continue command
if (pendingContext.type == OperationType.READ)
{
internalStatus = InternalContinuePendingRead(ctx, request, ref pendingContext);
}
else
{
internalStatus = InternalContinuePendingRMW(ctx, request, ref pendingContext);
}
request.Dispose();
// Handle operation status
if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND)
{
status = (Status)internalStatus;
}
else
{
status = HandleOperationStatus(ctx, pendingContext, internalStatus);
}
// If done, callback user code
if (status == Status.OK || status == Status.NOTFOUND)
{
if (handleLatches)
ReleaseSharedLatch(key);
if (pendingContext.type == OperationType.READ)
{
functions.ReadCompletionCallback(ref key,
ref pendingContext.input,
ref pendingContext.output,
pendingContext.userContext,
status);
}
else
{
functions.RMWCompletionCallback(ref key,
ref pendingContext.input,
pendingContext.userContext,
status);
}
}
pendingContext.Dispose();
}
}
}
}
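// --- Usage sketch (illustrative, not part of this commit) ---
// The internal acquire/continue/refresh/release methods above back FASTER's
// public session API. A minimal lifecycle sketch, assuming the usual public
// wrappers (StartSession/Refresh/CompletePending/StopSession) delegate to the
// internals shown here; the IFunctions implementation is caller-supplied and
// the device path and store size are placeholders.
using System;
using FASTER.core;

internal static class SessionLifecycleSketch
{
    public static void Run<TFunctions>(TFunctions functions)
        where TFunctions : IFunctions<long, long, long, long, Empty>
    {
        var device = Devices.CreateLogDevice("hlog.log", deleteOnClose: true);
        var store = new FasterKV<long, long, long, long, Empty, TFunctions>(
            1L << 20, functions, new LogSettings { LogDevice = device });

        Guid session = store.StartSession();   // InternalAcquire + InitLocalContext
        long key = 1, value = 42;
        store.Upsert(ref key, ref value, Empty.Default, 1);
        store.Refresh();                       // InternalRefresh: epoch ProtectAndDrain
        store.CompletePending(true);           // InternalCompletePending(wait: true)
        store.StopSession();                   // InternalRelease

        // After recovery, the same logical session can be resumed:
        //   long serialNum = store.ContinueSession(session);  // InternalContinue
        // which returns -1 if the session was not recovered or already continued.
        store.Dispose();
        device.Close();
    }
}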

@ -0,0 +1,347 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
using System;
namespace FASTER.core
{
/// <summary>
/// Wrapper to process log-related commands
/// </summary>
/// <typeparam name="Key"></typeparam>
/// <typeparam name="Value"></typeparam>
/// <typeparam name="Input"></typeparam>
/// <typeparam name="Output"></typeparam>
/// <typeparam name="Context"></typeparam>
public class LogAccessor<Key, Value, Input, Output, Context> : IObservable<IFasterScanIterator<Key, Value>>
where Key : new()
where Value : new()
{
private readonly IFasterKV<Key, Value, Input, Output, Context> fht;
private readonly AllocatorBase<Key, Value> allocator;
/// <summary>
/// Constructor
/// </summary>
/// <param name="fht"></param>
/// <param name="allocator"></param>
public LogAccessor(IFasterKV<Key, Value, Input, Output, Context> fht, AllocatorBase<Key, Value> allocator)
{
this.fht = fht;
this.allocator = allocator;
}
/// <summary>
/// Tail address of log
/// </summary>
public long TailAddress => allocator.GetTailAddress();
/// <summary>
/// Read-only address of log, i.e. boundary between read-only region and mutable region
/// </summary>
public long ReadOnlyAddress => allocator.ReadOnlyAddress;
/// <summary>
/// Safe read-only address of log, i.e. boundary below which all threads are guaranteed to observe the region as read-only
/// </summary>
public long SafeReadOnlyAddress => allocator.SafeReadOnlyAddress;
/// <summary>
/// Head address of log, i.e. beginning of in-memory regions
/// </summary>
public long HeadAddress => allocator.HeadAddress;
/// <summary>
/// Beginning address of log
/// </summary>
public long BeginAddress => allocator.BeginAddress;
/// <summary>
/// Truncate the log until, but not including, untilAddress
/// </summary>
/// <param name="untilAddress"></param>
public void ShiftBeginAddress(long untilAddress)
{
allocator.ShiftBeginAddress(untilAddress);
}
/// <summary>
/// Shift log head address to prune memory footprint of hybrid log
/// </summary>
/// <param name="newHeadAddress">Address to shift head until</param>
/// <param name="wait">Wait to ensure shift is registered (may involve page flushing)</param>
/// <returns>When wait is false, this tells whether the shift to newHeadAddress was successfully registered with FASTER</returns>
public bool ShiftHeadAddress(long newHeadAddress, bool wait)
{
// First shift read-only
ShiftReadOnlyAddress(newHeadAddress, wait);
// Then shift head address
var updatedHeadAddress = allocator.ShiftHeadAddress(newHeadAddress);
return updatedHeadAddress >= newHeadAddress;
}
/// <summary>
/// Subscribe to records (in batches) as they become read-only in the log
/// Currently, we support only one subscriber to the log (easy to extend)
/// Subscriber only receives new log updates from the time of subscription onwards
/// To scan the historical part of the log, use the Scan(...) method
/// </summary>
/// <param name="readOnlyObserver">Observer to which scan iterator is pushed</param>
public IDisposable Subscribe(IObserver<IFasterScanIterator<Key, Value>> readOnlyObserver)
{
allocator.OnReadOnlyObserver = readOnlyObserver;
return new LogSubscribeDisposable(allocator);
}
/// <summary>
/// Wrapper to help dispose the subscription
/// </summary>
class LogSubscribeDisposable : IDisposable
{
private AllocatorBase<Key, Value> allocator;
public LogSubscribeDisposable(AllocatorBase<Key, Value> allocator)
{
this.allocator = allocator;
}
public void Dispose()
{
allocator.OnReadOnlyObserver = null;
}
}
/// <summary>
/// Shift log read-only address
/// </summary>
/// <param name="newReadOnlyAddress">Address to shift read-only until</param>
/// <param name="wait">Wait to ensure shift is complete (may involve page flushing)</param>
public void ShiftReadOnlyAddress(long newReadOnlyAddress, bool wait)
{
allocator.ShiftReadOnlyAddress(newReadOnlyAddress);
// Wait for flush to complete
while (wait && allocator.FlushedUntilAddress < newReadOnlyAddress)
fht.Refresh();
}
/// <summary>
/// Scan the log given address range
/// </summary>
/// <param name="beginAddress"></param>
/// <param name="endAddress"></param>
/// <param name="scanBufferingMode"></param>
/// <returns></returns>
public IFasterScanIterator<Key, Value> Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode = ScanBufferingMode.DoublePageBuffering)
{
return allocator.Scan(beginAddress, endAddress, scanBufferingMode);
}
/// <summary>
/// Flush log until current tail (records are still retained in memory)
/// </summary>
/// <param name="wait">Synchronous wait for operation to complete</param>
public void Flush(bool wait)
{
ShiftReadOnlyAddress(allocator.GetTailAddress(), wait);
}
/// <summary>
/// Flush log and evict all records from memory
/// </summary>
/// <param name="wait">Synchronous wait for operation to complete</param>
/// <returns>When wait is false, this tells whether the full eviction was successfully registered with FASTER</returns>
public bool FlushAndEvict(bool wait)
{
return ShiftHeadAddress(allocator.GetTailAddress(), wait);
}
/// <summary>
/// Delete log entirely from memory. Cannot allocate on the log
/// after this point. This is a synchronous operation.
/// </summary>
public void DisposeFromMemory()
{
// Ensure we have flushed and evicted
FlushAndEvict(true);
// Delete from memory
allocator.DeleteFromMemory();
}
/// <summary>
/// Compact the log until specified address, moving active
/// records to the tail of the log
/// </summary>
/// <param name="untilAddress"></param>
public void Compact(long untilAddress)
{
var variableLengthStructSettings = default(VariableLengthStructSettings<Key, Value>);
if (allocator is VariableLengthBlittableAllocator<Key, Value> varLen)
{
var functions = new LogVariableCompactFunctions(varLen);
variableLengthStructSettings = new VariableLengthStructSettings<Key, Value>
{
keyLength = varLen.KeyLength,
valueLength = varLen.ValueLength,
};
Compact(functions, untilAddress, variableLengthStructSettings);
}
else
{
Compact(new LogCompactFunctions(), untilAddress, null);
}
}
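// Compaction outline (summary of the private Compact<T> below): replay
// [BeginAddress, untilAddress) into a temporary FasterKV honoring tombstones;
// use LogScanForValidity to delete from the temp store any key that appears
// again later in the log; re-insert survivors that are no longer present in
// the main store's memory; finally truncate the log via ShiftBeginAddress.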
private void Compact<T>(T functions, long untilAddress, VariableLengthStructSettings<Key, Value> variableLengthStructSettings)
where T : IFunctions<Key, Value, Input, Output, Context>
{
var originalUntilAddress = untilAddress;
var tempKv = new FasterKV<Key, Value, Input, Output, Context, T>
(fht.IndexSize, functions, new LogSettings(), comparer: fht.Comparer, variableLengthStructSettings: variableLengthStructSettings);
tempKv.StartSession();
int cnt = 0;
using (var iter1 = fht.Log.Scan(fht.Log.BeginAddress, untilAddress))
{
while (iter1.GetNext(out RecordInfo recordInfo))
{
ref var key = ref iter1.GetKey();
ref var value = ref iter1.GetValue();
if (recordInfo.Tombstone)
tempKv.Delete(ref key, default(Context), 0);
else
tempKv.Upsert(ref key, ref value, default(Context), 0);
if (++cnt % 1000 == 0)
{
fht.Refresh();
tempKv.Refresh();
}
}
}
// TODO: Scan until SafeReadOnlyAddress
long scanUntil = untilAddress;
LogScanForValidity(ref untilAddress, ref scanUntil, ref tempKv);
// Make sure key wasn't inserted between SafeReadOnlyAddress and TailAddress
cnt = 0;
using (var iter3 = tempKv.Log.Scan(tempKv.Log.BeginAddress, tempKv.Log.TailAddress))
{
while (iter3.GetNext(out RecordInfo recordInfo))
{
ref var key = ref iter3.GetKey();
ref var value = ref iter3.GetValue();
if (!recordInfo.Tombstone)
{
if (fht.ContainsKeyInMemory(ref key, scanUntil) == Status.NOTFOUND)
fht.Upsert(ref key, ref value, default(Context), 0);
}
if (++cnt % 1000 == 0)
{
fht.Refresh();
tempKv.Refresh();
}
if (scanUntil < fht.Log.SafeReadOnlyAddress)
{
LogScanForValidity(ref untilAddress, ref scanUntil, ref tempKv);
}
}
}
tempKv.StopSession();
tempKv.Dispose();
ShiftBeginAddress(originalUntilAddress);
}
private void LogScanForValidity<T>(ref long untilAddress, ref long scanUntil, ref FasterKV<Key, Value, Input, Output, Context, T> tempKv)
where T : IFunctions<Key, Value, Input, Output, Context>
{
while (scanUntil < fht.Log.SafeReadOnlyAddress)
{
untilAddress = scanUntil;
scanUntil = fht.Log.SafeReadOnlyAddress;
int cnt = 0;
using (var iter2 = fht.Log.Scan(untilAddress, scanUntil))
{
while (iter2.GetNext(out RecordInfo recordInfo))
{
ref var key = ref iter2.GetKey();
ref var value = ref iter2.GetValue();
tempKv.Delete(ref key, default(Context), 0);
if (++cnt % 1000 == 0)
{
fht.Refresh();
tempKv.Refresh();
}
}
}
fht.Refresh();
}
}
private class LogVariableCompactFunctions : IFunctions<Key, Value, Input, Output, Context>
{
private VariableLengthBlittableAllocator<Key, Value> allocator;
public LogVariableCompactFunctions(VariableLengthBlittableAllocator<Key, Value> allocator)
{
this.allocator = allocator;
}
public void CheckpointCompletionCallback(Guid sessionId, long serialNum) { }
public void ConcurrentReader(ref Key key, ref Input input, ref Value value, ref Output dst) { }
public bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst)
{
var srcLength = allocator.ValueLength.GetLength(ref src);
var dstLength = allocator.ValueLength.GetLength(ref dst);
if (srcLength != dstLength)
return false;
allocator.ShallowCopy(ref src, ref dst);
return true;
}
public void CopyUpdater(ref Key key, ref Input input, ref Value oldValue, ref Value newValue) { }
public void InitialUpdater(ref Key key, ref Input input, ref Value value) { }
public bool InPlaceUpdater(ref Key key, ref Input input, ref Value value) => false;
public void ReadCompletionCallback(ref Key key, ref Input input, ref Output output, Context ctx, Status status) { }
public void RMWCompletionCallback(ref Key key, ref Input input, Context ctx, Status status) { }
public void SingleReader(ref Key key, ref Input input, ref Value value, ref Output dst) { }
public void SingleWriter(ref Key key, ref Value src, ref Value dst) { allocator.ShallowCopy(ref src, ref dst); }
public void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx) { }
public void DeleteCompletionCallback(ref Key key, Context ctx) { }
}
private class LogCompactFunctions : IFunctions<Key, Value, Input, Output, Context>
{
public void CheckpointCompletionCallback(Guid sessionId, long serialNum) { }
public void ConcurrentReader(ref Key key, ref Input input, ref Value value, ref Output dst) { }
public bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst) { dst = src; return true; }
public void CopyUpdater(ref Key key, ref Input input, ref Value oldValue, ref Value newValue) { }
public void InitialUpdater(ref Key key, ref Input input, ref Value value) { }
public bool InPlaceUpdater(ref Key key, ref Input input, ref Value value) { return true; }
public void ReadCompletionCallback(ref Key key, ref Input input, ref Output output, Context ctx, Status status) { }
public void RMWCompletionCallback(ref Key key, ref Input input, Context ctx, Status status) { }
public void SingleReader(ref Key key, ref Input input, ref Value value, ref Output dst) { }
public void SingleWriter(ref Key key, ref Value src, ref Value dst) { dst = src; }
public void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx) { }
public void DeleteCompletionCallback(ref Key key, Context ctx) { }
}
}
}
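// --- Usage sketch (illustrative, not part of this commit) ---
// Typical call shapes for the LogAccessor above; 'store.Log' would expose an
// instance of this class, and the address arithmetic is for illustration only.
using FASTER.core;

internal static class LogAccessorSketch
{
    public static void Run<K, V, I, O, C>(LogAccessor<K, V, I, O, C> log)
        where K : new()
        where V : new()
    {
        long tail = log.TailAddress;

        log.Flush(wait: true);              // flush to disk, keep records in memory
        log.FlushAndEvict(wait: true);      // flush and drop records from memory

        // Compact the first half of the log; live records move to the tail
        log.Compact(log.BeginAddress + (tail - log.BeginAddress) / 2);

        // Pull-scan the surviving range
        using (var iter = log.Scan(log.BeginAddress, log.TailAddress))
        {
            while (iter.GetNext(out RecordInfo recordInfo))
            {
                ref K key = ref iter.GetKey();
                ref V value = ref iter.GetValue();
                // process record here
            }
        }
    }
}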

@ -0,0 +1,25 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
using System;
using System.Threading.Tasks;
namespace FASTER.core
{
/// <summary>
/// Exception thrown when commit fails
/// </summary>
public class CommitFailureException : Exception
{
/// <summary>
/// Commit info and next commit task in chain
/// </summary>
public LinkedCommitInfo LinkedCommitInfo { get; private set; }
internal CommitFailureException(LinkedCommitInfo linkedCommitInfo, string message)
: base(message)
=> LinkedCommitInfo = linkedCommitInfo;
}
}
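// --- Usage sketch (illustrative, not part of this commit) ---
// How a caller might observe a failed commit surfaced through this exception;
// 'log' stands for any FasterLog instance.
using System;
using System.Threading.Tasks;
using FASTER.core;

internal static class CommitFailureSketch
{
    public static async Task Run(FasterLog log)
    {
        try
        {
            await log.CommitAsync();
        }
        catch (CommitFailureException e)
        {
            // The failed range and the next commit task ride on the exception
            CommitInfo info = e.LinkedCommitInfo.CommitInfo;
            Console.WriteLine($"Commit [{info.FromAddress}-{info.UntilAddress}] failed, error {info.ErrorCode}");
            await e.LinkedCommitInfo.NextTask; // optionally wait for the next commit
        }
    }
}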

@ -0,0 +1,51 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
using System.Threading.Tasks;
namespace FASTER.core
{
/// <summary>
/// Info contained in task associated with commit
/// </summary>
public struct CommitInfo
{
/// <summary>
/// Begin address
/// </summary>
public long BeginAddress;
/// <summary>
/// From address of commit range
/// </summary>
public long FromAddress;
/// <summary>
/// Until address of commit range
/// </summary>
public long UntilAddress;
/// <summary>
/// Error code (0 = success)
/// </summary>
public uint ErrorCode;
}
/// <summary>
/// Linked list (chain) of commit info
/// </summary>
public struct LinkedCommitInfo
{
/// <summary>
/// Commit info
/// </summary>
public CommitInfo CommitInfo;
/// <summary>
/// Next task in commit chain
/// </summary>
public Task<LinkedCommitInfo> NextTask;
}
}
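// --- Usage sketch (illustrative, not part of this commit) ---
// Walking the commit chain until a target address is covered; this is the
// same pattern FasterLog.CommitAsync uses internally.
using System.Threading.Tasks;
using FASTER.core;

internal static class CommitChainSketch
{
    public static async Task WaitUntilCommitted(Task<LinkedCommitInfo> commitTask, long targetAddress)
    {
        while (true)
        {
            LinkedCommitInfo lci = await commitTask;
            if (lci.CommitInfo.UntilAddress >= targetAddress)
                return;                  // target range covered by this commit
            commitTask = lci.NextTask;   // otherwise follow the chain
        }
    }
}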

@ -0,0 +1,941 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
using System.Threading;
using System.Threading.Tasks;
namespace FASTER.core
{
/// <summary>
/// FASTER log
/// </summary>
public class FasterLog : IDisposable
{
private readonly BlittableAllocator<Empty, byte> allocator;
private readonly LightEpoch epoch;
private readonly ILogCommitManager logCommitManager;
private readonly GetMemory getMemory;
private readonly int headerSize;
private readonly LogChecksumType logChecksum;
private readonly Dictionary<string, long> RecoveredIterators;
private TaskCompletionSource<LinkedCommitInfo> commitTcs
= new TaskCompletionSource<LinkedCommitInfo>(TaskCreationOptions.RunContinuationsAsynchronously);
/// <summary>
/// Beginning address of log
/// </summary>
public long BeginAddress => allocator.BeginAddress;
/// <summary>
/// Tail address of log
/// </summary>
public long TailAddress => allocator.GetTailAddress();
/// <summary>
/// Log flushed until address
/// </summary>
public long FlushedUntilAddress => allocator.FlushedUntilAddress;
/// <summary>
/// Log committed until address
/// </summary>
public long CommittedUntilAddress;
/// <summary>
/// Log committed begin address
/// </summary>
public long CommittedBeginAddress;
/// <summary>
/// Task notifying commit completions
/// </summary>
internal Task<LinkedCommitInfo> CommitTask => commitTcs.Task;
/// <summary>
/// Create new log instance
/// </summary>
/// <param name="logSettings"></param>
public FasterLog(FasterLogSettings logSettings)
{
logCommitManager = logSettings.LogCommitManager ??
new LocalLogCommitManager(logSettings.LogCommitFile ??
logSettings.LogDevice.FileName + ".commit");
// Reserve 8 byte checksum in header if requested
logChecksum = logSettings.LogChecksum;
headerSize = logChecksum == LogChecksumType.PerEntry ? 12 : 4;
getMemory = logSettings.GetMemory;
epoch = new LightEpoch();
CommittedUntilAddress = Constants.kFirstValidAddress;
CommittedBeginAddress = Constants.kFirstValidAddress;
allocator = new BlittableAllocator<Empty, byte>(
logSettings.GetLogSettings(), null,
null, epoch, CommitCallback);
allocator.Initialize();
Restore(out RecoveredIterators);
}
/// <summary>
/// Dispose
/// </summary>
public void Dispose()
{
allocator.Dispose();
epoch.Dispose();
commitTcs.TrySetException(new ObjectDisposedException("Log has been disposed"));
}
#region Enqueue
/// <summary>
/// Enqueue entry to log (in memory) - no guarantee of flush/commit
/// </summary>
/// <param name="entry">Entry to be enqueued to log</param>
/// <returns>Logical address of added entry</returns>
public long Enqueue(byte[] entry)
{
long logicalAddress;
while (!TryEnqueue(entry, out logicalAddress)) ;
return logicalAddress;
}
/// <summary>
/// Enqueue entry to log (in memory) - no guarantee of flush/commit
/// </summary>
/// <param name="entry">Entry to be enqueued to log</param>
/// <returns>Logical address of added entry</returns>
public long Enqueue(ReadOnlySpan<byte> entry)
{
long logicalAddress;
while (!TryEnqueue(entry, out logicalAddress)) ;
return logicalAddress;
}
/// <summary>
/// Enqueue batch of entries to log (in memory) - no guarantee of flush/commit
/// </summary>
/// <param name="readOnlySpanBatch">Batch of entries to be enqueued to log</param>
/// <returns>Logical address of added entry</returns>
public long Enqueue(IReadOnlySpanBatch readOnlySpanBatch)
{
long logicalAddress;
while (!TryEnqueue(readOnlySpanBatch, out logicalAddress)) ;
return logicalAddress;
}
#endregion
#region TryEnqueue
/// <summary>
/// Try to enqueue entry to log (in memory). If it returns true, we are
/// done. If it returns false, we need to retry.
/// </summary>
/// <param name="entry">Entry to be enqueued to log</param>
/// <param name="logicalAddress">Logical address of added entry</param>
/// <returns>Whether the append succeeded</returns>
public unsafe bool TryEnqueue(byte[] entry, out long logicalAddress)
{
logicalAddress = 0;
epoch.Resume();
var length = entry.Length;
logicalAddress = allocator.TryAllocate(headerSize + Align(length));
if (logicalAddress == 0)
{
epoch.Suspend();
return false;
}
var physicalAddress = allocator.GetPhysicalAddress(logicalAddress);
fixed (byte* bp = entry)
Buffer.MemoryCopy(bp, (void*)(headerSize + physicalAddress), length, length);
SetHeader(length, (byte*)physicalAddress);
epoch.Suspend();
return true;
}
/// <summary>
/// Try to append entry to log. If it returns true, we are
/// done. If it returns false, we need to retry.
/// </summary>
/// <param name="entry">Entry to be appended to log</param>
/// <param name="logicalAddress">Logical address of added entry</param>
/// <returns>Whether the append succeeded</returns>
public unsafe bool TryEnqueue(ReadOnlySpan<byte> entry, out long logicalAddress)
{
logicalAddress = 0;
epoch.Resume();
var length = entry.Length;
logicalAddress = allocator.TryAllocate(headerSize + Align(length));
if (logicalAddress == 0)
{
epoch.Suspend();
return false;
}
var physicalAddress = allocator.GetPhysicalAddress(logicalAddress);
fixed (byte* bp = &entry.GetPinnableReference())
Buffer.MemoryCopy(bp, (void*)(headerSize + physicalAddress), length, length);
SetHeader(length, (byte*)physicalAddress);
epoch.Suspend();
return true;
}
/// <summary>
/// Try to enqueue batch of entries as a single atomic unit (to memory). Entire
/// batch needs to fit on one log page.
/// </summary>
/// <param name="readOnlySpanBatch">Batch to be appended to log</param>
/// <param name="logicalAddress">Logical address of first added entry</param>
/// <returns>Whether the append succeeded</returns>
public bool TryEnqueue(IReadOnlySpanBatch readOnlySpanBatch, out long logicalAddress)
{
return TryAppend(readOnlySpanBatch, out logicalAddress, out _);
}
#endregion
#region EnqueueAsync
/// <summary>
/// Enqueue entry to log in memory (async) - completes after entry is
/// appended to memory, NOT committed to storage.
/// </summary>
/// <param name="entry"></param>
/// <returns></returns>
public async ValueTask<long> EnqueueAsync(byte[] entry)
{
long logicalAddress;
while (true)
{
var task = CommitTask;
if (TryEnqueue(entry, out logicalAddress))
break;
if (NeedToWait(CommittedUntilAddress, TailAddress))
{
// Wait for *some* commit - failure can be ignored
try
{
await task;
}
catch { }
}
}
return logicalAddress;
}
/// <summary>
/// Enqueue entry to log in memory (async) - completes after entry is
/// appended to memory, NOT committed to storage.
/// </summary>
/// <param name="entry"></param>
/// <returns></returns>
public async ValueTask<long> EnqueueAsync(ReadOnlyMemory<byte> entry)
{
long logicalAddress;
while (true)
{
var task = CommitTask;
if (TryEnqueue(entry.Span, out logicalAddress))
break;
if (NeedToWait(CommittedUntilAddress, TailAddress))
{
// Wait for *some* commit - failure can be ignored
try
{
await task;
}
catch { }
}
}
return logicalAddress;
}
/// <summary>
/// Enqueue batch of entries to log in memory (async) - completes after entry is
/// appended to memory, NOT committed to storage.
/// </summary>
/// <param name="readOnlySpanBatch"></param>
/// <returns></returns>
public async ValueTask<long> EnqueueAsync(IReadOnlySpanBatch readOnlySpanBatch)
{
long logicalAddress;
while (true)
{
var task = CommitTask;
if (TryEnqueue(readOnlySpanBatch, out logicalAddress))
break;
if (NeedToWait(CommittedUntilAddress, TailAddress))
{
// Wait for *some* commit - failure can be ignored
try
{
await task;
}
catch { }
}
}
return logicalAddress;
}
#endregion
#region WaitForCommit and WaitForCommitAsync
/// <summary>
/// Spin-wait for enqueues, until tail or specified address, to commit to
/// storage. Does NOT itself issue a commit, just waits for commit. So you should
/// ensure that someone else causes the commit to happen.
/// </summary>
/// <param name="untilAddress">Address until which we should wait for commit, default 0 for tail of log</param>
/// <returns></returns>
public void WaitForCommit(long untilAddress = 0)
{
var tailAddress = untilAddress;
if (tailAddress == 0) tailAddress = allocator.GetTailAddress();
while (CommittedUntilAddress < tailAddress) ;
}
/// <summary>
/// Wait for appends (in memory), until tail or specified address, to commit to
/// storage. Does NOT itself issue a commit, just waits for commit. So you should
/// ensure that someone else causes the commit to happen.
/// </summary>
/// <param name="untilAddress">Address until which we should wait for commit, default 0 for tail of log</param>
/// <returns></returns>
public async ValueTask WaitForCommitAsync(long untilAddress = 0)
{
var task = CommitTask;
var tailAddress = untilAddress;
if (tailAddress == 0) tailAddress = allocator.GetTailAddress();
while (true)
{
var linkedCommitInfo = await task;
if (linkedCommitInfo.CommitInfo.UntilAddress < tailAddress)
task = linkedCommitInfo.NextTask;
else
break;
}
}
#endregion
#region Commit
/// <summary>
/// Issue commit request for log (until tail)
/// </summary>
/// <param name="spinWait">If true, spin-wait until commit completes. Otherwise, issue commit and return immediately.</param>
/// <returns></returns>
public void Commit(bool spinWait = false)
{
CommitInternal(spinWait);
}
/// <summary>
/// Async commit log (until tail), completes only when we
/// complete the commit. Throws exception if this or any
/// ongoing commit fails.
/// </summary>
/// <returns></returns>
public async ValueTask CommitAsync()
{
var task = CommitTask;
var tailAddress = CommitInternal();
while (true)
{
var linkedCommitInfo = await task;
if (linkedCommitInfo.CommitInfo.UntilAddress < tailAddress)
task = linkedCommitInfo.NextTask;
else
break;
}
}
/// <summary>
/// Async commit log (until tail), completes only when we
/// complete the commit. Throws exception if any commit
/// from prevCommitTask to current fails.
/// </summary>
/// <returns></returns>
public async ValueTask<Task<LinkedCommitInfo>> CommitAsync(Task<LinkedCommitInfo> prevCommitTask)
{
if (prevCommitTask == null) prevCommitTask = commitTcs.Task;
var tailAddress = CommitInternal();
while (true)
{
var linkedCommitInfo = await prevCommitTask;
if (linkedCommitInfo.CommitInfo.UntilAddress < tailAddress)
prevCommitTask = linkedCommitInfo.NextTask;
else
return linkedCommitInfo.NextTask;
}
}
#endregion
#region EnqueueAndWaitForCommit
/// <summary>
/// Append entry to log - spin-waits until entry is committed to storage.
/// Does NOT itself issue flush!
/// </summary>
/// <param name="entry"></param>
/// <returns></returns>
public long EnqueueAndWaitForCommit(byte[] entry)
{
long logicalAddress;
while (!TryEnqueue(entry, out logicalAddress)) ;
while (CommittedUntilAddress < logicalAddress + 1) ;
return logicalAddress;
}
/// <summary>
/// Append entry to log - spin-waits until entry is committed to storage.
/// Does NOT itself issue flush!
/// </summary>
/// <param name="entry"></param>
/// <returns></returns>
public long EnqueueAndWaitForCommit(ReadOnlySpan<byte> entry)
{
long logicalAddress;
while (!TryEnqueue(entry, out logicalAddress)) ;
while (CommittedUntilAddress < logicalAddress + 1) ;
return logicalAddress;
}
/// <summary>
/// Append batch of entries to log - spin-waits until entry is committed to storage.
/// Does NOT itself issue flush!
/// </summary>
/// <param name="readOnlySpanBatch"></param>
/// <returns></returns>
public long EnqueueAndWaitForCommit(IReadOnlySpanBatch readOnlySpanBatch)
{
long logicalAddress;
while (!TryEnqueue(readOnlySpanBatch, out logicalAddress)) ;
while (CommittedUntilAddress < logicalAddress + 1) ;
return logicalAddress;
}
#endregion
#region EnqueueAndWaitForCommitAsync
/// <summary>
/// Append entry to log (async) - completes after entry is committed to storage.
/// Does NOT itself issue flush!
/// </summary>
/// <param name="entry"></param>
/// <returns></returns>
public async ValueTask<long> EnqueueAndWaitForCommitAsync(byte[] entry)
{
long logicalAddress;
Task<LinkedCommitInfo> task;
// Phase 1: wait for commit to memory
while (true)
{
task = CommitTask;
if (TryEnqueue(entry, out logicalAddress))
break;
if (NeedToWait(CommittedUntilAddress, TailAddress))
{
// Wait for *some* commit - failure can be ignored
try
{
await task;
}
catch { }
}
}
// Phase 2: wait for commit/flush to storage
while (true)
{
LinkedCommitInfo linkedCommitInfo;
try
{
linkedCommitInfo = await task;
}
catch (CommitFailureException e)
{
linkedCommitInfo = e.LinkedCommitInfo;
if (logicalAddress >= linkedCommitInfo.CommitInfo.FromAddress && logicalAddress < linkedCommitInfo.CommitInfo.UntilAddress)
throw;
}
if (linkedCommitInfo.CommitInfo.UntilAddress < logicalAddress + 1)
task = linkedCommitInfo.NextTask;
else
break;
}
return logicalAddress;
}
/// <summary>
/// Append entry to log (async) - completes after entry is committed to storage.
/// Does NOT itself issue flush!
/// </summary>
/// <param name="entry"></param>
/// <returns></returns>
public async ValueTask<long> EnqueueAndWaitForCommitAsync(ReadOnlyMemory<byte> entry)
{
long logicalAddress;
Task<LinkedCommitInfo> task;
// Phase 1: wait for commit to memory
while (true)
{
task = CommitTask;
if (TryEnqueue(entry.Span, out logicalAddress))
break;
if (NeedToWait(CommittedUntilAddress, TailAddress))
{
// Wait for *some* commit - failure can be ignored
try
{
await task;
}
catch { }
}
}
// Phase 2: wait for commit/flush to storage
while (true)
{
LinkedCommitInfo linkedCommitInfo;
try
{
linkedCommitInfo = await task;
}
catch (CommitFailureException e)
{
linkedCommitInfo = e.LinkedCommitInfo;
if (logicalAddress >= linkedCommitInfo.CommitInfo.FromAddress && logicalAddress < linkedCommitInfo.CommitInfo.UntilAddress)
throw;
}
if (linkedCommitInfo.CommitInfo.UntilAddress < logicalAddress + 1)
task = linkedCommitInfo.NextTask;
else
break;
}
return logicalAddress;
}
/// <summary>
/// Append batch of entries to log (async) - completes after batch is committed to storage.
/// Does NOT itself issue flush!
/// </summary>
/// <param name="readOnlySpanBatch"></param>
/// <returns></returns>
public async ValueTask<long> EnqueueAndWaitForCommitAsync(IReadOnlySpanBatch readOnlySpanBatch)
{
long logicalAddress;
Task<LinkedCommitInfo> task;
// Phase 1: wait for commit to memory
while (true)
{
task = CommitTask;
if (TryEnqueue(readOnlySpanBatch, out logicalAddress))
break;
if (NeedToWait(CommittedUntilAddress, TailAddress))
{
// Wait for *some* commit - failure can be ignored
try
{
await task;
}
catch { }
}
}
// Phase 2: wait for commit/flush to storage
while (true)
{
LinkedCommitInfo linkedCommitInfo;
try
{
linkedCommitInfo = await task;
}
catch (CommitFailureException e)
{
linkedCommitInfo = e.LinkedCommitInfo;
if (logicalAddress >= linkedCommitInfo.CommitInfo.FromAddress && logicalAddress < linkedCommitInfo.CommitInfo.UntilAddress)
throw;
}
if (linkedCommitInfo.CommitInfo.UntilAddress < logicalAddress + 1)
task = linkedCommitInfo.NextTask;
else
break;
}
return logicalAddress;
}
#endregion
/// <summary>
/// Truncate the log until, but not including, untilAddress
/// </summary>
/// <param name="untilAddress"></param>
public void TruncateUntil(long untilAddress)
{
allocator.ShiftBeginAddress(untilAddress);
}
/// <summary>
/// Pull-based iterator interface for scanning FASTER log
/// </summary>
/// <param name="beginAddress">Begin address for scan.</param>
/// <param name="endAddress">End address for scan (or long.MaxValue for tailing).</param>
/// <param name="name">Name of iterator, if we need to persist/recover it (default null - do not persist).</param>
/// <param name="recover">Whether to recover named iterator from latest commit (if exists). If false, iterator starts from beginAddress.</param>
/// <param name="scanBufferingMode">Use single or double buffering</param>
/// <returns></returns>
public FasterLogScanIterator Scan(long beginAddress, long endAddress, string name = null, bool recover = true, ScanBufferingMode scanBufferingMode = ScanBufferingMode.DoublePageBuffering)
{
FasterLogScanIterator iter;
if (recover && name != null && RecoveredIterators != null && RecoveredIterators.ContainsKey(name))
iter = new FasterLogScanIterator(this, allocator, RecoveredIterators[name], endAddress, getMemory, scanBufferingMode, epoch, headerSize, name);
else
iter = new FasterLogScanIterator(this, allocator, beginAddress, endAddress, getMemory, scanBufferingMode, epoch, headerSize, name);
if (name != null)
{
if (name.Length > 20)
throw new Exception("Max length of iterator name is 20 characters");
if (FasterLogScanIterator.PersistedIterators.ContainsKey(name))
Debug.WriteLine("Iterator name exists, overwriting");
FasterLogScanIterator.PersistedIterators[name] = iter;
}
return iter;
}
/// <summary>
/// Random read record from log, at given address
/// </summary>
/// <param name="address">Logical address to read from</param>
/// <param name="estimatedLength">Estimated length of entry, if known</param>
/// <returns></returns>
public async ValueTask<(byte[], int)> ReadAsync(long address, int estimatedLength = 0)
{
epoch.Resume();
if (address >= CommittedUntilAddress || address < BeginAddress)
{
epoch.Suspend();
return default;
}
var ctx = new SimpleReadContext
{
logicalAddress = address,
completedRead = new SemaphoreSlim(0)
};
unsafe
{
allocator.AsyncReadRecordToMemory(address, headerSize + estimatedLength, AsyncGetFromDiskCallback, ref ctx);
}
epoch.Suspend();
await ctx.completedRead.WaitAsync();
return GetRecordAndFree(ctx.record);
}
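// Usage sketch (illustrative): a random read requires the address to be
// committed first, since ReadAsync rejects addresses at or beyond
// CommittedUntilAddress:
//   long addr = log.Enqueue(payload);
//   log.Commit(spinWait: true);
//   (byte[] entry, int length) = await log.ReadAsync(addr);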
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private int Align(int length)
{
return (length + 3) & ~3;
}
/// <summary>
/// Commit log
/// </summary>
private void CommitCallback(CommitInfo commitInfo)
{
TaskCompletionSource<LinkedCommitInfo> _commitTcs = default;
// We can only allow serial monotonic synchronous commit
lock (this)
{
if (CommittedBeginAddress > commitInfo.BeginAddress)
commitInfo.BeginAddress = CommittedBeginAddress;
if (CommittedUntilAddress > commitInfo.FromAddress)
commitInfo.FromAddress = CommittedUntilAddress;
if (CommittedUntilAddress > commitInfo.UntilAddress)
commitInfo.UntilAddress = CommittedUntilAddress;
FasterLogRecoveryInfo info = new FasterLogRecoveryInfo
{
BeginAddress = commitInfo.BeginAddress,
FlushedUntilAddress = commitInfo.UntilAddress
};
info.PopulateIterators();
logCommitManager.Commit(info.BeginAddress, info.FlushedUntilAddress, info.ToByteArray());
CommittedBeginAddress = info.BeginAddress;
CommittedUntilAddress = info.FlushedUntilAddress;
_commitTcs = commitTcs;
// Create a new task unless the existing one faulted for a
// reason other than a commit failure
if (commitTcs.Task.Status != TaskStatus.Faulted || commitTcs.Task.Exception.InnerException is CommitFailureException)
{
commitTcs = new TaskCompletionSource<LinkedCommitInfo>(TaskCreationOptions.RunContinuationsAsynchronously);
}
}
var lci = new LinkedCommitInfo
{
CommitInfo = commitInfo,
NextTask = commitTcs.Task
};
if (commitInfo.ErrorCode == 0)
_commitTcs?.TrySetResult(lci);
else
_commitTcs?.TrySetException(new CommitFailureException(lci, $"Commit of address range [{commitInfo.FromAddress}-{commitInfo.UntilAddress}] failed with error code {commitInfo.ErrorCode}"));
}
/// <summary>
/// Restore log
/// </summary>
private void Restore(out Dictionary<string, long> recoveredIterators)
{
recoveredIterators = null;
FasterLogRecoveryInfo info = new FasterLogRecoveryInfo();
var commitInfo = logCommitManager.GetCommitMetadata();
if (commitInfo == null) return;
using (var r = new BinaryReader(new MemoryStream(commitInfo)))
{
info.Initialize(r);
}
var headAddress = info.FlushedUntilAddress - allocator.GetOffsetInPage(info.FlushedUntilAddress);
if (headAddress == 0) headAddress = Constants.kFirstValidAddress;
recoveredIterators = info.Iterators;
allocator.RestoreHybridLog(info.FlushedUntilAddress, headAddress, info.BeginAddress);
CommittedUntilAddress = info.FlushedUntilAddress;
CommittedBeginAddress = info.BeginAddress;
}
/// <summary>
/// Try to append batch of entries as a single atomic unit. Entire batch
/// needs to fit on one page.
/// </summary>
/// <param name="readOnlySpanBatch">Batch to be appended to log</param>
/// <param name="logicalAddress">Logical address of first added entry</param>
/// <param name="allocatedLength">Actual allocated length</param>
/// <returns>Whether the append succeeded</returns>
private unsafe bool TryAppend(IReadOnlySpanBatch readOnlySpanBatch, out long logicalAddress, out int allocatedLength)
{
logicalAddress = 0;
int totalEntries = readOnlySpanBatch.TotalEntries();
allocatedLength = 0;
for (int i = 0; i < totalEntries; i++)
{
allocatedLength += Align(readOnlySpanBatch.Get(i).Length) + headerSize;
}
epoch.Resume();
logicalAddress = allocator.TryAllocate(allocatedLength);
if (logicalAddress == 0)
{
epoch.Suspend();
return false;
}
var physicalAddress = allocator.GetPhysicalAddress(logicalAddress);
for (int i = 0; i < totalEntries; i++)
{
var span = readOnlySpanBatch.Get(i);
var entryLength = span.Length;
fixed (byte* bp = &span.GetPinnableReference())
Buffer.MemoryCopy(bp, (void*)(headerSize + physicalAddress), entryLength, entryLength);
SetHeader(entryLength, (byte*)physicalAddress);
physicalAddress += Align(entryLength) + headerSize;
}
epoch.Suspend();
return true;
}
private unsafe void AsyncGetFromDiskCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
var ctx = (SimpleReadContext)Overlapped.Unpack(overlap).AsyncResult;
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
ctx.record.Return();
ctx.record = null;
ctx.completedRead.Release();
}
else
{
var record = ctx.record.GetValidPointer();
var length = GetLength(record);
if (length < 0 || length > allocator.PageSize)
{
Debug.WriteLine("Invalid record length found: " + length);
ctx.record.Return();
ctx.record = null;
ctx.completedRead.Release();
}
else
{
int requiredBytes = headerSize + length;
if (ctx.record.available_bytes >= requiredBytes)
{
ctx.completedRead.Release();
}
else
{
ctx.record.Return();
allocator.AsyncReadRecordToMemory(ctx.logicalAddress, requiredBytes, AsyncGetFromDiskCallback, ref ctx);
}
}
}
Overlapped.Free(overlap);
}
private (byte[], int) GetRecordAndFree(SectorAlignedMemory record)
{
if (record == null)
return (null, 0);
byte[] result;
int length;
unsafe
{
var ptr = record.GetValidPointer();
length = GetLength(ptr);
if (!VerifyChecksum(ptr, length))
{
throw new Exception("Checksum failed for read");
}
result = getMemory != null ? getMemory(length) : new byte[length];
fixed (byte* bp = result)
{
Buffer.MemoryCopy(ptr + headerSize, bp, length, length);
}
}
record.Return();
return (result, length);
}
private long CommitInternal(bool spinWait = false)
{
epoch.Resume();
if (allocator.ShiftReadOnlyToTail(out long tailAddress))
{
if (spinWait)
{
while (CommittedUntilAddress < tailAddress)
{
epoch.ProtectAndDrain();
Thread.Yield();
}
}
epoch.Suspend();
}
else
{
// May need to commit begin address and/or iterators
epoch.Suspend();
var beginAddress = allocator.BeginAddress;
if (beginAddress > CommittedBeginAddress || FasterLogScanIterator.PersistedIterators.Count > 0)
CommitCallback(new CommitInfo { BeginAddress = beginAddress,
FromAddress = CommittedUntilAddress,
UntilAddress = CommittedUntilAddress,
ErrorCode = 0 });
}
return tailAddress;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal unsafe int GetLength(byte* ptr)
{
if (logChecksum == LogChecksumType.None)
return *(int*)ptr;
else if (logChecksum == LogChecksumType.PerEntry)
return *(int*)(ptr + 8);
return 0;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal unsafe bool VerifyChecksum(byte* ptr, int length)
{
if (logChecksum == LogChecksumType.PerEntry)
{
var cs = Utility.XorBytes(ptr + 8, length + 4);
if (cs != *(ulong*)ptr)
{
return false;
}
}
return true;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal unsafe ulong GetChecksum(byte* ptr)
{
if (logChecksum == LogChecksumType.PerEntry)
{
return *(ulong*)ptr;
}
return 0;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe void SetHeader(int length, byte* dest)
{
if (logChecksum == LogChecksumType.None)
{
*(int*)dest = length;
return;
}
else if (logChecksum == LogChecksumType.PerEntry)
{
*(int*)(dest + 8) = length;
*(ulong*)dest = Utility.XorBytes(dest + 8, length + 4);
}
}
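// Entry layout produced by SetHeader above (from the logic in this class):
//   LogChecksumType.None     -> [ int32 length ][ payload, 4-byte aligned ]
//   LogChecksumType.PerEntry -> [ ulong checksum ][ int32 length ][ payload ]
// where the checksum is Utility.XorBytes over the 4-byte length field plus
// the payload (length + 4 bytes starting at dest + 8).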
/// <summary>
/// Do we need to await a commit to make forward progress?
/// </summary>
/// <param name="committedUntilAddress"></param>
/// <param name="tailAddress"></param>
/// <returns></returns>
private bool NeedToWait(long committedUntilAddress, long tailAddress)
{
Thread.Yield();
return
allocator.GetPage(committedUntilAddress) <=
(allocator.GetPage(tailAddress) - allocator.BufferSize);
}
}
}
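// --- End-to-end usage sketch (illustrative, not part of this commit) ---
// Enqueue a few entries, commit synchronously, then scan them back; the file
// name and entry contents are placeholders.
using System;
using System.Text;
using FASTER.core;

internal static class FasterLogSketch
{
    public static void Run()
    {
        var device = Devices.CreateLogDevice("flog.log", deleteOnClose: true);
        using (var log = new FasterLog(new FasterLogSettings { LogDevice = device }))
        {
            for (int i = 0; i < 3; i++)
                log.Enqueue(Encoding.UTF8.GetBytes("entry-" + i));

            log.Commit(spinWait: true);   // flush and persist commit metadata

            using (var iter = log.Scan(log.BeginAddress, long.MaxValue))
            {
                while (iter.GetNext(out byte[] entry, out int length))
                    Console.WriteLine(Encoding.UTF8.GetString(entry, 0, length));
            }
        }
        device.Close();
    }
}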

@ -0,0 +1,425 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Threading;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Threading.Tasks;
using System.Buffers;
using System.Collections.Generic;
using System.Collections.Concurrent;
namespace FASTER.core
{
/// <summary>
/// Scan iterator for hybrid log
/// </summary>
public class FasterLogScanIterator : IDisposable
{
private readonly int frameSize;
private readonly string name;
private readonly FasterLog fasterLog;
private readonly BlittableAllocator<Empty, byte> allocator;
private readonly long endAddress;
private readonly BlittableFrame frame;
private readonly CountdownEvent[] loaded;
private readonly CancellationTokenSource[] loadedCancel;
private readonly long[] loadedPage;
private readonly LightEpoch epoch;
private readonly GetMemory getMemory;
private readonly int headerSize;
private long currentAddress, nextAddress;
/// <summary>
/// Current address
/// </summary>
public long CurrentAddress => currentAddress;
/// <summary>
/// Next address
/// </summary>
public long NextAddress => nextAddress;
internal static readonly ConcurrentDictionary<string, FasterLogScanIterator> PersistedIterators
= new ConcurrentDictionary<string, FasterLogScanIterator>();
/// <summary>
/// Constructor
/// </summary>
/// <param name="fasterLog"></param>
/// <param name="hlog"></param>
/// <param name="beginAddress"></param>
/// <param name="endAddress"></param>
/// <param name="scanBufferingMode"></param>
/// <param name="epoch"></param>
/// <param name="headerSize"></param>
/// <param name="name"></param>
/// <param name="getMemory"></param>
internal unsafe FasterLogScanIterator(FasterLog fasterLog, BlittableAllocator<Empty, byte> hlog, long beginAddress, long endAddress, GetMemory getMemory, ScanBufferingMode scanBufferingMode, LightEpoch epoch, int headerSize, string name)
{
this.fasterLog = fasterLog;
this.allocator = hlog;
this.getMemory = getMemory;
this.epoch = epoch;
this.headerSize = headerSize;
if (beginAddress == 0)
beginAddress = hlog.GetFirstValidLogicalAddress(0);
this.name = name;
this.endAddress = endAddress;
currentAddress = beginAddress;
nextAddress = beginAddress;
if (scanBufferingMode == ScanBufferingMode.SinglePageBuffering)
frameSize = 1;
else if (scanBufferingMode == ScanBufferingMode.DoublePageBuffering)
frameSize = 2;
else if (scanBufferingMode == ScanBufferingMode.NoBuffering)
{
frameSize = 0;
return;
}
frame = new BlittableFrame(frameSize, hlog.PageSize, hlog.GetDeviceSectorSize());
loaded = new CountdownEvent[frameSize];
loadedCancel = new CancellationTokenSource[frameSize];
loadedPage = new long[frameSize];
for (int i = 0; i < frameSize; i++)
{
loadedPage[i] = -1;
loadedCancel[i] = new CancellationTokenSource();
}
}
#if DOTNETCORE
/// <summary>
/// Async enumerable for iterator
/// </summary>
/// <returns>Entry and entry length</returns>
public async IAsyncEnumerable<(byte[], int)> GetAsyncEnumerable()
{
while (true)
{
byte[] result;
int length;
while (!GetNext(out result, out length))
{
if (currentAddress >= endAddress)
yield break;
await WaitAsync();
}
yield return (result, length);
}
}
/// <summary>
/// Async enumerable for iterator (memory pool based version)
/// </summary>
/// <returns>Entry and entry length</returns>
public async IAsyncEnumerable<(IMemoryOwner<byte>, int)> GetAsyncEnumerable(MemoryPool<byte> pool)
{
while (true)
{
IMemoryOwner<byte> result;
int length;
while (!GetNext(pool, out result, out length))
{
if (currentAddress >= endAddress)
yield break;
await WaitAsync();
}
yield return (result, length);
}
}
#endif
/// <summary>
/// Wait for iteration to be ready to continue
/// </summary>
/// <returns></returns>
public async ValueTask WaitAsync()
{
while (true)
{
var commitTask = fasterLog.CommitTask;
if (nextAddress >= fasterLog.CommittedUntilAddress)
{
// Ignore commit exceptions
try
{
await commitTask;
}
catch { }
}
else
break;
}
}
/// <summary>
/// Get next record in iterator
/// </summary>
/// <param name="entry">Copy of entry, if found</param>
/// <param name="entryLength">Actual length of entry</param>
/// <returns></returns>
public unsafe bool GetNext(out byte[] entry, out int entryLength)
{
if (GetNextInternal(out long physicalAddress, out entryLength, out bool epochTaken))
{
if (getMemory != null)
{
// Use user delegate to allocate memory
entry = getMemory(entryLength);
if (entry.Length < entryLength)
throw new Exception("Byte array provided has invalid length");
}
else
{
// We allocate a byte array from heap
entry = new byte[entryLength];
}
fixed (byte* bp = entry)
Buffer.MemoryCopy((void*)(headerSize + physicalAddress), bp, entryLength, entryLength);
if (epochTaken)
epoch.Suspend();
return true;
}
entry = default;
return false;
}
/// <summary>
/// GetNext supporting memory pools
/// </summary>
/// <param name="pool"></param>
/// <param name="entry"></param>
/// <param name="entryLength"></param>
/// <returns></returns>
public unsafe bool GetNext(MemoryPool<byte> pool, out IMemoryOwner<byte> entry, out int entryLength)
{
if (GetNextInternal(out long physicalAddress, out entryLength, out bool epochTaken))
{
entry = pool.Rent(entryLength);
fixed (byte* bp = &entry.Memory.Span.GetPinnableReference())
Buffer.MemoryCopy((void*)(headerSize + physicalAddress), bp, entryLength, entryLength);
if (epochTaken)
epoch.Suspend();
return true;
}
entry = default;
entryLength = default;
return false;
}
/// <summary>
/// Dispose the iterator
/// </summary>
public void Dispose()
{
frame?.Dispose();
if (name != null)
PersistedIterators.TryRemove(name, out _);
}
private unsafe void BufferAndLoad(long currentAddress, long currentPage, long currentFrame)
{
if (loadedPage[currentFrame] != currentPage)
{
if (loadedPage[currentFrame] != -1)
{
WaitForFrameLoad(currentFrame);
}
allocator.AsyncReadPagesFromDeviceToFrame(currentAddress >> allocator.LogPageSizeBits, 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[currentFrame], 0, null, null, loadedCancel[currentFrame]);
loadedPage[currentFrame] = currentAddress >> allocator.LogPageSizeBits;
}
if (frameSize == 2)
{
var nextPage = currentPage + 1;
var nextFrame = (currentFrame + 1) % frameSize;
if (loadedPage[nextFrame] != nextPage)
{
if (loadedPage[nextFrame] != -1)
{
WaitForFrameLoad(nextFrame);
}
allocator.AsyncReadPagesFromDeviceToFrame(1 + (currentAddress >> allocator.LogPageSizeBits), 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[nextFrame], 0, null, null, loadedCancel[nextFrame]);
loadedPage[nextFrame] = 1 + (currentAddress >> allocator.LogPageSizeBits);
}
}
WaitForFrameLoad(currentFrame);
}
private void WaitForFrameLoad(long frame)
{
if (loaded[frame].IsSet) return;
try
{
loaded[frame].Wait(loadedCancel[frame].Token); // Ensure we have completed ongoing load
}
catch (Exception e)
{
loadedPage[frame] = -1;
loadedCancel[frame] = new CancellationTokenSource();
nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits;
throw new Exception("Page read from storage failed, skipping page. Inner exception: " + e.ToString());
}
}
private unsafe void AsyncReadPagesCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
var result = (PageAsyncReadResult<Empty>)Overlapped.Unpack(overlap).AsyncResult;
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
result.cts?.Cancel();
}
if (result.freeBuffer1 != null)
{
if (errorCode == 0)
allocator.PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, result.page);
result.freeBuffer1.Return();
result.freeBuffer1 = null;
}
if (errorCode == 0)
result.handle?.Signal();
Interlocked.MemoryBarrier();
Overlapped.Free(overlap);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private int Align(int length)
{
return (length + 3) & ~3;
}
/// <summary>
/// Retrieve physical address of next iterator value
/// (under epoch protection if it is from main page buffer)
/// </summary>
/// <param name="physicalAddress"></param>
/// <param name="entryLength"></param>
/// <param name="epochTaken"></param>
/// <returns></returns>
private unsafe bool GetNextInternal(out long physicalAddress, out int entryLength, out bool epochTaken)
{
physicalAddress = 0;
entryLength = 0;
epochTaken = false;
currentAddress = nextAddress;
while (true)
{
// Check for boundary conditions
if (currentAddress < allocator.BeginAddress)
{
Debug.WriteLine("Iterator address is less than log BeginAddress " + allocator.BeginAddress + ", adjusting iterator address");
currentAddress = allocator.BeginAddress;
}
if ((currentAddress >= endAddress) || (currentAddress >= fasterLog.CommittedUntilAddress))
{
nextAddress = currentAddress;
return false;
}
if (frameSize == 0 && currentAddress < allocator.HeadAddress)
{
throw new Exception("Iterator address is less than log HeadAddress in memory-scan mode");
}
var currentPage = currentAddress >> allocator.LogPageSizeBits;
var offset = currentAddress & allocator.PageSizeMask;
var headAddress = allocator.HeadAddress;
if (currentAddress < headAddress)
{
BufferAndLoad(currentAddress, currentPage, currentPage % frameSize);
physicalAddress = frame.GetPhysicalAddress(currentPage % frameSize, offset);
}
else
{
epoch.Resume();
headAddress = allocator.HeadAddress;
if (currentAddress < headAddress) // rare case
{
epoch.Suspend();
continue;
}
physicalAddress = allocator.GetPhysicalAddress(currentAddress);
}
// Get and check entry length
entryLength = fasterLog.GetLength((byte*)physicalAddress);
if (entryLength == 0)
{
if (currentAddress >= headAddress)
epoch.Suspend();
nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits;
if (0 != fasterLog.GetChecksum((byte*)physicalAddress))
{
var curPage = currentAddress >> allocator.LogPageSizeBits;
throw new Exception("Invalid checksum found during scan, skipping page " + curPage);
}
else
{
// We are likely at end of page, skip to next
currentAddress = nextAddress;
continue;
}
}
int recordSize = headerSize + Align(entryLength);
if ((currentAddress & allocator.PageSizeMask) + recordSize > allocator.PageSize)
{
if (currentAddress >= headAddress)
epoch.Suspend();
nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits;
throw new Exception("Invalid length of record found: " + entryLength + ", skipping page");
}
// Verify checksum if needed
if (currentAddress < headAddress)
{
if (!fasterLog.VerifyChecksum((byte*)physicalAddress, entryLength))
{
var curPage = currentAddress >> allocator.LogPageSizeBits;
nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits;
throw new Exception("Invalid checksum found during scan, skipping page " + curPage);
}
}
if ((currentAddress & allocator.PageSizeMask) + recordSize == allocator.PageSize)
nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits;
else
nextAddress = currentAddress + recordSize;
epochTaken = currentAddress >= headAddress;
return true;
}
}
}
}
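// --- Usage sketch (illustrative, not part of this commit) ---
// A tailing consumer: scan to long.MaxValue and await WaitAsync whenever the
// iterator catches up with the commit point. Runs until the process exits;
// the iterator name is a placeholder (named iterators are persisted on commit).
using System.Threading.Tasks;
using FASTER.core;

internal static class TailingScanSketch
{
    public static async Task Run(FasterLog log)
    {
        using (var iter = log.Scan(log.BeginAddress, long.MaxValue, name: "consumer1"))
        {
            while (true)
            {
                while (!iter.GetNext(out byte[] entry, out int length))
                    await iter.WaitAsync();   // wait for more committed entries
                // process entry[0..length) here
            }
        }
    }
}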

@ -0,0 +1,160 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
namespace FASTER.core
{
/// <summary>
/// Recovery info for FASTER Log
/// </summary>
internal struct FasterLogRecoveryInfo
{
/// <summary>
/// Begin address
/// </summary>
public long BeginAddress;
/// <summary>
/// Flushed logical address
/// </summary>
public long FlushedUntilAddress;
/// <summary>
/// Persisted iterators
/// </summary>
public Dictionary<string, long> Iterators;
/// <summary>
/// Initialize
/// </summary>
public void Initialize()
{
BeginAddress = 0;
FlushedUntilAddress = 0;
}
/// <summary>
/// Initialize from stream
/// </summary>
/// <param name="reader"></param>
public void Initialize(BinaryReader reader)
{
int version;
long checkSum;
try
{
version = reader.ReadInt32();
checkSum = reader.ReadInt64();
BeginAddress = reader.ReadInt64();
FlushedUntilAddress = reader.ReadInt64();
}
catch (Exception e)
{
throw new Exception("Unable to recover from previous commit. Inner exception: " + e.ToString());
}
if (version != 0)
throw new Exception("Invalid version found during commit recovery");
if (checkSum != (BeginAddress ^ FlushedUntilAddress))
throw new Exception("Invalid checksum found during commit recovery");
var count = 0;
try
{
count = reader.ReadInt32();
}
catch { }
if (count > 0)
{
Iterators = new Dictionary<string, long>();
for (int i = 0; i < count; i++)
{
Iterators.Add(reader.ReadString(), reader.ReadInt64());
}
}
}
/// <summary>
/// Recover info from token
/// </summary>
/// <param name="logCommitManager"></param>
/// <returns></returns>
internal void Recover(ILogCommitManager logCommitManager)
{
var metadata = logCommitManager.GetCommitMetadata();
if (metadata == null)
throw new Exception("Invalid log commit metadata during recovery");
Initialize(new BinaryReader(new MemoryStream(metadata)));
}
/// <summary>
/// Reset
/// </summary>
public void Reset()
{
Initialize();
}
/// <summary>
/// Write info to byte array
/// </summary>
public byte[] ToByteArray()
{
using (var ms = new MemoryStream())
{
using (var writer = new BinaryWriter(ms))
{
writer.Write(0); // version
writer.Write(BeginAddress ^ FlushedUntilAddress); // checksum
writer.Write(BeginAddress);
writer.Write(FlushedUntilAddress);
if (Iterators?.Count > 0)
{
writer.Write(Iterators.Count);
foreach (var kvp in Iterators)
{
writer.Write(kvp.Key);
writer.Write(kvp.Value);
}
}
}
return ms.ToArray();
}
}
/// <summary>
/// Take snapshot of persisted iterators
/// </summary>
public void PopulateIterators()
{
if (FasterLogScanIterator.PersistedIterators.Count > 0)
{
Iterators = new Dictionary<string, long>();
foreach (var kvp in FasterLogScanIterator.PersistedIterators)
{
Iterators.Add(kvp.Key, kvp.Value.CurrentAddress);
}
}
}
/// <summary>
/// Print checkpoint info for debugging purposes
/// </summary>
public void DebugPrint()
{
Debug.WriteLine("******** Log Commit Info ********");
Debug.WriteLine("BeginAddress: {0}", BeginAddress);
Debug.WriteLine("FlushedUntilAddress: {0}", FlushedUntilAddress);
}
}
}
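// Commit metadata layout written by ToByteArray above (BinaryWriter,
// little-endian; strings are length-prefixed):
//   int32  version              (currently 0)
//   int64  checksum             (BeginAddress XOR FlushedUntilAddress)
//   int64  BeginAddress
//   int64  FlushedUntilAddress
//   int32  iterator count       (present only when iterators are persisted)
//   repeated { string name, int64 currentAddress }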

@ -0,0 +1,99 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
using System;
using System.Diagnostics;
using System.IO;
namespace FASTER.core
{
/// <summary>
/// Delegate for getting memory from user
/// </summary>
/// <param name="minLength">Minimum length of returned byte array</param>
/// <returns></returns>
public delegate byte[] GetMemory(int minLength);
/// <summary>
/// Type of checksum to add to log
/// </summary>
public enum LogChecksumType
{
/// <summary>
/// No checksums
/// </summary>
None,
/// <summary>
/// Checksum per entry
/// </summary>
PerEntry
}
/// <summary>
/// FASTER Log Settings
/// </summary>
public class FasterLogSettings
{
/// <summary>
/// Device used for log
/// </summary>
public IDevice LogDevice = new NullDevice();
/// <summary>
/// Size of a page, in bits
/// </summary>
public int PageSizeBits = 22;
/// <summary>
/// Total size of in-memory part of log, in bits
/// Should be at least one page long
/// Num pages = 2^(MemorySizeBits-PageSizeBits)
/// </summary>
public int MemorySizeBits = 23;
/// <summary>
/// Size of a segment (group of pages), in bits
/// This is the granularity of files on disk
/// </summary>
public int SegmentSizeBits = 30;
/// <summary>
/// Log commit manager
/// </summary>
public ILogCommitManager LogCommitManager = null;
/// <summary>
/// Use specified directory for storing and retrieving checkpoints
/// This is a shortcut to providing the following:
/// FasterLogSettings.LogCommitManager = new LocalLogCommitManager(LogCommitFile)
/// </summary>
public string LogCommitFile = null;
/// <summary>
/// User callback to allocate memory for read entries
/// </summary>
public GetMemory GetMemory = null;
/// <summary>
/// Type of checksum to add to log
/// </summary>
public LogChecksumType LogChecksum = LogChecksumType.None;
internal LogSettings GetLogSettings()
{
return new LogSettings
{
LogDevice = LogDevice,
PageSizeBits = PageSizeBits,
SegmentSizeBits = SegmentSizeBits,
MemorySizeBits = MemorySizeBits,
CopyReadsToTail = false,
MutableFraction = 0,
ObjectLogDevice = null,
ReadCacheSettings = null
};
}
}
}
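
A minimal configuration sketch, assuming the FasterLog class from the surrounding project consumes these settings (that class is not part of this diff). With PageSizeBits = 22 and MemorySizeBits = 23, the in-memory buffer holds 2^(23-22) = 2 pages of 4 MB each.

using FASTER.core;

class LogSetup
{
    static void Main()
    {
        var device = Devices.CreateLogDevice("hlog.log", false);
        var settings = new FasterLogSettings
        {
            LogDevice = device,
            PageSizeBits = 22,             // 4 MB pages
            MemorySizeBits = 23,           // 2 pages resident in memory
            SegmentSizeBits = 30,          // 1 GB files on disk
            LogCommitFile = "faster.commit",
            LogChecksum = LogChecksumType.PerEntry
        };
        // 'FasterLog' is assumed from the surrounding project; not shown in this diff:
        // var log = new FasterLog(settings);
    }
}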

@ -0,0 +1,27 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System.IO;
namespace FASTER.core
{
/// <summary>
/// Log commit manager
/// </summary>
public interface ILogCommitManager
{
/// <summary>
/// Perform (synchronous) commit with specified metadata
/// </summary>
/// <param name="beginAddress">Committed begin address (for information only, not necessary to persist)</param>
/// <param name="untilAddress">Address committed until (for information only, not necessary to persist)</param>
/// <param name="commitMetadata">Commit metadata - should be persisted</param>
void Commit(long beginAddress, long untilAddress, byte[] commitMetadata);
/// <summary>
/// Return prior commit metadata during recovery
/// </summary>
/// <returns></returns>
byte[] GetCommitMetadata();
}
}

@ -0,0 +1,26 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
namespace FASTER.core
{
/// <summary>
/// Interface to provide a batch of ReadOnlySpan[byte] data to FASTER
/// </summary>
public interface IReadOnlySpanBatch
{
/// <summary>
/// Number of entries in provided batch
/// </summary>
/// <returns>Number of entries</returns>
int TotalEntries();
/// <summary>
/// Retrieve batch entry at specified index
/// </summary>
/// <param name="index">Index</param>
/// <returns></returns>
ReadOnlySpan<byte> Get(int index);
}
}
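
A minimal sketch of the batch interface over an in-memory jagged array; an entry count and per-index access are all the contract requires.

using System;
using FASTER.core;

// Hypothetical adapter: exposes byte[][] as a batch of ReadOnlySpan<byte> entries.
class ArrayBatch : IReadOnlySpanBatch
{
    private readonly byte[][] entries;
    public ArrayBatch(params byte[][] entries) => this.entries = entries;
    public int TotalEntries() => entries.Length;
    public ReadOnlySpan<byte> Get(int index) => entries[index];
}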

@ -0,0 +1,64 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System.IO;
namespace FASTER.core
{
/// <summary>
/// Implementation of checkpoint interface for local file storage
/// </summary>
public class LocalLogCommitManager : ILogCommitManager
{
private string CommitFile;
/// <summary>
/// Create new instance of local checkpoint manager at given base directory
/// </summary>
/// <param name="CommitFile"></param>
public LocalLogCommitManager(string CommitFile)
{
this.CommitFile = CommitFile;
}
/// <summary>
/// Perform (synchronous) commit with specified metadata
/// </summary>
/// <param name="beginAddress">Committed begin address (for information only, not necessary to persist)</param>
/// <param name="untilAddress">Address committed until (for information only, not necessary to persist)</param>
/// <param name="commitMetadata">Commit metadata</param>
public void Commit(long beginAddress, long untilAddress, byte[] commitMetadata)
{
// Two phase to ensure we write metadata in single Write operation
using (var ms = new MemoryStream())
{
using (var writer = new BinaryWriter(ms))
{
writer.Write(commitMetadata.Length);
writer.Write(commitMetadata);
}
using (var writer = new BinaryWriter(new FileStream(CommitFile, FileMode.OpenOrCreate)))
{
writer.Write(ms.ToArray());
writer.Flush();
}
}
}
/// <summary>
/// Retrieve commit metadata
/// </summary>
/// <returns>Metadata, or null if invalid</returns>
public byte[] GetCommitMetadata()
{
if (!File.Exists(CommitFile))
return null;
using (var reader = new BinaryReader(new FileStream(CommitFile, FileMode.Open)))
{
var len = reader.ReadInt32();
return reader.ReadBytes(len);
}
}
}
}
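
A small roundtrip, under the obvious assumption that commit and recovery run against the same file path:

using System;
using System.Text;
using FASTER.core;

class CommitRoundTrip
{
    static void Main()
    {
        ILogCommitManager mgr = new LocalLogCommitManager("test.commit");
        mgr.Commit(beginAddress: 0, untilAddress: 4096,
                   commitMetadata: Encoding.UTF8.GetBytes("metadata"));
        byte[] restored = mgr.GetCommitMetadata();   // null if no commit exists
        Console.WriteLine(Encoding.UTF8.GetString(restored));
    }
}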

@ -0,0 +1,73 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 1591
using System;
namespace FASTER.core
{
/// <summary>
/// Default empty functions base class to make it easy for users to provide
/// their own implementation
/// </summary>
/// <typeparam name="Key"></typeparam>
/// <typeparam name="Value"></typeparam>
/// <typeparam name="Input"></typeparam>
/// <typeparam name="Output"></typeparam>
/// <typeparam name="Context"></typeparam>
public abstract class FunctionsBase<Key, Value, Input, Output, Context> : IFunctions<Key, Value, Input, Output, Context>
{
public virtual void ConcurrentReader(ref Key key, ref Input input, ref Value value, ref Output dst) { }
public virtual void SingleReader(ref Key key, ref Input input, ref Value value, ref Output dst) { }
public virtual bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst) { dst = src; return true; }
public virtual void SingleWriter(ref Key key, ref Value src, ref Value dst) => dst = src;
public virtual void InitialUpdater(ref Key key, ref Input input, ref Value value) { }
public virtual void CopyUpdater(ref Key key, ref Input input, ref Value oldValue, ref Value newValue) { }
public virtual bool InPlaceUpdater(ref Key key, ref Input input, ref Value value) { return true; }
public virtual void ReadCompletionCallback(ref Key key, ref Input input, ref Output output, Context ctx, Status status) { }
public virtual void RMWCompletionCallback(ref Key key, ref Input input, Context ctx, Status status) { }
public virtual void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx) { }
public virtual void DeleteCompletionCallback(ref Key key, Context ctx) { }
public virtual void CheckpointCompletionCallback(Guid sessionId, long serialNum) { }
}
/// <summary>
/// Default empty functions base class to make it easy for users to provide
/// their own implementation
/// </summary>
/// <typeparam name="Key"></typeparam>
/// <typeparam name="Value"></typeparam>
/// <typeparam name="Context"></typeparam>
public class SimpleFunctions<Key, Value, Context> : FunctionsBase<Key, Value, Value, Value, Context>
{
private readonly Func<Value, Value, Value> merger;
public SimpleFunctions() => merger = (l, r) => l;
public SimpleFunctions(Func<Value, Value, Value> merger) => this.merger = merger;
public override void ConcurrentReader(ref Key key, ref Value input, ref Value value, ref Value dst) => dst = value;
public override void SingleReader(ref Key key, ref Value input, ref Value value, ref Value dst) => dst = value;
public override bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst) { dst = src; return true; }
public override void SingleWriter(ref Key key, ref Value src, ref Value dst) => dst = src;
public override void InitialUpdater(ref Key key, ref Value input, ref Value value) => value = input;
public override void CopyUpdater(ref Key key, ref Value input, ref Value oldValue, ref Value newValue) => newValue = merger(input, oldValue);
public override bool InPlaceUpdater(ref Key key, ref Value input, ref Value value) { value = merger(input, value); return true; }
public override void ReadCompletionCallback(ref Key key, ref Value input, ref Value output, Context ctx, Status status) { }
public override void RMWCompletionCallback(ref Key key, ref Value input, Context ctx, Status status) { }
public override void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx) { }
public override void DeleteCompletionCallback(ref Key key, Context ctx) { }
public override void CheckpointCompletionCallback(Guid sessionId, long serialNum) { }
}
public class SimpleFunctions<Key, Value> : SimpleFunctions<Key, Value, Empty>
{
public SimpleFunctions() : base() { }
public SimpleFunctions(Func<Value, Value, Value> merger) : base(merger) { }
}
}
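
SimpleFunctions is handy when Input and Output are the Value type itself; the merger decides how RMW combines an input with the existing value. A sketch of two mergers (store and session wiring omitted):

using FASTER.core;

class MergerDemo
{
    static void Main()
    {
        // RMW adds the input to the stored value; Upsert/Read behave as plain put/get.
        var sum = new SimpleFunctions<long, long>((input, old) => input + old);

        // The default merger keeps the input, so RMW degenerates to "last write wins".
        var lastWrite = new SimpleFunctions<long, long>();
    }
}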

@ -0,0 +1,26 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
namespace FASTER.core
{
/// <summary>
/// Key interface
/// </summary>
/// <typeparam name="T"></typeparam>
public interface IFasterEqualityComparer<T>
{
/// <summary>
/// Get 64-bit hash code
/// </summary>
/// <returns></returns>
long GetHashCode64(ref T k);
/// <summary>
/// Equality comparison
/// </summary>
/// <param name="k1">Left side</param>
/// <param name="k2">Right side</param>
/// <returns></returns>
bool Equals(ref T k1, ref T k2);
}
}
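
A comparer sketch for long keys; the multiplier below is an arbitrary odd mixing constant chosen for illustration, not FASTER's internal hash.

using FASTER.core;

struct LongComparer : IFasterEqualityComparer<long>
{
    // Simple multiplicative mixing; any well-distributed 64-bit hash works here.
    public long GetHashCode64(ref long k) => k * unchecked((long)0x9E3779B97F4A7C15);
    public bool Equals(ref long k1, ref long k2) => k1 == k2;
}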

@ -0,0 +1,202 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Interface to FASTER key-value store
/// (customized for sample types Key, Value, Input, Output, Context)
/// Since there are pointers in the API, we cannot automatically create a
/// generic version covering arbitrary blittable types. Instead, the
/// user defines the customized interface and provides it to FASTER
/// so it can return a (generated) instance for that interface.
/// </summary>
public interface IFasterKV<Key, Value, Input, Output, Context> : IDisposable
where Key : new()
where Value : new()
{
/* Thread-related operations */
/// <summary>
/// Start a session with FASTER. FASTER sessions correspond to threads issuing
/// operations to FASTER.
/// </summary>
/// <returns>Session identifier</returns>
Guid StartSession();
/// <summary>
/// Continue a session after recovery. Provide FASTER with the identifier of the
/// session that is being continued.
/// </summary>
/// <param name="guid"></param>
/// <returns>Sequence number for resuming operations</returns>
long ContinueSession(Guid guid);
/// <summary>
/// Stop a session and de-register the thread from FASTER.
/// </summary>
void StopSession();
/// <summary>
/// Refresh the session epoch. The caller is required to invoke Refresh periodically
/// in order to guarantee system liveness.
/// </summary>
void Refresh();
/* Store Interface */
/// <summary>
/// Read operation
/// </summary>
/// <param name="key">Key of read</param>
/// <param name="input">Input argument used by Reader to select what part of value to read</param>
/// <param name="output">Reader stores the read result in output</param>
/// <param name="context">User context to identify operation in asynchronous callback</param>
/// <param name="lsn">Increasing sequence number of operation (used for recovery)</param>
/// <returns>Status of operation</returns>
Status Read(ref Key key, ref Input input, ref Output output, Context context, long lsn);
/// <summary>
/// (Blind) upsert operation
/// </summary>
/// <param name="key">Key of read</param>
/// <param name="value">Value being upserted</param>
/// <param name="context">User context to identify operation in asynchronous callback</param>
/// <param name="lsn">Increasing sequence number of operation (used for recovery)</param>
/// <returns>Status of operation</returns>
Status Upsert(ref Key key, ref Value value, Context context, long lsn);
/// <summary>
/// Atomic read-modify-write operation
/// </summary>
/// <param name="key">Key of read</param>
/// <param name="input">Input argument used by RMW callback to perform operation</param>
/// <param name="context">User context to identify operation in asynchronous callback</param>
/// <param name="lsn">Increasing sequence number of operation (used for recovery)</param>
/// <returns>Status of operation</returns>
Status RMW(ref Key key, ref Input input, Context context, long lsn);
/// <summary>
/// Delete entry (use tombstone if necessary)
/// Hash entry is removed as a best effort (if the key is in memory and at
/// the head of the hash chain).
/// Value is set to null (using ConcurrentWrite) if it is in mutable region
/// </summary>
/// <param name="key"></param>
/// <param name="userContext"></param>
/// <param name="monotonicSerialNum"></param>
/// <returns></returns>
Status Delete(ref Key key, Context userContext, long monotonicSerialNum);
/// <summary>
/// Complete all pending operations issued by this session
/// </summary>
/// <param name="wait">Whether we spin-wait for pending operations to complete</param>
/// <returns>Whether all pending operations have completed</returns>
bool CompletePending(bool wait);
/* Recovery */
/// <summary>
/// Take full checkpoint of FASTER
/// </summary>
/// <param name="token">Token describing checkpoint</param>
/// <returns>Whether checkpoint was initiated</returns>
bool TakeFullCheckpoint(out Guid token);
/// <summary>
/// Take checkpoint of FASTER index only (not log)
/// </summary>
/// <param name="token">Token describing checkpoin</param>
/// <returns>Whether checkpoint was initiated</returns>
bool TakeIndexCheckpoint(out Guid token);
/// <summary>
/// Take checkpoint of FASTER log only (not index)
/// </summary>
/// <param name="token">Token describing checkpoin</param>
/// <returns>Whether checkpoint was initiated</returns>
bool TakeHybridLogCheckpoint(out Guid token);
/// <summary>
/// Recover from the last successful checkpoint
/// </summary>
void Recover();
/// <summary>
/// Recover using full checkpoint token
/// </summary>
/// <param name="fullcheckpointToken"></param>
void Recover(Guid fullcheckpointToken);
/// <summary>
/// Recover using a separate index and log checkpoint token
/// </summary>
/// <param name="indexToken"></param>
/// <param name="hybridLogToken"></param>
void Recover(Guid indexToken, Guid hybridLogToken);
/// <summary>
/// Complete ongoing checkpoint (spin-wait)
/// </summary>
/// <param name="wait"></param>
/// <returns>Whether checkpoint has completed</returns>
bool CompleteCheckpoint(bool wait);
/// <summary>
/// Grow the hash index
/// </summary>
/// <returns></returns>
bool GrowIndex();
/// <summary>
/// Get number of (non-zero) hash entries in FASTER
/// </summary>
long EntryCount { get; }
/// <summary>
/// Get size of index in #cache lines (64 bytes each)
/// </summary>
long IndexSize { get; }
/// <summary>
/// Get comparer used by this instance of FASTER
/// </summary>
IFasterEqualityComparer<Key> Comparer { get; }
/// <summary>
/// Dump distribution of #entries in hash table
/// </summary>
string DumpDistribution();
/// <summary>
/// Experimental feature
/// Check if FASTER contains key in memory (between HeadAddress
/// and tail), or between the specified fromAddress (after
/// HeadAddress) and tail
/// </summary>
/// <param name="key"></param>
/// <param name="fromAddress"></param>
/// <returns></returns>
Status ContainsKeyInMemory(ref Key key, long fromAddress = -1);
/// <summary>
/// Get accessor for FASTER hybrid log
/// </summary>
LogAccessor<Key, Value, Input, Output, Context> Log { get; }
/// <summary>
/// Get accessor for FASTER read cache
/// </summary>
LogAccessor<Key, Value, Input, Output, Context> ReadCache { get; }
}
}
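
A usage sketch of the session-based API above. The FasterKV constructor shape (index size, functions instance, log settings) is assumed from the surrounding FASTER v1 codebase; only the interface itself appears in this diff.

using System;
using FASTER.core;

class KVDemo
{
    static void Main()
    {
        var device = Devices.CreateLogDevice("store.log", false);
        var store = new FasterKV<long, long, long, long, Empty, SimpleFunctions<long, long>>(
            1L << 20,                               // index size (#hash buckets)
            new SimpleFunctions<long, long>(),
            new LogSettings { LogDevice = device });

        Guid session = store.StartSession();        // one session per thread
        long key = 1, value = 42, input = 0, output = 0;
        store.Upsert(ref key, ref value, Empty.Default, 0);
        store.Read(ref key, ref input, ref output, Empty.Default, 1);
        store.CompletePending(true);                // drain any pending operations
        store.StopSession();
        store.Dispose();
        Console.WriteLine(output);                  // 42
    }
}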

@ -0,0 +1,115 @@
using System;
namespace FASTER.core
{
/// <summary>
/// Callback functions to FASTER
/// </summary>
/// <typeparam name="Key"></typeparam>
/// <typeparam name="Value"></typeparam>
/// <typeparam name="Input"></typeparam>
/// <typeparam name="Output"></typeparam>
/// <typeparam name="Context"></typeparam>
public interface IFunctions<Key, Value, Input, Output, Context>
{
/// <summary>
/// Read completion
/// </summary>
/// <param name="key"></param>
/// <param name="input"></param>
/// <param name="output"></param>
/// <param name="ctx"></param>
/// <param name="status"></param>
void ReadCompletionCallback(ref Key key, ref Input input, ref Output output, Context ctx, Status status);
/// <summary>
/// Upsert completion
/// </summary>
/// <param name="key"></param>
/// <param name="value"></param>
/// <param name="ctx"></param>
void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx);
/// <summary>
/// RMW completion
/// </summary>
/// <param name="key"></param>
/// <param name="input"></param>
/// <param name="ctx"></param>
/// <param name="status"></param>
void RMWCompletionCallback(ref Key key, ref Input input, Context ctx, Status status);
/// <summary>
/// Delete completion
/// </summary>
/// <param name="key"></param>
/// <param name="ctx"></param>
void DeleteCompletionCallback(ref Key key, Context ctx);
/// <summary>
/// Checkpoint completion callback (called per client session)
/// </summary>
/// <param name="sessionId">Session ID reporting persistence</param>
/// <param name="serialNum">Checkpoint offset (CPR point) for session</param>
void CheckpointCompletionCallback(Guid sessionId, long serialNum);
/// <summary>
/// Initial update for RMW
/// </summary>
/// <param name="key"></param>
/// <param name="input"></param>
/// <param name="value"></param>
void InitialUpdater(ref Key key, ref Input input, ref Value value);
/// <summary>
/// Copy-update for RMW
/// </summary>
/// <param name="key"></param>
/// <param name="input"></param>
/// <param name="oldValue"></param>
/// <param name="newValue"></param>
void CopyUpdater(ref Key key, ref Input input, ref Value oldValue, ref Value newValue);
/// <summary>
/// In-place update for RMW
/// </summary>
/// <param name="key"></param>
/// <param name="input"></param>
/// <param name="value"></param>
bool InPlaceUpdater(ref Key key, ref Input input, ref Value value);
/// <summary>
/// Single reader
/// </summary>
/// <param name="key"></param>
/// <param name="input"></param>
/// <param name="value"></param>
/// <param name="dst"></param>
void SingleReader(ref Key key, ref Input input, ref Value value, ref Output dst);
/// <summary>
/// Concurrent reader
/// </summary>
/// <param name="key"></param>
/// <param name="input"></param>
/// <param name="value"></param>
/// <param name="dst"></param>
void ConcurrentReader(ref Key key, ref Input input, ref Value value, ref Output dst);
/// <summary>
/// Single writer
/// </summary>
/// <param name="key"></param>
/// <param name="src"></param>
/// <param name="dst"></param>
void SingleWriter(ref Key key, ref Value src, ref Value dst);
/// <summary>
/// Concurrent writer
/// </summary>
/// <param name="key"></param>
/// <param name="src"></param>
/// <param name="dst"></param>
bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst);
}
}
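
The three RMW callbacks cooperate: InitialUpdater seeds a missing record, InPlaceUpdater mutates a record still in the mutable region, and CopyUpdater rebuilds it when the old record is read-only. A hypothetical counter on top of the FunctionsBase helper defined earlier, where Input is the increment and Value/Output are the counter:

using FASTER.core;

class CounterFunctions : FunctionsBase<long, long, long, long, Empty>
{
    public override void InitialUpdater(ref long key, ref long input, ref long value)
        => value = input;                              // first RMW creates the counter
    public override bool InPlaceUpdater(ref long key, ref long input, ref long value)
    { value += input; return true; }                   // mutable-region fast path
    public override void CopyUpdater(ref long key, ref long input, ref long oldValue, ref long newValue)
        => newValue = oldValue + input;                // read-only region: copy forward
    public override void SingleReader(ref long key, ref long input, ref long value, ref long dst)
        => dst = value;
    public override void ConcurrentReader(ref long key, ref long input, ref long value, ref long dst)
        => dst = value;
}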

@ -0,0 +1,111 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.IO;
namespace FASTER.core
{
/// <summary>
/// Object serializer interface
/// </summary>
/// <typeparam name="T"></typeparam>
public interface IObjectSerializer<T>
{
/// <summary>
/// Begin serialization to given stream
/// </summary>
/// <param name="stream"></param>
void BeginSerialize(Stream stream);
/// <summary>
/// Serialize object
/// </summary>
/// <param name="obj"></param>
void Serialize(ref T obj);
/// <summary>
/// End serialization to given stream
/// </summary>
void EndSerialize();
/// <summary>
/// Begin deserialization from given stream
/// </summary>
/// <param name="stream"></param>
void BeginDeserialize(Stream stream);
/// <summary>
/// Deserialize object
/// </summary>
/// <param name="obj"></param>
void Deserialize(ref T obj);
/// <summary>
/// End deserialization from given stream
/// </summary>
void EndDeserialize();
}
/// <summary>
/// Serializer base class for binary reader and writer
/// </summary>
/// <typeparam name="T"></typeparam>
public abstract class BinaryObjectSerializer<T> : IObjectSerializer<T>
{
/// <summary>
/// Binary reader
/// </summary>
protected BinaryReader reader;
/// <summary>
/// Binary writer
/// </summary>
protected BinaryWriter writer;
/// <summary>
/// Begin deserialization
/// </summary>
/// <param name="stream"></param>
public void BeginDeserialize(Stream stream)
{
reader = new BinaryReader(stream);
}
/// <summary>
/// Deserialize
/// </summary>
/// <param name="obj"></param>
public abstract void Deserialize(ref T obj);
/// <summary>
/// End deserialize
/// </summary>
public void EndDeserialize()
{
}
/// <summary>
/// Begin serialize
/// </summary>
/// <param name="stream"></param>
public void BeginSerialize(Stream stream)
{
writer = new BinaryWriter(stream);
}
/// <summary>
/// Serialize
/// </summary>
/// <param name="obj"></param>
public abstract void Serialize(ref T obj);
/// <summary>
/// End serialize
/// </summary>
public void EndSerialize()
{
writer.Dispose();
}
}
}
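
A concrete serializer sketch for a hypothetical value type; BinaryObjectSerializer supplies the reader/writer lifecycle, so only the two per-object methods are needed.

using FASTER.core;

class Person { public string Name; public int Age; }

class PersonSerializer : BinaryObjectSerializer<Person>
{
    public override void Serialize(ref Person obj)
    {
        writer.Write(obj.Name);
        writer.Write(obj.Age);
    }
    public override void Deserialize(ref Person obj)
    {
        obj = new Person { Name = reader.ReadString(), Age = reader.ReadInt32() };
    }
}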

@ -0,0 +1,729 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
//#define WAIT_FOR_INDEX_CHECKPOINT
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Checkpoint related function of FASTER
/// </summary>
public unsafe partial class FasterKV<Key, Value, Input, Output, Context, Functions> : FasterBase, IFasterKV<Key, Value, Input, Output, Context>
where Key : new()
where Value : new()
where Functions : IFunctions<Key, Value, Input, Output, Context>
{
private class EpochPhaseIdx
{
public const int PrepareForIndexCheckpt = 0;
public const int Prepare = 1;
public const int InProgress = 2;
public const int WaitPending = 3;
public const int WaitFlush = 4;
public const int CheckpointCompletionCallback = 5;
}
#region Starting points
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool InternalTakeCheckpoint(CheckpointType type)
{
if (_systemState.phase == Phase.GC)
{
Debug.WriteLine("Forcing completion of GC");
GarbageCollectBuckets(0, true);
}
if (_systemState.phase == Phase.REST)
{
var context = (long)type;
var currentState = SystemState.Make(Phase.REST, _systemState.version);
var nextState = GetNextState(currentState, type);
return GlobalMoveToNextState(currentState, nextState, ref context);
}
else
{
return false;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool InternalGrowIndex()
{
if (_systemState.phase == Phase.GC)
{
Debug.WriteLine("Forcing completion of GC");
GarbageCollectBuckets(0, true);
}
if (_systemState.phase == Phase.REST)
{
var version = _systemState.version;
long context = 0;
SystemState nextState = SystemState.Make(Phase.PREPARE_GROW, version);
if (GlobalMoveToNextState(SystemState.Make(Phase.REST, version), nextState, ref context))
{
return true;
}
}
return false;
}
#endregion
/// <summary>
/// Global transition function that coordinates various state machines.
/// A few characteristics about the state machine:
/// <list type="bullet">
/// <item>
/// <description>
/// Transitions happen atomically using a compare-and-swap operation. So, multiple threads can try to do the same transition. Only one will succeed.
/// </description>
/// </item>
/// <item>
/// <description>
/// Transition from state A to B happens via an intermediate state (INTERMEDIATE). This serves as a lock by a thread to perform the transition.
/// Some transitions are accompanied by actions that must be performed before the transition, such as initializing contexts.
/// </description>
/// </item>
/// <item>
/// <description>
/// States can be part of multiple state machines. For example: PREP_INDEX_CHECKPOINT is part of both index-only and full checkpoints.
/// </description>
/// </item>
/// </list>
///
/// We currently support 5 different state machines:
/// <list type="number">
/// <item>
/// <term> Index-Only Checkpoint </term>
/// <description> REST -> PREP_INDEX_CHECKPOINT -> INDEX_CHECKPOINT -> REST </description>
/// </item>
/// <item>
/// <term>HybridLog-Only Checkpoint</term>
/// <description>REST -> PREPARE -> IN_PROGRESS -> WAIT_PENDING -> WAIT_FLUSH -> PERSISTENCE_CALLBACK -> REST</description>
/// </item>
/// <item>
/// <term>Full Checkpoint</term>
/// <description>REST -> PREP_INDEX_CHECKPOINT -> PREPARE -> IN_PROGRESS -> WAIT_PENDING -> WAIT_FLUSH -> PERSISTENCE_CALLBACK -> REST</description>
/// </item>
/// <item>
/// <term>GC</term>
/// <description></description>
/// </item>
/// <item>
/// <term>Grow</term>
/// <description></description>
/// </item>
/// </list>
/// </summary>
/// <param name="currentState">from state of the transition.</param>
/// <param name="nextState">to state of the transition.</param>
/// <param name="context">optional additioanl parameter for transition.</param>
/// <returns>true if transition succeeds.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool GlobalMoveToNextState(SystemState currentState, SystemState nextState, ref long context)
{
var intermediateState = SystemState.Make(Phase.INTERMEDIATE, currentState.version);
// Move from S1 to I
if (MakeTransition(currentState, intermediateState))
{
// Acquired ownership to make the transition from S1 to S2
switch (nextState.phase)
{
case Phase.PREP_INDEX_CHECKPOINT:
{
_checkpointType = (CheckpointType)context;
switch (_checkpointType)
{
case CheckpointType.INDEX_ONLY:
{
_indexCheckpointToken = Guid.NewGuid();
InitializeIndexCheckpoint(_indexCheckpointToken);
break;
}
case CheckpointType.FULL:
{
var fullCheckpointToken = Guid.NewGuid();
_indexCheckpointToken = fullCheckpointToken;
_hybridLogCheckpointToken = fullCheckpointToken;
InitializeIndexCheckpoint(_indexCheckpointToken);
InitializeHybridLogCheckpoint(_hybridLogCheckpointToken, currentState.version);
break;
}
default:
throw new Exception();
}
ObtainCurrentTailAddress(ref _indexCheckpoint.info.startLogicalAddress);
MakeTransition(intermediateState, nextState);
break;
}
case Phase.INDEX_CHECKPOINT:
{
if (UseReadCache && this.ReadCache.BeginAddress != this.ReadCache.TailAddress)
{
throw new Exception("Index checkpoint with read cache is not supported");
}
TakeIndexFuzzyCheckpoint();
MakeTransition(intermediateState, nextState);
break;
}
case Phase.PREPARE:
{
switch (currentState.phase)
{
case Phase.REST:
{
_checkpointType = (CheckpointType)context;
Debug.Assert(_checkpointType == CheckpointType.HYBRID_LOG_ONLY);
_hybridLogCheckpointToken = Guid.NewGuid();
InitializeHybridLogCheckpoint(_hybridLogCheckpointToken, currentState.version);
break;
}
case Phase.PREP_INDEX_CHECKPOINT:
{
if (UseReadCache && this.ReadCache.BeginAddress != this.ReadCache.TailAddress)
{
throw new Exception("Index checkpoint with read cache is not supported");
}
TakeIndexFuzzyCheckpoint();
break;
}
default:
throw new Exception();
}
ObtainCurrentTailAddress(ref _hybridLogCheckpoint.info.startLogicalAddress);
if (!FoldOverSnapshot)
{
_hybridLogCheckpoint.info.flushedLogicalAddress = hlog.FlushedUntilAddress;
_hybridLogCheckpoint.info.useSnapshotFile = 1;
}
MakeTransition(intermediateState, nextState);
break;
}
case Phase.IN_PROGRESS:
{
MakeTransition(intermediateState, nextState);
break;
}
case Phase.WAIT_PENDING:
{
var seg = hlog.GetSegmentOffsets();
if (seg != null)
{
_hybridLogCheckpoint.info.objectLogSegmentOffsets = new long[seg.Length];
Array.Copy(seg, _hybridLogCheckpoint.info.objectLogSegmentOffsets, seg.Length);
}
MakeTransition(intermediateState, nextState);
break;
}
case Phase.WAIT_FLUSH:
{
if (_checkpointType == CheckpointType.FULL)
{
_indexCheckpoint.info.num_buckets = overflowBucketsAllocator.GetMaxValidAddress();
ObtainCurrentTailAddress(ref _indexCheckpoint.info.finalLogicalAddress);
}
_hybridLogCheckpoint.info.headAddress = hlog.HeadAddress;
_hybridLogCheckpoint.info.beginAddress = hlog.BeginAddress;
if (FoldOverSnapshot)
{
hlog.ShiftReadOnlyToTail(out long tailAddress);
_hybridLogCheckpoint.info.finalLogicalAddress = tailAddress;
}
else
{
ObtainCurrentTailAddress(ref _hybridLogCheckpoint.info.finalLogicalAddress);
_hybridLogCheckpoint.snapshotFileDevice = checkpointManager.GetSnapshotLogDevice(_hybridLogCheckpointToken);
_hybridLogCheckpoint.snapshotFileObjectLogDevice = checkpointManager.GetSnapshotObjectLogDevice(_hybridLogCheckpointToken);
_hybridLogCheckpoint.snapshotFileDevice.Initialize(hlog.GetSegmentSize());
_hybridLogCheckpoint.snapshotFileObjectLogDevice.Initialize(hlog.GetSegmentSize());
long startPage = hlog.GetPage(_hybridLogCheckpoint.info.flushedLogicalAddress);
long endPage = hlog.GetPage(_hybridLogCheckpoint.info.finalLogicalAddress);
if (_hybridLogCheckpoint.info.finalLogicalAddress > hlog.GetStartLogicalAddress(endPage))
{
endPage++;
}
// This can be run on a new thread if we want to immediately parallelize
// the rest of the log flush
hlog.AsyncFlushPagesToDevice(startPage,
endPage,
_hybridLogCheckpoint.info.finalLogicalAddress,
_hybridLogCheckpoint.snapshotFileDevice,
_hybridLogCheckpoint.snapshotFileObjectLogDevice,
out _hybridLogCheckpoint.flushed);
}
MakeTransition(intermediateState, nextState);
break;
}
case Phase.PERSISTENCE_CALLBACK:
{
WriteHybridLogMetaInfo();
if (_checkpointType == CheckpointType.FULL)
WriteIndexMetaInfo();
MakeTransition(intermediateState, nextState);
break;
}
case Phase.GC:
{
hlog.ShiftBeginAddress(context);
int numChunks = (int)(state[resizeInfo.version].size / Constants.kSizeofChunk);
if (numChunks == 0) numChunks = 1; // at least one chunk
numPendingChunksToBeGCed = numChunks;
gcStatus = new long[numChunks];
MakeTransition(intermediateState, nextState);
break;
}
case Phase.PREPARE_GROW:
{
// Note that the transition must be done before bumping epoch here!
MakeTransition(intermediateState, nextState);
epoch.BumpCurrentEpoch(() =>
{
long _context = 0;
GlobalMoveToNextState(nextState, SystemState.Make(Phase.IN_PROGRESS_GROW, nextState.version), ref _context);
});
break;
}
case Phase.IN_PROGRESS_GROW:
{
// Set up the transition to new version of HT
int numChunks = (int)(state[resizeInfo.version].size / Constants.kSizeofChunk);
if (numChunks == 0) numChunks = 1; // at least one chunk
numPendingChunksToBeSplit = numChunks;
splitStatus = new long[numChunks];
Initialize(1 - resizeInfo.version, state[resizeInfo.version].size * 2, sectorSize);
resizeInfo.version = 1 - resizeInfo.version;
MakeTransition(intermediateState, nextState);
break;
}
case Phase.REST:
{
switch (_checkpointType)
{
case CheckpointType.INDEX_ONLY:
{
_indexCheckpoint.info.num_buckets = overflowBucketsAllocator.GetMaxValidAddress();
ObtainCurrentTailAddress(ref _indexCheckpoint.info.finalLogicalAddress);
WriteIndexMetaInfo();
_indexCheckpoint.Reset();
break;
}
case CheckpointType.FULL:
{
_indexCheckpoint.Reset();
_hybridLogCheckpoint.Reset();
break;
}
case CheckpointType.HYBRID_LOG_ONLY:
{
_hybridLogCheckpoint.Reset();
break;
}
case CheckpointType.NONE:
break;
default:
throw new Exception();
}
_checkpointType = CheckpointType.NONE;
MakeTransition(intermediateState, nextState);
break;
}
}
return true;
}
else
{
return false;
}
}
/// <summary>
/// Corresponding thread-local actions that must be performed when any state machine is active
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void HandleCheckpointingPhases()
{
var previousState = SystemState.Make(threadCtx.Value.phase, threadCtx.Value.version);
var finalState = SystemState.Copy(ref _systemState);
// Don't play around when system state is being changed
if (finalState.phase == Phase.INTERMEDIATE)
{
return;
}
// We need to move from previousState to finalState one step at a time
do
{
var currentState = default(SystemState);
if (previousState.word == finalState.word)
{
currentState.word = previousState.word;
}
else
{
currentState = GetNextState(previousState, _checkpointType);
}
switch (currentState.phase)
{
case Phase.PREP_INDEX_CHECKPOINT:
{
if (!threadCtx.Value.markers[EpochPhaseIdx.PrepareForIndexCheckpt])
{
if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.PrepareForIndexCheckpt, threadCtx.Value.version))
{
GlobalMoveToNextCheckpointState(currentState);
}
threadCtx.Value.markers[EpochPhaseIdx.PrepareForIndexCheckpt] = true;
}
break;
}
case Phase.INDEX_CHECKPOINT:
{
if (_checkpointType == CheckpointType.INDEX_ONLY)
{
// Resetting the marker for a potential FULL or INDEX_ONLY checkpoint in the future
threadCtx.Value.markers[EpochPhaseIdx.PrepareForIndexCheckpt] = false;
}
if (IsIndexFuzzyCheckpointCompleted())
{
GlobalMoveToNextCheckpointState(currentState);
}
break;
}
case Phase.PREPARE:
{
if (!threadCtx.Value.markers[EpochPhaseIdx.Prepare])
{
// Thread local action
AcquireSharedLatchesForAllPendingRequests();
var idx = Interlocked.Increment(ref _hybridLogCheckpoint.info.numThreads);
idx -= 1;
_hybridLogCheckpoint.info.guids[idx] = threadCtx.Value.guid;
if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.Prepare, threadCtx.Value.version))
{
GlobalMoveToNextCheckpointState(currentState);
}
threadCtx.Value.markers[EpochPhaseIdx.Prepare] = true;
}
break;
}
case Phase.IN_PROGRESS:
{
// Need to be very careful here as threadCtx is changing
FasterExecutionContext ctx;
if (previousState.phase == Phase.PREPARE)
{
ctx = threadCtx.Value;
}
else
{
ctx = prevThreadCtx.Value;
}
if (!ctx.markers[EpochPhaseIdx.InProgress])
{
prevThreadCtx.Value = threadCtx.Value;
InitLocalContext(prevThreadCtx.Value.guid);
if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.InProgress, ctx.version))
{
GlobalMoveToNextCheckpointState(currentState);
}
prevThreadCtx.Value.markers[EpochPhaseIdx.InProgress] = true;
}
break;
}
case Phase.WAIT_PENDING:
{
if (!prevThreadCtx.Value.markers[EpochPhaseIdx.WaitPending])
{
var notify = (prevThreadCtx.Value.ioPendingRequests.Count == 0);
notify = notify && (prevThreadCtx.Value.retryRequests.Count == 0);
if (notify)
{
if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.WaitPending, threadCtx.Value.version))
{
GlobalMoveToNextCheckpointState(currentState);
}
prevThreadCtx.Value.markers[EpochPhaseIdx.WaitPending] = true;
}
}
break;
}
case Phase.WAIT_FLUSH:
{
if (!prevThreadCtx.Value.markers[EpochPhaseIdx.WaitFlush])
{
var notify = false;
if (FoldOverSnapshot)
{
notify = (hlog.FlushedUntilAddress >= _hybridLogCheckpoint.info.finalLogicalAddress);
}
else
{
notify = (_hybridLogCheckpoint.flushed != null) && _hybridLogCheckpoint.flushed.IsSet;
}
if (_checkpointType == CheckpointType.FULL)
{
notify = notify && IsIndexFuzzyCheckpointCompleted();
}
if (notify)
{
_hybridLogCheckpoint.info.checkpointTokens.TryAdd(prevThreadCtx.Value.guid, prevThreadCtx.Value.serialNum);
if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.WaitFlush, prevThreadCtx.Value.version))
{
GlobalMoveToNextCheckpointState(currentState);
}
prevThreadCtx.Value.markers[EpochPhaseIdx.WaitFlush] = true;
}
}
break;
}
case Phase.PERSISTENCE_CALLBACK:
{
if (!prevThreadCtx.Value.markers[EpochPhaseIdx.CheckpointCompletionCallback])
{
// Thread local action
functions.CheckpointCompletionCallback(threadCtx.Value.guid, prevThreadCtx.Value.serialNum);
if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.CheckpointCompletionCallback, prevThreadCtx.Value.version))
{
GlobalMoveToNextCheckpointState(currentState);
}
prevThreadCtx.Value.markers[EpochPhaseIdx.CheckpointCompletionCallback] = true;
}
break;
}
case Phase.REST:
{
break;
}
default:
Debug.WriteLine("Error!");
break;
}
// update thread local variables
threadCtx.Value.phase = currentState.phase;
threadCtx.Value.version = currentState.version;
previousState.word = currentState.word;
} while (previousState.word != finalState.word);
}
#region Helper functions
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool GlobalMoveToNextCheckpointState(SystemState currentState)
{
long context = 0;
return GlobalMoveToNextState(currentState, GetNextState(currentState, _checkpointType), ref context);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool MakeTransition(SystemState currentState, SystemState nextState)
{
// Move from I to P2
if (Interlocked.CompareExchange(ref _systemState.word, nextState.word, currentState.word) == currentState.word)
{
Debug.WriteLine("Moved to {0}, {1}", nextState.phase, nextState.version);
return true;
}
else
{
return false;
}
}
private void AcquireSharedLatchesForAllPendingRequests()
{
foreach (var ctx in threadCtx.Value.retryRequests)
{
AcquireSharedLatch(ctx.key.Get());
}
foreach (var ctx in threadCtx.Value.ioPendingRequests.Values)
{
AcquireSharedLatch(ctx.key.Get());
}
}
/*
* We have several state machines supported by this function.
* Full Checkpoint:
* REST -> PREP_INDEX_CHECKPOINT -> PREPARE -> IN_PROGRESS
* -> WAIT_PENDING -> WAIT_FLUSH -> PERSISTENCE_CALLBACK -> REST
*
* Index Checkpoint:
* REST -> PREP_INDEX_CHECKPOINT -> INDEX_CHECKPOINT -> REST
*
* Hybrid Log Checkpoint:
* REST -> PREPARE -> IN_PROGRESS -> WAIT_PENDING -> WAIT_FLUSH ->
* -> PERSISTENCE_CALLBACK -> REST
*
* Grow :
* REST -> PREPARE_GROW -> IN_PROGRESS_GROW -> REST
*
* GC:
* REST -> GC -> REST
*/
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private SystemState GetNextState(SystemState start, CheckpointType type = CheckpointType.FULL)
{
var nextState = default(SystemState);
nextState.word = start.word;
switch (start.phase)
{
case Phase.REST:
switch (type)
{
case CheckpointType.HYBRID_LOG_ONLY:
nextState.phase = Phase.PREPARE;
break;
case CheckpointType.FULL:
case CheckpointType.INDEX_ONLY:
nextState.phase = Phase.PREP_INDEX_CHECKPOINT;
break;
}
break;
case Phase.PREP_INDEX_CHECKPOINT:
switch (type)
{
case CheckpointType.INDEX_ONLY:
nextState.phase = Phase.INDEX_CHECKPOINT;
break;
case CheckpointType.FULL:
nextState.phase = Phase.PREPARE;
break;
}
break;
case Phase.INDEX_CHECKPOINT:
switch(type)
{
case CheckpointType.FULL:
nextState.phase = Phase.PREPARE;
break;
default:
nextState.phase = Phase.REST;
break;
}
break;
case Phase.PREPARE:
nextState.phase = Phase.IN_PROGRESS;
nextState.version = start.version + 1;
break;
case Phase.IN_PROGRESS:
nextState.phase = Phase.WAIT_PENDING;
break;
case Phase.WAIT_PENDING:
nextState.phase = Phase.WAIT_FLUSH;
break;
case Phase.WAIT_FLUSH:
nextState.phase = Phase.PERSISTENCE_CALLBACK;
break;
case Phase.PERSISTENCE_CALLBACK:
nextState.phase = Phase.REST;
break;
case Phase.GC:
nextState.phase = Phase.REST;
break;
case Phase.PREPARE_GROW:
nextState.phase = Phase.IN_PROGRESS_GROW;
break;
case Phase.IN_PROGRESS_GROW:
nextState.phase = Phase.REST;
break;
}
return nextState;
}
private void WriteHybridLogMetaInfo()
{
checkpointManager.CommitLogCheckpoint(_hybridLogCheckpointToken, _hybridLogCheckpoint.info.ToByteArray());
}
private void WriteIndexMetaInfo()
{
checkpointManager.CommitIndexCheckpoint(_indexCheckpointToken, _indexCheckpoint.info.ToByteArray());
}
private bool ObtainCurrentTailAddress(ref long location)
{
var tailAddress = hlog.GetTailAddress();
return Interlocked.CompareExchange(ref location, tailAddress, 0) == 0;
}
private void InitializeIndexCheckpoint(Guid indexToken)
{
_indexCheckpoint.Initialize(indexToken, state[resizeInfo.version].size, checkpointManager);
}
private void InitializeHybridLogCheckpoint(Guid hybridLogToken, int version)
{
_hybridLogCheckpoint.Initialize(hybridLogToken, version, checkpointManager);
}
#endregion
}
}
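
The whole state machine hinges on MakeTransition's single compare-and-swap over a packed word, with INTERMEDIATE acting as a lock while one thread runs the transition's side effects. A minimal standalone sketch of that pattern (the real SystemState layout is assumed, not shown in this diff):

using System.Threading;

// Hypothetical packed state: low bits = phase, high bits = version.
struct PackedState
{
    public long word;
    public static PackedState Make(int phase, int version)
        => new PackedState { word = ((long)version << 32) | (uint)phase };
}

class StateMachine
{
    const int REST = 0, INTERMEDIATE = 1, PREPARE = 2;
    long systemWord = PackedState.Make(REST, 1).word;

    bool MakeTransition(long expected, long next)
        => Interlocked.CompareExchange(ref systemWord, next, expected) == expected;

    public bool MoveRestToPrepare(int version)
    {
        long rest = PackedState.Make(REST, version).word;
        long inter = PackedState.Make(INTERMEDIATE, version).word;
        if (!MakeTransition(rest, inter)) return false;  // another thread owns the transition
        // ... perform the transition's one-time setup here ...
        return MakeTransition(inter, PackedState.Make(PREPARE, version).word);
    }
}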

@ -0,0 +1,134 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.IO;
namespace FASTER.core
{
class DirectoryConfiguration
{
private readonly string checkpointDir;
public DirectoryConfiguration(string checkpointDir)
{
this.checkpointDir = checkpointDir;
}
public const string index_base_folder = "index-checkpoints";
public const string index_meta_file = "info";
public const string hash_table_file = "ht";
public const string overflow_buckets_file = "ofb";
public const string snapshot_file = "snapshot";
public const string cpr_base_folder = "cpr-checkpoints";
public const string cpr_meta_file = "info";
public void CreateIndexCheckpointFolder(Guid token)
{
var directory = GetIndexCheckpointFolder(token);
Directory.CreateDirectory(directory);
DirectoryInfo directoryInfo = new DirectoryInfo(directory);
foreach (FileInfo file in directoryInfo.GetFiles())
file.Delete();
}
public void CreateHybridLogCheckpointFolder(Guid token)
{
var directory = GetHybridLogCheckpointFolder(token);
Directory.CreateDirectory(directory);
DirectoryInfo directoryInfo = new DirectoryInfo(directory);
foreach (FileInfo file in directoryInfo.GetFiles())
file.Delete();
}
public string GetIndexCheckpointFolder(Guid token = default(Guid))
{
if (token != default(Guid))
return GetMergedFolderPath(checkpointDir, index_base_folder, token.ToString());
else
return GetMergedFolderPath(checkpointDir, index_base_folder);
}
public string GetHybridLogCheckpointFolder(Guid token = default(Guid))
{
if (token != default(Guid))
return GetMergedFolderPath(checkpointDir, cpr_base_folder, token.ToString());
else
return GetMergedFolderPath(checkpointDir, cpr_base_folder);
}
public string GetIndexCheckpointMetaFileName(Guid token)
{
return GetMergedFolderPath(checkpointDir,
index_base_folder,
token.ToString(),
index_meta_file,
".dat");
}
public string GetPrimaryHashTableFileName(Guid token)
{
return GetMergedFolderPath(checkpointDir,
index_base_folder,
token.ToString(),
hash_table_file,
".dat");
}
public string GetOverflowBucketsFileName(Guid token)
{
return GetMergedFolderPath(checkpointDir,
index_base_folder,
token.ToString(),
overflow_buckets_file,
".dat");
}
public string GetHybridLogCheckpointMetaFileName(Guid token)
{
return GetMergedFolderPath(checkpointDir,
cpr_base_folder,
token.ToString(),
cpr_meta_file,
".dat");
}
public string GetHybridLogCheckpointContextFileName(Guid checkpointToken, Guid sessionToken)
{
return GetMergedFolderPath(checkpointDir,
cpr_base_folder,
checkpointToken.ToString(),
sessionToken.ToString(),
".dat");
}
public string GetLogSnapshotFileName(Guid token)
{
return GetMergedFolderPath(checkpointDir, cpr_base_folder, token.ToString(), snapshot_file, ".dat");
}
public string GetObjectLogSnapshotFileName(Guid token)
{
return GetMergedFolderPath(checkpointDir, cpr_base_folder, token.ToString(), snapshot_file, ".obj.dat");
}
private static string GetMergedFolderPath(params String[] paths)
{
String fullPath = paths[0];
for (int i = 1; i < paths.Length; i++)
{
if (i == paths.Length - 1 && paths[i].Contains("."))
{
fullPath += paths[i];
}
else
{
fullPath += Path.DirectorySeparatorChar + paths[i];
}
}
return fullPath;
}
}
}
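
GetMergedFolderPath joins segments with the directory separator, except that a final segment containing a dot is appended directly, acting as a file extension. So the index metadata file for token T resolves to checkpointDir/index-checkpoints/T/info.dat. A standalone replica of that rule, for illustration only:

using System.IO;

static class MergedPathDemo
{
    // Mirrors the joining rule above: last dotted segment is appended, not separated.
    static string Merge(params string[] paths)
    {
        string full = paths[0];
        for (int i = 1; i < paths.Length; i++)
            full += (i == paths.Length - 1 && paths[i].Contains("."))
                ? paths[i]
                : Path.DirectorySeparatorChar + paths[i];
        return full;
    }

    static void Main()
    {
        // Prints e.g. ckpt/index-checkpoints/TOKEN/info.dat on Unix-like systems
        System.Console.WriteLine(Merge("ckpt", "index-checkpoints", "TOKEN", "info", ".dat"));
    }
}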

@ -0,0 +1,111 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Interface for users to control creation and retrieval of checkpoint-related data
/// FASTER calls this interface during checkpoint/recovery in this sequence:
///
/// Checkpoint:
/// InitializeIndexCheckpoint (for index checkpoints) ->
/// GetIndexDevice (for index checkpoints) ->
/// InitializeLogCheckpoint (for log checkpoints) ->
/// GetSnapshotLogDevice (for log checkpoints in snapshot mode) ->
/// GetSnapshotObjectLogDevice (for log checkpoints in snapshot mode with objects) ->
/// CommitLogCheckpoint (for log checkpoints) ->
/// CommitIndexCheckpoint (for index checkpoints) ->
///
/// Recovery:
/// GetLatestCheckpoint (if request to recover to latest checkpoint) ->
/// GetIndexCommitMetadata ->
/// GetLogCommitMetadata ->
/// GetIndexDevice ->
/// GetSnapshotLogDevice (for recovery in snapshot mode) ->
/// GetSnapshotObjectLogDevice (for recovery in snapshot mode with objects)
///
/// Provided devices will be closed directly by FASTER when done.
/// </summary>
public interface ICheckpointManager
{
/// <summary>
/// Initialize index checkpoint
/// </summary>
/// <param name="indexToken"></param>
void InitializeIndexCheckpoint(Guid indexToken);
/// <summary>
/// Initialize log checkpoint (snapshot and fold-over)
/// </summary>
/// <param name="logToken"></param>
void InitializeLogCheckpoint(Guid logToken);
/// <summary>
/// Commit index checkpoint
/// </summary>
/// <param name="indexToken"></param>
/// <param name="commitMetadata"></param>
/// <returns></returns>
void CommitIndexCheckpoint(Guid indexToken, byte[] commitMetadata);
/// <summary>
/// Commit log checkpoint (snapshot and fold-over)
/// </summary>
/// <param name="logToken"></param>
/// <param name="commitMetadata"></param>
/// <returns></returns>
void CommitLogCheckpoint(Guid logToken, byte[] commitMetadata);
/// <summary>
/// Retrieve commit metadata for specified index checkpoint
/// </summary>
/// <param name="indexToken">Token</param>
/// <returns>Metadata, or null if invalid</returns>
byte[] GetIndexCommitMetadata(Guid indexToken);
/// <summary>
/// Retrieve commit metadata for specified log checkpoint
/// </summary>
/// <param name="logToken">Token</param>
/// <returns>Metadata, or null if invalid</returns>
byte[] GetLogCommitMetadata(Guid logToken);
/// <summary>
/// Provide device to store index checkpoint (including overflow buckets)
/// </summary>
/// <param name="indexToken"></param>
/// <returns></returns>
IDevice GetIndexDevice(Guid indexToken);
/// <summary>
/// Provide device to store snapshot of log (required only for snapshot checkpoints)
/// </summary>
/// <param name="token"></param>
/// <returns></returns>
IDevice GetSnapshotLogDevice(Guid token);
/// <summary>
/// Provide device to store snapshot of object log (required only for snapshot checkpoints)
/// </summary>
/// <param name="token"></param>
/// <returns></returns>
IDevice GetSnapshotObjectLogDevice(Guid token);
/// <summary>
/// Get latest valid checkpoint for recovery
/// </summary>
/// <param name="indexToken"></param>
/// <param name="logToken"></param>
/// <returns>true if latest valid checkpoint found, false otherwise</returns>
bool GetLatestCheckpoint(out Guid indexToken, out Guid logToken);
}
}
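
A minimal in-memory sketch of this contract, useful for tests. Devices still come from Devices.CreateLogDevice, since the interface must hand FASTER real IDevice instances; this is an illustration, not the shipped LocalCheckpointManager that follows.

using System;
using System.Collections.Concurrent;
using FASTER.core;

class InMemoryCheckpointManager : ICheckpointManager
{
    private readonly ConcurrentDictionary<Guid, byte[]> index = new ConcurrentDictionary<Guid, byte[]>();
    private readonly ConcurrentDictionary<Guid, byte[]> log = new ConcurrentDictionary<Guid, byte[]>();
    private Guid lastIndex = Guid.Empty, lastLog = Guid.Empty;

    public void InitializeIndexCheckpoint(Guid indexToken) { }
    public void InitializeLogCheckpoint(Guid logToken) { }
    public void CommitIndexCheckpoint(Guid t, byte[] m) { index[t] = m; lastIndex = t; }
    public void CommitLogCheckpoint(Guid t, byte[] m) { log[t] = m; lastLog = t; }
    public byte[] GetIndexCommitMetadata(Guid t) => index.TryGetValue(t, out var m) ? m : null;
    public byte[] GetLogCommitMetadata(Guid t) => log.TryGetValue(t, out var m) ? m : null;
    public IDevice GetIndexDevice(Guid t) => Devices.CreateLogDevice($"index.{t}.dat", false);
    public IDevice GetSnapshotLogDevice(Guid t) => Devices.CreateLogDevice($"snap.{t}.dat", false);
    public IDevice GetSnapshotObjectLogDevice(Guid t) => Devices.CreateLogDevice($"snap.{t}.obj.dat", false);
    public bool GetLatestCheckpoint(out Guid indexToken, out Guid logToken)
    { indexToken = lastIndex; logToken = lastLog; return lastLog != Guid.Empty; }
}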

@ -0,0 +1,133 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace FASTER.core
{
public unsafe partial class FasterBase
{
// Derived class facing persistence API
internal IndexCheckpointInfo _indexCheckpoint;
internal void TakeIndexFuzzyCheckpoint()
{
var ht_version = resizeInfo.version;
TakeMainIndexCheckpoint(ht_version,
_indexCheckpoint.main_ht_device,
out ulong ht_num_bytes_written);
var sectorSize = _indexCheckpoint.main_ht_device.SectorSize;
var alignedIndexSize = (uint)((ht_num_bytes_written + (sectorSize - 1)) & ~(sectorSize - 1));
overflowBucketsAllocator.TakeCheckpoint(_indexCheckpoint.main_ht_device, alignedIndexSize, out ulong ofb_num_bytes_written);
_indexCheckpoint.info.num_ht_bytes = ht_num_bytes_written;
_indexCheckpoint.info.num_ofb_bytes = ofb_num_bytes_written;
}
internal void TakeIndexFuzzyCheckpoint(int ht_version, IDevice device,
out ulong numBytesWritten, IDevice ofbdevice,
out ulong ofbnumBytesWritten, out int num_ofb_buckets)
{
TakeMainIndexCheckpoint(ht_version, device, out numBytesWritten);
var sectorSize = device.SectorSize;
var alignedIndexSize = (uint)((numBytesWritten + (sectorSize - 1)) & ~(sectorSize - 1));
overflowBucketsAllocator.TakeCheckpoint(ofbdevice, alignedIndexSize, out ofbnumBytesWritten);
num_ofb_buckets = overflowBucketsAllocator.GetMaxValidAddress();
}
internal bool IsIndexFuzzyCheckpointCompleted(bool waitUntilComplete = false)
{
bool completed1 = IsMainIndexCheckpointCompleted(waitUntilComplete);
bool completed2 = overflowBucketsAllocator.IsCheckpointCompleted(waitUntilComplete);
return completed1 && completed2;
}
// Implementation of an asynchronous checkpointing scheme
// for main hash index of FASTER
private CountdownEvent mainIndexCheckpointEvent;
private void TakeMainIndexCheckpoint(int tableVersion,
IDevice device,
out ulong numBytes)
{
BeginMainIndexCheckpoint(tableVersion, device, out numBytes);
}
private void BeginMainIndexCheckpoint(
int version,
IDevice device,
out ulong numBytesWritten)
{
int numChunks = 1;
long totalSize = state[version].size * sizeof(HashBucket);
Debug.Assert(totalSize < (long)uint.MaxValue); // required since numChunks = 1
uint chunkSize = (uint)(totalSize / numChunks);
mainIndexCheckpointEvent = new CountdownEvent(numChunks);
HashBucket* start = state[version].tableAligned;
numBytesWritten = 0;
for (int index = 0; index < numChunks; index++)
{
long chunkStartBucket = (long)start + (index * chunkSize);
HashIndexPageAsyncFlushResult result = default(HashIndexPageAsyncFlushResult);
result.chunkIndex = index;
device.WriteAsync((IntPtr)chunkStartBucket, numBytesWritten, chunkSize, AsyncPageFlushCallback, result);
numBytesWritten += chunkSize;
}
}
private bool IsMainIndexCheckpointCompleted(bool waitUntilComplete = false)
{
bool completed = mainIndexCheckpointEvent.IsSet;
if (!completed && waitUntilComplete)
{
mainIndexCheckpointEvent.Wait();
return true;
}
return completed;
}
private void AsyncPageFlushCallback(
uint errorCode,
uint numBytes,
NativeOverlapped* overlap)
{
// Set the page status to flushed
var result = (HashIndexPageAsyncFlushResult)Overlapped.Unpack(overlap).AsyncResult;
try
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
}
catch (Exception ex)
{
Trace.TraceError("Completion Callback error, {0}", ex.Message);
}
finally
{
mainIndexCheckpointEvent.Signal();
Overlapped.Free(overlap);
}
}
}
}

@ -0,0 +1,144 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace FASTER.core
{
/// <summary>
/// Fuzzy index recovery for the FASTER main hash index and overflow buckets
/// </summary>
public unsafe partial class FasterBase
{
internal ICheckpointManager checkpointManager;
// Derived class exposed API
internal void RecoverFuzzyIndex(IndexCheckpointInfo info)
{
var token = info.info.token;
var ht_version = resizeInfo.version;
Debug.Assert(state[ht_version].size == info.info.table_size);
// Create devices to read from using Async API
info.main_ht_device = checkpointManager.GetIndexDevice(token);
BeginMainIndexRecovery(ht_version,
info.main_ht_device,
info.info.num_ht_bytes);
var sectorSize = info.main_ht_device.SectorSize;
var alignedIndexSize = (uint)((info.info.num_ht_bytes + (sectorSize - 1)) & ~(sectorSize - 1));
overflowBucketsAllocator.Recover(info.main_ht_device, alignedIndexSize, info.info.num_buckets, info.info.num_ofb_bytes);
// Wait until reading is complete
IsFuzzyIndexRecoveryComplete(true);
// close index checkpoint files appropriately
info.main_ht_device.Close();
// Delete all tentative entries!
DeleteTentativeEntries();
}
internal void RecoverFuzzyIndex(int ht_version, IDevice device, ulong num_ht_bytes, IDevice ofbdevice, int num_buckets, ulong num_ofb_bytes)
{
BeginMainIndexRecovery(ht_version, device, num_ht_bytes);
var sectorSize = device.SectorSize;
var alignedIndexSize = (uint)((num_ht_bytes + (sectorSize - 1)) & ~(sectorSize - 1));
overflowBucketsAllocator.Recover(ofbdevice, alignedIndexSize, num_buckets, num_ofb_bytes);
}
internal bool IsFuzzyIndexRecoveryComplete(bool waitUntilComplete = false)
{
bool completed1 = IsMainIndexRecoveryCompleted(waitUntilComplete);
bool completed2 = overflowBucketsAllocator.IsRecoveryCompleted(waitUntilComplete);
return completed1 && completed2;
}
// Main index recovery functions
private CountdownEvent mainIndexRecoveryEvent;
private void BeginMainIndexRecovery(
int version,
IDevice device,
ulong num_bytes)
{
int numChunksToBeRecovered = 1;
long totalSize = state[version].size * sizeof(HashBucket);
Debug.Assert(totalSize < (long)uint.MaxValue); // required since numChunks = 1
uint chunkSize = (uint)(totalSize / numChunksToBeRecovered);
mainIndexRecoveryEvent = new CountdownEvent(numChunksToBeRecovered);
HashBucket* start = state[version].tableAligned;
ulong numBytesRead = 0;
for (int index = 0; index < numChunksToBeRecovered; index++)
{
long chunkStartBucket = (long)start + (index * chunkSize);
HashIndexPageAsyncReadResult result = default(HashIndexPageAsyncReadResult);
result.chunkIndex = index;
device.ReadAsync(numBytesRead, (IntPtr)chunkStartBucket, chunkSize, AsyncPageReadCallback, result);
numBytesRead += chunkSize;
}
Debug.Assert(numBytesRead == num_bytes);
}
private bool IsMainIndexRecoveryCompleted(
bool waitUntilComplete = false)
{
bool completed = mainIndexRecoveryEvent.IsSet;
if (!completed && waitUntilComplete)
{
mainIndexRecoveryEvent.Wait();
return true;
}
return completed;
}
private unsafe void AsyncPageReadCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
mainIndexRecoveryEvent.Signal();
Overlapped.Free(overlap);
}
internal void DeleteTentativeEntries()
{
HashBucketEntry entry = default(HashBucketEntry);
int version = resizeInfo.version;
var table_size_ = state[version].size;
var ptable_ = state[version].tableAligned;
for (long bucket = 0; bucket < table_size_; ++bucket)
{
HashBucket b = *(ptable_ + bucket);
while (true)
{
for (int bucket_entry = 0; bucket_entry < Constants.kOverflowBucketIndex; ++bucket_entry)
{
entry.word = b.bucket_entries[bucket_entry];
if (entry.Tentative)
b.bucket_entries[bucket_entry] = 0;
}
if (b.bucket_entries[Constants.kOverflowBucketIndex] == 0) break;
b = *((HashBucket*)overflowBucketsAllocator.GetPhysicalAddress((b.bucket_entries[Constants.kOverflowBucketIndex])));
}
}
}
}
}

@ -0,0 +1,206 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Implementation of checkpoint interface for local file storage
/// </summary>
public class LocalCheckpointManager : ICheckpointManager
{
private DirectoryConfiguration directoryConfiguration;
/// <summary>
/// Create new instance of local checkpoint manager at given base directory
/// </summary>
/// <param name="CheckpointDir"></param>
public LocalCheckpointManager(string CheckpointDir)
{
directoryConfiguration = new DirectoryConfiguration(CheckpointDir);
}
/// <summary>
/// Initialize index checkpoint
/// </summary>
/// <param name="indexToken"></param>
public void InitializeIndexCheckpoint(Guid indexToken)
{
directoryConfiguration.CreateIndexCheckpointFolder(indexToken);
}
/// <summary>
/// Initialize log checkpoint (snapshot and fold-over)
/// </summary>
/// <param name="logToken"></param>
public void InitializeLogCheckpoint(Guid logToken)
{
directoryConfiguration.CreateHybridLogCheckpointFolder(logToken);
}
/// <summary>
/// Commit index checkpoint
/// </summary>
/// <param name="indexToken"></param>
/// <param name="commitMetadata"></param>
public void CommitIndexCheckpoint(Guid indexToken, byte[] commitMetadata)
{
string filename = directoryConfiguration.GetIndexCheckpointMetaFileName(indexToken);
using (var writer = new BinaryWriter(new FileStream(filename, FileMode.Create)))
{
writer.Write(commitMetadata.Length);
writer.Write(commitMetadata);
writer.Flush();
}
string completed_filename = directoryConfiguration.GetIndexCheckpointFolder(indexToken);
completed_filename += Path.DirectorySeparatorChar + "completed.dat";
using (var file = new FileStream(completed_filename, FileMode.Create))
{
file.Flush();
}
}
/// <summary>
/// Commit log checkpoint (snapshot and fold-over)
/// </summary>
/// <param name="logToken"></param>
/// <param name="commitMetadata"></param>
public void CommitLogCheckpoint(Guid logToken, byte[] commitMetadata)
{
string filename = directoryConfiguration.GetHybridLogCheckpointMetaFileName(logToken);
using (var writer = new BinaryWriter(new FileStream(filename, FileMode.Create)))
{
writer.Write(commitMetadata.Length);
writer.Write(commitMetadata);
writer.Flush();
}
string completed_filename = directoryConfiguration.GetHybridLogCheckpointFolder(logToken);
completed_filename += Path.DirectorySeparatorChar + "completed.dat";
using (var file = new FileStream(completed_filename, FileMode.Create))
{
file.Flush();
}
}
/// <summary>
/// Retrieve commit metadata for specified index checkpoint
/// </summary>
/// <param name="indexToken">Token</param>
/// <returns>Metadata, or null if invalid</returns>
public byte[] GetIndexCommitMetadata(Guid indexToken)
{
var dir = new DirectoryInfo(directoryConfiguration.GetIndexCheckpointFolder(indexToken));
if (!File.Exists(dir.FullName + Path.DirectorySeparatorChar + "completed.dat"))
return null;
string filename = directoryConfiguration.GetIndexCheckpointMetaFileName(indexToken);
using (var reader = new BinaryReader(new FileStream(filename, FileMode.Open)))
{
var len = reader.ReadInt32();
return reader.ReadBytes(len);
}
}
/// <summary>
/// Retrieve commit metadata for specified log checkpoint
/// </summary>
/// <param name="logToken">Token</param>
/// <returns>Metadata, or null if invalid</returns>
public byte[] GetLogCommitMetadata(Guid logToken)
{
var dir = new DirectoryInfo(directoryConfiguration.GetHybridLogCheckpointFolder(logToken));
if (!File.Exists(dir.FullName + Path.DirectorySeparatorChar + "completed.dat"))
return null;
string checkpointInfoFile = directoryConfiguration.GetHybridLogCheckpointMetaFileName(logToken);
using (var reader = new BinaryReader(new FileStream(checkpointInfoFile, FileMode.Open)))
{
var len = reader.ReadInt32();
return reader.ReadBytes(len);
}
}
/// <summary>
/// Provide device to store index checkpoint (including overflow buckets)
/// </summary>
/// <param name="indexToken"></param>
/// <returns></returns>
public IDevice GetIndexDevice(Guid indexToken)
{
return Devices.CreateLogDevice(directoryConfiguration.GetPrimaryHashTableFileName(indexToken), false);
}
/// <summary>
/// Provide device to store snapshot of log (required only for snapshot checkpoints)
/// </summary>
/// <param name="token"></param>
/// <returns></returns>
public IDevice GetSnapshotLogDevice(Guid token)
{
return Devices.CreateLogDevice(directoryConfiguration.GetLogSnapshotFileName(token), false);
}
/// <summary>
/// Provide device to store snapshot of object log (required only for snapshot checkpoints)
/// </summary>
/// <param name="token"></param>
/// <returns></returns>
public IDevice GetSnapshotObjectLogDevice(Guid token)
{
return Devices.CreateLogDevice(directoryConfiguration.GetObjectLogSnapshotFileName(token), false);
}
/// <summary>
/// Get latest valid checkpoint for recovery
/// </summary>
/// <param name="indexToken"></param>
/// <param name="logToken"></param>
/// <returns></returns>
public bool GetLatestCheckpoint(out Guid indexToken, out Guid logToken)
{
var indexCheckpointDir = new DirectoryInfo(directoryConfiguration.GetIndexCheckpointFolder());
var dirs = indexCheckpointDir.GetDirectories();
foreach (var dir in dirs)
{
// Remove incomplete checkpoints
if (!File.Exists(dir.FullName + Path.DirectorySeparatorChar + "completed.dat"))
{
Directory.Delete(dir.FullName, true);
}
}
// FirstOrDefault: the directory may be empty after incomplete checkpoints were removed above
var latestICFolder = indexCheckpointDir.GetDirectories().OrderByDescending(f => f.LastWriteTime).FirstOrDefault();
if (latestICFolder == null || !Guid.TryParse(latestICFolder.Name, out indexToken))
{
throw new Exception("No valid index checkpoint to recover from");
}
var hlogCheckpointDir = new DirectoryInfo(directoryConfiguration.GetHybridLogCheckpointFolder());
dirs = hlogCheckpointDir.GetDirectories();
foreach (var dir in dirs)
{
// Remove incomplete checkpoints
if (!File.Exists(dir.FullName + Path.DirectorySeparatorChar + "completed.dat"))
{
Directory.Delete(dir.FullName, true);
}
}
// FirstOrDefault: the directory may be empty after incomplete checkpoints were removed above
var latestHLCFolder = hlogCheckpointDir.GetDirectories().OrderByDescending(f => f.LastWriteTime).FirstOrDefault();
if (latestHLCFolder == null || !Guid.TryParse(latestHLCFolder.Name, out logToken))
{
throw new Exception("No valid hybrid log checkpoint to recover from");
}
return true;
}
}
}
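For orientation, a minimal usage sketch of the manager above (illustrative, not part of the diff; the directory path is an arbitrary example). It uses only methods defined in this file:

var manager = new LocalCheckpointManager("/var/data/faster-checkpoints");

var token = Guid.NewGuid();
manager.InitializeIndexCheckpoint(token);
manager.CommitIndexCheckpoint(token, new byte[] { 1, 2, 3 });

// Metadata is only visible once CommitIndexCheckpoint has written completed.dat;
// folders without that marker are deleted by GetLatestCheckpoint.
byte[] metadata = manager.GetIndexCommitMetadata(token);

The empty completed.dat file acts as a commit marker: metadata is written first, then the marker, so a crash mid-commit leaves a folder that recovery recognizes as incomplete and discards.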

@ -0,0 +1,500 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#pragma warning disable 0162
using System;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
using System.Linq;
using System.Collections.Generic;
namespace FASTER.core
{
internal enum ReadStatus { Pending, Done };
internal enum FlushStatus { Pending, Done };
internal class RecoveryStatus
{
public long startPage;
public long endPage;
public long untilAddress;
public int capacity;
public IDevice recoveryDevice;
public long recoveryDevicePageOffset;
public IDevice objectLogRecoveryDevice;
public ReadStatus[] readStatus;
public FlushStatus[] flushStatus;
public RecoveryStatus(int capacity,
long startPage,
long endPage, long untilAddress)
{
this.capacity = capacity;
this.startPage = startPage;
this.endPage = endPage;
this.untilAddress = untilAddress;
readStatus = new ReadStatus[capacity];
flushStatus = new FlushStatus[capacity];
for (int i = 0; i < capacity; i++)
{
flushStatus[i] = FlushStatus.Done;
readStatus[i] = ReadStatus.Pending;
}
}
}
/// <summary>
/// Partial class for recovery code in FASTER
/// </summary>
public unsafe partial class FasterKV<Key, Value, Input, Output, Context, Functions> : FasterBase, IFasterKV<Key, Value, Input, Output, Context>
where Key : new()
where Value : new()
where Functions : IFunctions<Key, Value, Input, Output, Context>
{
private void InternalRecoverFromLatestCheckpoints()
{
checkpointManager.GetLatestCheckpoint(out Guid indexCheckpointGuid, out Guid hybridLogCheckpointGuid);
InternalRecover(indexCheckpointGuid, hybridLogCheckpointGuid);
}
private bool IsCompatible(IndexRecoveryInfo indexInfo, HybridLogRecoveryInfo recoveryInfo)
{
var l1 = indexInfo.finalLogicalAddress;
var l2 = recoveryInfo.finalLogicalAddress;
return l1 <= l2;
}
private void InternalRecover(Guid indexToken, Guid hybridLogToken)
{
Debug.WriteLine("********* Primary Recovery Information ********");
Debug.WriteLine("Index Checkpoint: {0}", indexToken);
Debug.WriteLine("HybridLog Checkpoint: {0}", hybridLogToken);
// Recover the appropriate context information
var recoveredICInfo = new IndexCheckpointInfo();
recoveredICInfo.Recover(indexToken, checkpointManager);
recoveredICInfo.info.DebugPrint();
var recoveredHLCInfo = new HybridLogCheckpointInfo();
recoveredHLCInfo.Recover(hybridLogToken, checkpointManager);
recoveredHLCInfo.info.DebugPrint();
// Check if the two checkpoints are compatible for recovery
if (!IsCompatible(recoveredICInfo.info, recoveredHLCInfo.info))
{
throw new Exception("Cannot recover from (" + indexToken.ToString() + "," + hybridLogToken.ToString() + ") checkpoint pair!\n");
}
// Set new system state after recovery
var v = recoveredHLCInfo.info.version;
_systemState.phase = Phase.REST;
_systemState.version = (v + 1);
// Recover fuzzy index from checkpoint
RecoverFuzzyIndex(recoveredICInfo);
// Recover segment offsets for object log
if (recoveredHLCInfo.info.objectLogSegmentOffsets != null)
Array.Copy(recoveredHLCInfo.info.objectLogSegmentOffsets,
hlog.GetSegmentOffsets(),
recoveredHLCInfo.info.objectLogSegmentOffsets.Length);
// Make index consistent for version v
if (FoldOverSnapshot)
{
RecoverHybridLog(recoveredICInfo.info, recoveredHLCInfo.info);
}
else
{
RecoverHybridLogFromSnapshotFile(recoveredICInfo.info, recoveredHLCInfo.info);
}
// Read appropriate hybrid log pages into memory
hlog.RestoreHybridLog(recoveredHLCInfo.info.finalLogicalAddress, recoveredHLCInfo.info.headAddress, recoveredHLCInfo.info.beginAddress);
// Recover session information
_recoveredSessions = recoveredHLCInfo.info.continueTokens;
}
private void RecoverHybridLog(IndexRecoveryInfo indexRecoveryInfo,
HybridLogRecoveryInfo recoveryInfo)
{
var fromAddress = indexRecoveryInfo.startLogicalAddress;
var untilAddress = recoveryInfo.finalLogicalAddress;
var startPage = hlog.GetPage(fromAddress);
var endPage = hlog.GetPage(untilAddress);
if ((untilAddress > hlog.GetStartLogicalAddress(endPage)) && (untilAddress > fromAddress))
{
endPage++;
}
// By default first page has one extra record
var capacity = hlog.GetCapacityNumPages();
var recoveryStatus = new RecoveryStatus(capacity, startPage, endPage, untilAddress);
int totalPagesToRead = (int)(endPage - startPage);
int numPagesToReadFirst = Math.Min(capacity, totalPagesToRead);
// Issue read requests for as many pages as possible
hlog.AsyncReadPagesFromDevice(startPage, numPagesToReadFirst, untilAddress, hlog.AsyncReadPagesCallbackForRecovery, recoveryStatus);
for (long page = startPage; page < endPage; page++)
{
// Ensure page has been read into memory
int pageIndex = hlog.GetPageIndexForPage(page);
while (recoveryStatus.readStatus[pageIndex] == ReadStatus.Pending)
{
Thread.Sleep(10);
}
var startLogicalAddress = hlog.GetStartLogicalAddress(page);
var endLogicalAddress = hlog.GetStartLogicalAddress(page + 1);
var pageFromAddress = 0L;
if (fromAddress > startLogicalAddress && fromAddress < endLogicalAddress)
{
pageFromAddress = hlog.GetOffsetInPage(fromAddress);
}
var pageUntilAddress = hlog.GetPageSize();
if (endLogicalAddress > untilAddress)
{
pageUntilAddress = hlog.GetOffsetInPage(untilAddress);
}
var physicalAddress = hlog.GetPhysicalAddress(startLogicalAddress);
RecoverFromPage(fromAddress, pageFromAddress, pageUntilAddress,
startLogicalAddress, physicalAddress, recoveryInfo.version);
// OS thread flushes current page and issues a read request if necessary
recoveryStatus.readStatus[pageIndex] = ReadStatus.Pending;
recoveryStatus.flushStatus[pageIndex] = FlushStatus.Pending;
hlog.AsyncFlushPages(page, 1, AsyncFlushPageCallbackForRecovery, recoveryStatus);
}
// Wait until all pages have been flushed
var done = false;
while (!done)
{
done = true;
for (long page = startPage; page < endPage; page++)
{
int pageIndex = hlog.GetPageIndexForPage(page);
if (recoveryStatus.flushStatus[pageIndex] == FlushStatus.Pending)
{
done = false;
break;
}
}
}
}
private void RecoverHybridLogFromSnapshotFile(
IndexRecoveryInfo indexRecoveryInfo,
HybridLogRecoveryInfo recoveryInfo)
{
var fileStartAddress = recoveryInfo.flushedLogicalAddress;
var fromAddress = indexRecoveryInfo.startLogicalAddress;
var untilAddress = recoveryInfo.finalLogicalAddress;
// Compute startPage and endPage
var startPage = hlog.GetPage(fileStartAddress);
var endPage = hlog.GetPage(untilAddress);
if (untilAddress > hlog.GetStartLogicalAddress(endPage))
{
endPage++;
}
// By default first page has one extra record
var capacity = hlog.GetCapacityNumPages();
var recoveryDevice = checkpointManager.GetSnapshotLogDevice(recoveryInfo.guid);
var objectLogRecoveryDevice = checkpointManager.GetSnapshotObjectLogDevice(recoveryInfo.guid);
recoveryDevice.Initialize(hlog.GetSegmentSize());
objectLogRecoveryDevice.Initialize(hlog.GetSegmentSize());
var recoveryStatus = new RecoveryStatus(capacity, startPage, endPage, untilAddress)
{
recoveryDevice = recoveryDevice,
objectLogRecoveryDevice = objectLogRecoveryDevice,
recoveryDevicePageOffset = startPage
};
// Initially issue read request for all pages that can be held in memory
int totalPagesToRead = (int)(endPage - startPage);
int numPagesToReadFirst = Math.Min(capacity, totalPagesToRead);
hlog.AsyncReadPagesFromDevice(startPage, numPagesToReadFirst, untilAddress,
hlog.AsyncReadPagesCallbackForRecovery,
recoveryStatus,
recoveryStatus.recoveryDevicePageOffset,
recoveryStatus.recoveryDevice, recoveryStatus.objectLogRecoveryDevice);
for (long page = startPage; page < endPage; page++)
{
// Ensure the page is read from file
int pageIndex = hlog.GetPageIndexForPage(page);
while (recoveryStatus.readStatus[pageIndex] == ReadStatus.Pending)
{
Thread.Sleep(10);
}
// Page at hand
var startLogicalAddress = hlog.GetStartLogicalAddress(page);
var endLogicalAddress = hlog.GetStartLogicalAddress(page + 1);
// Perform recovery if page in fuzzy portion of the log
if ((fromAddress < endLogicalAddress) && (fromAddress < untilAddress))
{
/*
* Handling corner-cases:
* ----------------------
* When fromAddress is in the middle of the page,
* then start recovery only from corresponding offset
* in page. Similarly, if untilAddress falls in the
* middle of the page, perform recovery only until that
* offset. Otherwise, scan the entire page [0, PageSize)
*/
var pageFromAddress = 0L;
if (fromAddress > startLogicalAddress && fromAddress < endLogicalAddress)
{
pageFromAddress = hlog.GetOffsetInPage(fromAddress);
}
var pageUntilAddress = hlog.GetPageSize();
if (endLogicalAddress > untilAddress)
{
pageUntilAddress = hlog.GetOffsetInPage(untilAddress);
}
var physicalAddress = hlog.GetPhysicalAddress(startLogicalAddress);
RecoverFromPage(fromAddress, pageFromAddress, pageUntilAddress,
startLogicalAddress, physicalAddress, recoveryInfo.version);
}
// OS thread flushes current page and issues a read request if necessary
recoveryStatus.readStatus[pageIndex] = ReadStatus.Pending;
recoveryStatus.flushStatus[pageIndex] = FlushStatus.Pending;
// Write back records from snapshot to main hybrid log
hlog.AsyncFlushPages(page, 1, AsyncFlushPageCallbackForRecovery, recoveryStatus);
}
// Wait until all pages have been flushed
var done = false;
while (!done)
{
done = true;
for (long page = startPage; page < endPage; page++)
{
int pageIndex = hlog.GetPageIndexForPage(page);
if (recoveryStatus.flushStatus[pageIndex] == FlushStatus.Pending)
{
done = false;
break;
}
}
}
recoveryStatus.recoveryDevice.Close();
recoveryStatus.objectLogRecoveryDevice.Close();
}
private void RecoverFromPage(long startRecoveryAddress,
long fromLogicalAddressInPage,
long untilLogicalAddressInPage,
long pageLogicalAddress,
long pagePhysicalAddress,
int version)
{
var hash = default(long);
var tag = default(ushort);
var pointer = default(long);
var recordStart = default(long);
var bucket = default(HashBucket*);
var entry = default(HashBucketEntry);
var slot = default(int);
pointer = fromLogicalAddressInPage;
while (pointer < untilLogicalAddressInPage)
{
recordStart = pagePhysicalAddress + pointer;
ref RecordInfo info = ref hlog.GetInfo(recordStart);
if (info.IsNull())
{
pointer += RecordInfo.GetLength();
continue;
}
if (!info.Invalid)
{
hash = comparer.GetHashCode64(ref hlog.GetKey(recordStart));
tag = (ushort)((ulong)hash >> Constants.kHashTagShift);
entry = default(HashBucketEntry);
FindOrCreateTag(hash, tag, ref bucket, ref slot, ref entry, hlog.BeginAddress);
if (info.Version <= version)
{
entry.Address = pageLogicalAddress + pointer;
entry.Tag = tag;
entry.Pending = false;
entry.Tentative = false;
bucket->bucket_entries[slot] = entry.word;
}
else
{
info.Invalid = true;
if (info.PreviousAddress < startRecoveryAddress)
{
entry.Address = info.PreviousAddress;
entry.Tag = tag;
entry.Pending = false;
entry.Tentative = false;
bucket->bucket_entries[slot] = entry.word;
}
}
}
pointer += hlog.GetRecordSize(recordStart);
}
}
private void AsyncFlushPageCallbackForRecovery(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
// Set the page status to flushed
var result = (PageAsyncFlushResult<RecoveryStatus>)Overlapped.Unpack(overlap).AsyncResult;
if (Interlocked.Decrement(ref result.count) == 0)
{
int index = hlog.GetPageIndexForPage(result.page);
result.context.flushStatus[index] = FlushStatus.Done;
if (result.page + result.context.capacity < result.context.endPage)
{
long readPage = result.page + result.context.capacity;
if (FoldOverSnapshot)
{
hlog.AsyncReadPagesFromDevice(readPage, 1, result.context.untilAddress, hlog.AsyncReadPagesCallbackForRecovery, result.context);
}
else
{
hlog.AsyncReadPagesFromDevice(readPage, 1, result.context.untilAddress, hlog.AsyncReadPagesCallbackForRecovery,
result.context,
result.context.recoveryDevicePageOffset,
result.context.recoveryDevice, result.context.objectLogRecoveryDevice);
}
}
result.Free();
}
Overlapped.Free(overlap);
}
}
public unsafe abstract partial class AllocatorBase<Key, Value> : IDisposable
where Key : new()
where Value : new()
{
/// <summary>
/// Restore log
/// </summary>
/// <param name="untilAddress"></param>
/// <param name="headAddress"></param>
/// <param name="beginAddress"></param>
public void RestoreHybridLog(long untilAddress, long headAddress, long beginAddress)
{
Debug.Assert(beginAddress <= headAddress);
Debug.Assert(headAddress <= untilAddress);
// Special cases: we do not load any records into memory
if (
(beginAddress == untilAddress) || // Empty log
((headAddress == untilAddress) && (GetOffsetInPage(headAddress) == 0)) // Empty in-memory page
)
{
if (!IsAllocated(GetPageIndexForAddress(headAddress)))
AllocatePage(GetPageIndexForAddress(headAddress));
}
else
{
var tailPage = GetPage(untilAddress);
var headPage = GetPage(headAddress);
var recoveryStatus = new RecoveryStatus(GetCapacityNumPages(), headPage, tailPage, untilAddress);
for (int i = 0; i < recoveryStatus.capacity; i++)
{
recoveryStatus.readStatus[i] = ReadStatus.Done;
}
var numPages = 0;
for (var page = headPage; page <= tailPage; page++)
{
var pageIndex = GetPageIndexForPage(page);
recoveryStatus.readStatus[pageIndex] = ReadStatus.Pending;
numPages++;
}
AsyncReadPagesFromDevice(headPage, numPages, untilAddress, AsyncReadPagesCallbackForRecovery, recoveryStatus);
var done = false;
while (!done)
{
done = true;
for (long page = headPage; page <= tailPage; page++)
{
int pageIndex = GetPageIndexForPage(page);
if (recoveryStatus.readStatus[pageIndex] == ReadStatus.Pending)
{
done = false;
break;
}
}
}
}
RecoveryReset(untilAddress, headAddress, beginAddress);
}
internal void AsyncReadPagesCallbackForRecovery(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
// Set the page status to read-done
var result = (PageAsyncReadResult<RecoveryStatus>)Overlapped.Unpack(overlap).AsyncResult;
if (result.freeBuffer1 != null)
{
PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, result.page);
result.freeBuffer1.Return();
}
int index = GetPageIndexForPage(result.page);
result.context.readStatus[index] = ReadStatus.Done;
Interlocked.MemoryBarrier();
Overlapped.Free(overlap);
}
}
}
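For context, a hedged sketch of the checkpoint/recover round-trip that ends up in the private methods above. The member names (TakeFullCheckpoint, CompleteCheckpoint, Recover) are assumed from the public FasterKV surface of this vintage of FASTER, and store stands for an already-constructed FasterKV instance; treat this as illustrative, not the definitive API:

if (store.TakeFullCheckpoint(out Guid token))
    store.CompleteCheckpoint(wait: true);  // assumed blocking overload

// After a restart, on a fresh instance over the same log and checkpoint files:
store.Recover();  // routes into InternalRecoverFromLatestCheckpoints above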

@ -0,0 +1 @@
Copied from https://github.com/microsoft/FASTER

@ -0,0 +1,90 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#define CALLOC
using System;
using System.Threading;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
namespace FASTER.core
{
internal struct AsyncGetFromDiskResult<TContext> : IAsyncResult
{
public TContext context;
public bool IsCompleted => throw new NotImplementedException();
public WaitHandle AsyncWaitHandle => throw new NotImplementedException();
public object AsyncState => throw new NotImplementedException();
public bool CompletedSynchronously => throw new NotImplementedException();
}
internal unsafe struct HashIndexPageAsyncFlushResult : IAsyncResult
{
public int chunkIndex;
public bool IsCompleted => throw new NotImplementedException();
public WaitHandle AsyncWaitHandle => throw new NotImplementedException();
public object AsyncState => throw new NotImplementedException();
public bool CompletedSynchronously => throw new NotImplementedException();
}
internal unsafe struct HashIndexPageAsyncReadResult : IAsyncResult
{
public int chunkIndex;
public bool IsCompleted => throw new NotImplementedException();
public WaitHandle AsyncWaitHandle => throw new NotImplementedException();
public object AsyncState => throw new NotImplementedException();
public bool CompletedSynchronously => throw new NotImplementedException();
}
internal struct OverflowPagesFlushAsyncResult : IAsyncResult
{
public bool IsCompleted => throw new NotImplementedException();
public WaitHandle AsyncWaitHandle => throw new NotImplementedException();
public object AsyncState => throw new NotImplementedException();
public bool CompletedSynchronously => throw new NotImplementedException();
}
internal struct OverflowPagesReadAsyncResult : IAsyncResult
{
public bool IsCompleted => throw new NotImplementedException();
public WaitHandle AsyncWaitHandle => throw new NotImplementedException();
public object AsyncState => throw new NotImplementedException();
public bool CompletedSynchronously => throw new NotImplementedException();
}
internal struct CountdownEventAsyncResult : IAsyncResult
{
public CountdownEvent countdown;
public Action action;
public bool IsCompleted => throw new NotImplementedException();
public WaitHandle AsyncWaitHandle => throw new NotImplementedException();
public object AsyncState => throw new NotImplementedException();
public bool CompletedSynchronously => throw new NotImplementedException();
}
}

@ -0,0 +1,224 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using Microsoft.Win32.SafeHandles;
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace FASTER.core
{
/// <summary>
/// Sector aligned memory allocator
/// </summary>
public unsafe class SectorAlignedMemory
{
/// <summary>
/// Actual buffer
/// </summary>
public byte[] buffer;
/// <summary>
/// Handle
/// </summary>
internal GCHandle handle;
/// <summary>
/// Offset
/// </summary>
public int offset;
/// <summary>
/// Aligned pointer
/// </summary>
public byte* aligned_pointer;
/// <summary>
/// Valid offset
/// </summary>
public int valid_offset;
/// <summary>
/// Required bytes
/// </summary>
public int required_bytes;
/// <summary>
/// Available bytes
/// </summary>
public int available_bytes;
internal int level;
internal SectorAlignedBufferPool pool;
/// <summary>
/// Return
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Return()
{
pool.Return(this);
}
/// <summary>
/// Get valid pointer
/// </summary>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public byte* GetValidPointer()
{
return aligned_pointer + valid_offset;
}
/// <summary>
/// ToString
/// </summary>
/// <returns></returns>
public override string ToString()
{
return string.Format("{0} {1} {2} {3} {4}", (long)aligned_pointer, offset, valid_offset, required_bytes, available_bytes);
}
}
/// <summary>
/// SectorAlignedBufferPool is a pool of memory.
/// Internally, it is organized as an array of concurrent queues where each
/// queue holds memory segments of one size class: queue[i] contains memory
/// segments each of size (2^i * sectorSize).
/// </summary>
public class SectorAlignedBufferPool
{
/// <summary>
/// Disable buffer pool
/// </summary>
public static bool Disabled = false;
private const int levels = 32;
private readonly int recordSize;
private readonly int sectorSize;
private readonly ConcurrentQueue<SectorAlignedMemory>[] queue;
/// <summary>
/// Constructor
/// </summary>
/// <param name="recordSize">Record size</param>
/// <param name="sectorSize">Sector size</param>
public SectorAlignedBufferPool(int recordSize, int sectorSize)
{
queue = new ConcurrentQueue<SectorAlignedMemory>[levels];
this.recordSize = recordSize;
this.sectorSize = sectorSize;
}
/// <summary>
/// Return
/// </summary>
/// <param name="page"></param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Return(SectorAlignedMemory page)
{
Debug.Assert(queue[page.level] != null);
page.available_bytes = 0;
page.required_bytes = 0;
page.valid_offset = 0;
Array.Clear(page.buffer, 0, page.buffer.Length);
if (!Disabled)
queue[page.level].Enqueue(page);
else
{
page.handle.Free();
page.buffer = null;
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int Position(int v)
{
if (v == 1) return 0;
v--;
int r = 0; // r will be lg(v)
while (true) // unroll for more speed...
{
v = v >> 1;
if (v == 0) break;
r++;
}
return r + 1;
}
/// <summary>
/// Get buffer
/// </summary>
/// <param name="numRecords"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public unsafe SectorAlignedMemory Get(int numRecords)
{
int requiredSize = sectorSize + (((numRecords) * recordSize + (sectorSize - 1)) & ~(sectorSize - 1));
int index = Position(requiredSize / sectorSize);
if (queue[index] == null)
{
var localPool = new ConcurrentQueue<SectorAlignedMemory>();
Interlocked.CompareExchange(ref queue[index], localPool, null);
}
if (!Disabled && queue[index].TryDequeue(out SectorAlignedMemory page))
{
return page;
}
page = new SectorAlignedMemory
{
level = index,
buffer = new byte[sectorSize * (1 << index)]
};
page.handle = GCHandle.Alloc(page.buffer, GCHandleType.Pinned);
page.aligned_pointer = (byte*)(((long)page.handle.AddrOfPinnedObject() + (sectorSize - 1)) & ~(sectorSize - 1));
page.offset = (int) ((long)page.aligned_pointer - (long)page.handle.AddrOfPinnedObject());
page.pool = this;
return page;
}
/// <summary>
/// Free buffer
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Free()
{
for (int i = 0; i < levels; i++)
{
if (queue[i] == null) continue;
while (queue[i].TryDequeue(out SectorAlignedMemory result))
{
result.handle.Free();
result.buffer = null;
}
}
}
/// <summary>
/// Print pool contents
/// </summary>
public void Print()
{
for (int i = 0; i < levels; i++)
{
if (queue[i] == null) continue;
foreach (var item in queue[i])
{
Console.WriteLine(" " + item.ToString());
}
}
}
}
}
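A brief usage sketch for the pool (illustrative, not from the diff; the sizes are arbitrary). It shows the size-class rounding described in the class summary:

var pool = new SectorAlignedBufferPool(recordSize: 1, sectorSize: 512);

// 600 bytes rounds up to the next sector multiple (1024) plus one alignment
// sector, so this request lands in size class 2 (a 2048-byte buffer).
SectorAlignedMemory buffer = pool.Get(600);
// ... use buffer.GetValidPointer() for sector-aligned, unbuffered I/O ...
buffer.Return();   // back onto queue[buffer.level] for reuse

pool.Free();       // drain all size classes and unpin the GC handles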

@ -0,0 +1,28 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System.Collections.Generic;
namespace FASTER.core
{
/// <summary>
/// Low-performance FASTER equality comparer wrapper around EqualityComparer.Default
/// </summary>
/// <typeparam name="T"></typeparam>
internal sealed class FasterEqualityComparer<T> : IFasterEqualityComparer<T>
{
public static readonly FasterEqualityComparer<T> Default = new FasterEqualityComparer<T>();
private static readonly EqualityComparer<T> DefaultEC = EqualityComparer<T>.Default;
public bool Equals(ref T k1, ref T k2)
{
return DefaultEC.Equals(k1, k2);
}
public long GetHashCode64(ref T k)
{
return Utility.GetHashCode(DefaultEC.GetHashCode(k));
}
}
}

@ -0,0 +1,332 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
namespace FASTER.core
{
using System;
using System.Runtime.InteropServices;
using System.Security;
using Microsoft.Win32.SafeHandles;
using System.Threading;
using System.IO;
/// <summary>
/// Interop with WINAPI for file I/O, threading, and NUMA functions.
/// </summary>
public static unsafe class Native32
{
#region Native structs
[StructLayout(LayoutKind.Sequential)]
private struct LUID
{
public uint lp;
public int hp;
}
[StructLayout(LayoutKind.Sequential)]
private struct LUID_AND_ATTRIBUTES
{
public LUID Luid;
public uint Attributes;
}
[StructLayout(LayoutKind.Sequential)]
private struct TOKEN_PRIVILEGES
{
public uint PrivilegeCount;
public LUID_AND_ATTRIBUTES Privileges;
}
[StructLayout(LayoutKind.Sequential)]
private struct MARK_HANDLE_INFO
{
public uint UsnSourceInfo;
public IntPtr VolumeHandle;
public uint HandleInfo;
}
#endregion
#region io constants and flags
internal const int ERROR_IO_PENDING = 997;
internal const uint GENERIC_READ = 0x80000000;
internal const uint GENERIC_WRITE = 0x40000000;
internal const uint FILE_FLAG_DELETE_ON_CLOSE = 0x04000000;
internal const uint FILE_FLAG_NO_BUFFERING = 0x20000000;
internal const uint FILE_FLAG_OVERLAPPED = 0x40000000;
internal const uint FILE_SHARE_DELETE = 0x00000004;
#endregion
#region io functions
[DllImport("Kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)]
internal static extern SafeFileHandle CreateFileW(
[In] string lpFileName,
[In] UInt32 dwDesiredAccess,
[In] UInt32 dwShareMode,
[In] IntPtr lpSecurityAttributes,
[In] UInt32 dwCreationDisposition,
[In] UInt32 dwFlagsAndAttributes,
[In] IntPtr hTemplateFile);
[DllImport("Kernel32.dll", SetLastError = true)]
internal static extern bool ReadFile(
[In] SafeFileHandle hFile,
[Out] IntPtr lpBuffer,
[In] UInt32 nNumberOfBytesToRead,
[Out] out UInt32 lpNumberOfBytesRead,
[In] NativeOverlapped* lpOverlapped);
[DllImport("Kernel32.dll", SetLastError = true)]
internal static extern bool WriteFile(
[In] SafeFileHandle hFile,
[In] IntPtr lpBuffer,
[In] UInt32 nNumberOfBytesToWrite,
[Out] out UInt32 lpNumberOfBytesWritten,
[In] NativeOverlapped* lpOverlapped);
internal enum EMoveMethod : uint
{
Begin = 0,
Current = 1,
End = 2
}
[DllImport("kernel32.dll", SetLastError = true)]
internal static extern uint SetFilePointer(
[In] SafeFileHandle hFile,
[In] int lDistanceToMove,
[In, Out] ref int lpDistanceToMoveHigh,
[In] EMoveMethod dwMoveMethod);
[DllImport("kernel32.dll", SetLastError = true)]
internal static extern bool SetEndOfFile(
[In] SafeFileHandle hFile);
[DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Auto)]
internal static extern bool GetDiskFreeSpace(string lpRootPathName,
out uint lpSectorsPerCluster,
out uint lpBytesPerSector,
out uint lpNumberOfFreeClusters,
out uint lpTotalNumberOfClusters);
[DllImport("kernel32.dll", SetLastError = true)]
internal static extern bool DeleteFileW([MarshalAs(UnmanagedType.LPWStr)]string lpFileName);
#endregion
#region Thread and NUMA functions
[DllImport("kernel32.dll")]
private static extern IntPtr GetCurrentThread();
[DllImport("kernel32")]
internal static extern uint GetCurrentThreadId();
[DllImport("kernel32.dll", SetLastError = true)]
private static extern uint GetCurrentProcessorNumber();
[DllImport("kernel32.dll", SetLastError = true)]
private static extern uint GetActiveProcessorCount(uint count);
[DllImport("kernel32.dll", SetLastError = true)]
private static extern ushort GetActiveProcessorGroupCount();
[DllImport("kernel32.dll", SetLastError = true)]
private static extern int SetThreadGroupAffinity(IntPtr hThread, ref GROUP_AFFINITY GroupAffinity, ref GROUP_AFFINITY PreviousGroupAffinity);
[DllImport("kernel32.dll", SetLastError = true)]
private static extern int GetThreadGroupAffinity(IntPtr hThread, ref GROUP_AFFINITY PreviousGroupAffinity);
private static readonly uint ALL_PROCESSOR_GROUPS = 0xffff;
[System.Runtime.InteropServices.StructLayoutAttribute(System.Runtime.InteropServices.LayoutKind.Sequential)]
private struct GROUP_AFFINITY
{
public ulong Mask;
public uint Group;
public uint Reserved1;
public uint Reserved2;
public uint Reserved3;
}
/// <summary>
/// Accepts thread id = 0, 1, 2, ... and sprays them round-robin
/// across all cores (viewed as a flat space). On NUMA machines,
/// this gives us [socket, core] ordering of affinitization. That is,
/// if there are N cores per socket, then thread indices of 0 to N-1 map
/// to the range [socket 0, core 0] to [socket 0, core N-1].
/// </summary>
/// <param name="threadIdx">Index of thread (from 0 onwards)</param>
public static void AffinitizeThreadRoundRobin(uint threadIdx)
{
uint nrOfProcessors = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
ushort nrOfProcessorGroups = GetActiveProcessorGroupCount();
uint nrOfProcsPerGroup = nrOfProcessors / nrOfProcessorGroups;
GROUP_AFFINITY groupAffinityThread = default(GROUP_AFFINITY);
GROUP_AFFINITY oldAffinityThread = default(GROUP_AFFINITY);
IntPtr thread = GetCurrentThread();
GetThreadGroupAffinity(thread, ref groupAffinityThread);
threadIdx = threadIdx % nrOfProcessors;
groupAffinityThread.Mask = (ulong)1L << ((int)(threadIdx % (int)nrOfProcsPerGroup));
groupAffinityThread.Group = (uint)(threadIdx / nrOfProcsPerGroup);
if (SetThreadGroupAffinity(thread, ref groupAffinityThread, ref oldAffinityThread) == 0)
{
throw new Exception("Unable to affinitize thread");
}
}
/// <summary>
/// Accepts thread id = 0, 1, 2, ... and sprays them round-robin
/// across all cores (viewed as a flat space). On NUMA machines,
/// this gives us [core, socket] ordering of affinitization. That is,
/// if there are N cores per socket, then thread indices of 0 to N-1 map
/// to the range [socket 0, core 0] to [socket N-1, core 0].
/// </summary>
/// <param name="threadIdx">Index of thread (from 0 onwards)</param>
/// <param name="nrOfProcessorGroups">Number of NUMA sockets</param>
public static void AffinitizeThreadShardedNuma(uint threadIdx, ushort nrOfProcessorGroups)
{
uint nrOfProcessors = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
uint nrOfProcsPerGroup = nrOfProcessors / nrOfProcessorGroups;
threadIdx = nrOfProcsPerGroup * (threadIdx % nrOfProcessorGroups) + (threadIdx / nrOfProcessorGroups);
AffinitizeThreadRoundRobin(threadIdx);
return;
}
#endregion
#region Advanced file ops
[DllImport("advapi32.dll", SetLastError = true)]
private static extern bool LookupPrivilegeValue(string lpSystemName, string lpName, ref LUID lpLuid);
[DllImport("kernel32.dll", SetLastError = true)]
private static extern IntPtr GetCurrentProcess();
[DllImport("advapi32", SetLastError = true)]
private static extern bool OpenProcessToken(IntPtr ProcessHandle, uint DesiredAccess, out IntPtr TokenHandle);
[DllImport("advapi32.dll", SetLastError = true)]
private static extern bool AdjustTokenPrivileges(IntPtr tokenhandle, int disableprivs, ref TOKEN_PRIVILEGES Newstate, int BufferLengthInBytes, int PreviousState, int ReturnLengthInBytes);
[DllImport("kernel32.dll", SetLastError = true)]
private static extern bool CloseHandle(IntPtr hObject);
[DllImport("Kernel32.dll", SetLastError = true)]
private static extern bool DeviceIoControl(SafeFileHandle hDevice, uint IoControlCode, void* InBuffer, int nInBufferSize, IntPtr OutBuffer, int nOutBufferSize, ref uint pBytesReturned, IntPtr Overlapped);
[DllImport("kernel32.dll", SetLastError = true)]
private static extern bool SetFilePointerEx(SafeFileHandle hFile, long liDistanceToMove, out long lpNewFilePointer, uint dwMoveMethod);
[DllImport("kernel32.dll", SetLastError = true)]
private static extern bool SetFileValidData(SafeFileHandle hFile, long ValidDataLength);
[DllImport("kernel32.dll", SetLastError = true)]
private static extern SafeFileHandle CreateFile(string filename, uint access, uint share, IntPtr securityAttributes, uint creationDisposition, uint flagsAndAttributes, IntPtr templateFile);
/// <summary>
/// Enable privilege for process
/// </summary>
/// <returns></returns>
public static bool EnableProcessPrivileges()
{
#if DOTNETCORE
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
return false;
#endif
TOKEN_PRIVILEGES token_privileges = default(TOKEN_PRIVILEGES);
token_privileges.PrivilegeCount = 1;
token_privileges.Privileges.Attributes = 0x2;
if (!LookupPrivilegeValue(null, "SeManageVolumePrivilege",
ref token_privileges.Privileges.Luid)) return false;
if (!OpenProcessToken(GetCurrentProcess(), 0x20, out IntPtr token)) return false;
if (!AdjustTokenPrivileges(token, 0, ref token_privileges, 0, 0, 0)) return false;
if (Marshal.GetLastWin32Error() != 0) return false;
CloseHandle(token);
return true;
}
private static uint CTL_CODE(uint DeviceType, uint Function, uint Method, uint Access)
{
return (((DeviceType) << 16) | ((Access) << 14) | ((Function) << 2) | (Method));
}
internal static bool EnableVolumePrivileges(string filename, SafeFileHandle handle)
{
#if DOTNETCORE
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
return false;
#endif
string volume_string = "\\\\.\\" + filename.Substring(0, 2);
uint fileCreation = unchecked((uint)FileMode.Open);
SafeFileHandle volume_handle = CreateFile(volume_string, 0, 0, IntPtr.Zero, fileCreation,
0x80, IntPtr.Zero);
// CreateFile returns an invalid SafeFileHandle (not null) on failure
if (volume_handle.IsInvalid)
{
return false;
}
MARK_HANDLE_INFO mhi;
mhi.UsnSourceInfo = 0x1;
mhi.VolumeHandle = volume_handle.DangerousGetHandle();
mhi.HandleInfo = 0x1;
uint bytes_returned = 0;
bool result = DeviceIoControl(handle, CTL_CODE(0x9, 63, 0, 0),
(void*)&mhi, sizeof(MARK_HANDLE_INFO), IntPtr.Zero,
0, ref bytes_returned, IntPtr.Zero);
if (!result)
{
return false;
}
volume_handle.Close();
return true;
}
/// <summary>
/// Set file size
/// </summary>
/// <param name="file_handle"></param>
/// <param name="file_size"></param>
/// <returns></returns>
public static bool SetFileSize(SafeFileHandle file_handle, long file_size)
{
#if DOTNETCORE
if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
return false;
#endif
if (!SetFilePointerEx(file_handle, file_size, out long newFilePtr, 0))
{
return false;
}
// Set a fixed file length
if (!SetEndOfFile(file_handle))
{
return false;
}
if (!SetFileValidData(file_handle, file_size))
{
return false;
}
return true;
}
internal static int MakeHRFromErrorCode(int errorCode)
{
return unchecked(((int)0x80070000) | errorCode);
}
#endregion
}
}
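The affinitization helpers above are plain static calls; a hypothetical sketch of pinning a few worker threads round-robin across cores (Windows-only, since it relies on the P/Invoke wrappers in this file):

for (uint i = 0; i < 4; i++)
{
    uint idx = i; // capture a per-iteration copy for the closure
    new Thread(() =>
    {
        Native32.AffinitizeThreadRoundRobin(idx); // thread idx -> core idx, flat ordering
        // ... worker loop ...
    }).Start();
}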

@ -0,0 +1,140 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
#define CALLOC
using System;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Result of async page read
/// </summary>
/// <typeparam name="TContext"></typeparam>
public class PageAsyncReadResult<TContext> : IAsyncResult
{
internal long page;
internal TContext context;
internal CountdownEvent handle;
internal SectorAlignedMemory freeBuffer1;
internal SectorAlignedMemory freeBuffer2;
internal IOCompletionCallback callback;
internal IDevice objlogDevice;
internal object frame;
internal CancellationTokenSource cts;
/* Used for iteration */
internal long resumePtr;
internal long untilPtr;
internal long maxPtr;
/// <summary>
///
/// </summary>
public bool IsCompleted => throw new NotImplementedException();
/// <summary>
///
/// </summary>
public WaitHandle AsyncWaitHandle => throw new NotImplementedException();
/// <summary>
///
/// </summary>
public object AsyncState => throw new NotImplementedException();
/// <summary>
///
/// </summary>
public bool CompletedSynchronously => throw new NotImplementedException();
/// <summary>
/// Free
/// </summary>
public void Free()
{
if (freeBuffer1 != null)
{
freeBuffer1.Return();
freeBuffer1 = null;
}
if (freeBuffer2 != null)
{
freeBuffer2.Return();
freeBuffer2 = null;
}
}
}
/// <summary>
/// Page async flush result
/// </summary>
/// <typeparam name="TContext"></typeparam>
public class PageAsyncFlushResult<TContext> : IAsyncResult
{
/// <summary>
/// Page
/// </summary>
public long page;
/// <summary>
/// Context
/// </summary>
public TContext context;
/// <summary>
/// Count
/// </summary>
public int count;
internal bool partial;
internal long fromAddress;
internal long untilAddress;
internal CountdownEvent handle;
internal IDevice objlogDevice;
internal SectorAlignedMemory freeBuffer1;
internal SectorAlignedMemory freeBuffer2;
internal AutoResetEvent done;
/// <summary>
///
/// </summary>
public bool IsCompleted => throw new NotImplementedException();
/// <summary>
///
/// </summary>
public WaitHandle AsyncWaitHandle => throw new NotImplementedException();
/// <summary>
///
/// </summary>
public object AsyncState => throw new NotImplementedException();
/// <summary>
///
/// </summary>
public bool CompletedSynchronously => throw new NotImplementedException();
/// <summary>
/// Free
/// </summary>
public void Free()
{
if (freeBuffer1 != null)
{
freeBuffer1.Return();
freeBuffer1 = null;
}
if (freeBuffer2 != null)
{
freeBuffer2.Return();
freeBuffer2 = null;
}
if (handle != null)
{
handle.Signal();
}
}
}
}

@ -0,0 +1,232 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Runtime.CompilerServices;
namespace FASTER.core
{
/// <summary>
/// A dictionary that supports concurrency with a similar interface to .NET's ConcurrentDictionary.
/// However, this dictionary changes the implementation of AddOrUpdate and GetOrAdd functions to
/// guarantee atomicity per-key for factory lambdas.
/// </summary>
/// <typeparam name="TKey">Type of keys in the dictionary</typeparam>
/// <typeparam name="TValue">Type of values in the dictionary</typeparam>
internal sealed class SafeConcurrentDictionary<TKey, TValue> : IEnumerable<KeyValuePair<TKey, TValue>>
{
private readonly ConcurrentDictionary<TKey, TValue> dictionary = new ConcurrentDictionary<TKey, TValue>();
private readonly ConcurrentDictionary<TKey, object> keyLocks = new ConcurrentDictionary<TKey, object>();
/// <summary>
/// Returns the count of the dictionary.
/// </summary>
public int Count
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get
{
return dictionary.Count;
}
}
/// <summary>
/// Returns whether or not the dictionary is empty.
/// </summary>
public bool IsEmpty
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get
{
return dictionary.IsEmpty;
}
}
/// <summary>
/// Gets or sets the value associated with a key.
/// </summary>
public TValue this[TKey key]
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get
{
return dictionary[key];
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
set
{
dictionary[key] = value;
}
}
/// <summary>
/// Returns a collection of the keys in the dictionary.
/// </summary>
public ICollection<TKey> Keys
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get
{
return dictionary.Keys;
}
}
/// <summary>
/// Returns a collection of the values in the dictionary.
/// </summary>
public ICollection<TValue> Values
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get
{
return dictionary.Values;
}
}
/// <summary>
/// Adds or updates a key/value pair to the dictionary.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public TValue AddOrUpdate(TKey key, Func<TKey, TValue> addValueFactory, Func<TKey, TValue, TValue> updateValueFactory)
{
lock (GetLock(key))
{
return dictionary.AddOrUpdate(key, addValueFactory, updateValueFactory);
}
}
/// <summary>
/// Adds or updates a key/value pair to the dictionary.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public TValue AddOrUpdate(TKey key, TValue addValue, Func<TKey, TValue, TValue> updateValueFactory)
{
lock (GetLock(key))
{
return dictionary.AddOrUpdate(key, addValue, updateValueFactory);
}
}
/// <summary>
/// Adds a key/value pair to the dictionary if it does not exist.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public TValue GetOrAdd(TKey key, Func<TKey, TValue> valueFactory)
{
if (dictionary.TryGetValue(key, out TValue value))
{
return value;
}
lock (GetLock(key))
{
return dictionary.GetOrAdd(key, valueFactory);
}
}
/// <summary>
/// Adds a key/value pair to the dictionary if it does not exist.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public TValue GetOrAdd(TKey key, TValue value)
{
return dictionary.GetOrAdd(key, value);
}
/// <summary>
/// Clears the dictionary.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void Clear()
{
dictionary.Clear();
keyLocks.Clear();
}
/// <summary>
/// Returns whether or not the dictionary contains the specified key.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool ContainsKey(TKey key)
{
return dictionary.ContainsKey(key);
}
/// <summary>
/// Returns an enumerator of the elements in the dictionary.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public IEnumerator<KeyValuePair<TKey, TValue>> GetEnumerator()
{
return dictionary.GetEnumerator();
}
IEnumerator IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
/// <summary>
/// Copies the key/value pairs to a new array.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public KeyValuePair<TKey, TValue>[] ToArray()
{
return dictionary.ToArray();
}
/// <summary>
/// Attempts to add the specified key/value to the dictionary if it does not exist.
/// Returns true if the value was added, false otherwise.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryAdd(TKey key, TValue value)
{
return dictionary.TryAdd(key, value);
}
/// <summary>
/// Attempts to get the value for the specified key.
/// Returns true if the key was in the dictionary or false otherwise.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryGetValue(TKey key, out TValue value)
{
return dictionary.TryGetValue(key, out value);
}
/// <summary>
/// Attempts to remove the value for the specified key.
/// Returns true if the key was in the dictionary or false otherwise.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryRemove(TKey key, out TValue value)
{
return dictionary.TryRemove(key, out value);
}
/// <summary>
/// Compares the existing value for the specified key with a specified value,
/// and updates it if and only if it is a match. Returns true if updated,
/// false otherwise.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public bool TryUpdate(TKey key, TValue newValue, TValue comparisonValue)
{
return dictionary.TryUpdate(key, newValue, comparisonValue);
}
/// <summary>
/// Retrieves lock associated with a key (creating it if it does not exist).
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private object GetLock(TKey key)
{
return keyLocks.GetOrAdd(key, v => new object());
}
}
}
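A short sketch of the guarantee described in the class summary (names are illustrative; LoadSegment is a hypothetical expensive loader):

var cache = new SafeConcurrentDictionary<string, byte[]>();

// Unlike ConcurrentDictionary.GetOrAdd, the factory below is never run
// concurrently for the same key, so the expensive load happens once.
byte[] page = cache.GetOrAdd("segment-0", key => LoadSegment(key));

The fast path still hits the lock-free TryGetValue first; the per-key lock is only taken on a miss.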

@ -0,0 +1,73 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace FASTER.core
{
internal enum ResizeOperationStatus : int { IN_PROGRESS, DONE };
[StructLayout(LayoutKind.Explicit, Size = 8)]
internal unsafe struct ResizeInfo
{
[FieldOffset(0)]
public ResizeOperationStatus status;
[FieldOffset(4)]
public int version;
[FieldOffset(0)]
public long word;
}
internal enum Phase : int {
PREP_INDEX_CHECKPOINT, INDEX_CHECKPOINT,
PREPARE, IN_PROGRESS,
WAIT_PENDING, WAIT_FLUSH,
REST,
PERSISTENCE_CALLBACK,
GC,
PREPARE_GROW, IN_PROGRESS_GROW,
INTERMEDIATE,
};
[StructLayout(LayoutKind.Explicit, Size = 8)]
internal unsafe struct SystemState
{
[FieldOffset(0)]
public Phase phase;
[FieldOffset(4)]
public int version;
[FieldOffset(0)]
public long word;
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static SystemState Copy(ref SystemState other)
{
var info = default(SystemState);
info.word = other.word;
return info;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static SystemState Make(Phase status, int version)
{
var info = default(SystemState);
info.phase = status;
info.version = version;
return info;
}
}
}
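A minimal sketch (not part of the diff) of why phase and version share their storage with word: the [FieldOffset] overlay lets the pair be read, compared, and swapped as a single atomic long (assuming System.Threading is imported):

SystemState current = SystemState.Make(Phase.REST, 2);
SystemState next = SystemState.Make(Phase.PREPARE, 2);

// Both fields move together because they alias the same 8 bytes.
Interlocked.CompareExchange(ref current.word, next.word, current.word);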

@ -0,0 +1,45 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
// *********************************************************************
// Copyright (C) Microsoft. All rights reserved.
//
// @File:
//
// @Owner:
// @Test:
//
// Purpose:
//
// Notes:
//
// @EndHeader@
// *********************************************************************
namespace FASTER.core
{
/// <summary>
/// Status result of operation on FASTER
/// </summary>
public enum Status
{
/// <summary>
/// For Read and RMW, item being read was found, and
/// the operation completed successfully
/// For Upsert, item was upserted successfully
/// </summary>
OK,
/// <summary>
/// For Read and RMW, item being read was not found
/// </summary>
NOTFOUND,
/// <summary>
/// Operation went pending (async)
/// </summary>
PENDING,
/// <summary>
/// Operation resulted in some error
/// </summary>
ERROR
}
}

@ -0,0 +1,296 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Runtime.InteropServices;
using System.Security;
using System.IO;
using System.Runtime.CompilerServices;
using Microsoft.Win32.SafeHandles;
using System.Diagnostics;
using System.Threading;
namespace FASTER.core
{
/// <summary>
/// Empty type
/// </summary>
public struct Empty
{
/// <summary>
/// Default
/// </summary>
public static readonly Empty Default = default(Empty);
}
/// <summary>
/// FASTER utility functions
/// </summary>
public static class Utility
{
/// <summary>
/// Get size of type
/// </summary>
/// <typeparam name="T"></typeparam>
/// <param name="value"></param>
/// <returns></returns>
internal static unsafe int GetSize<T>(this T value)
{
T[] arr = new T[2];
return (int)((long)Unsafe.AsPointer(ref arr[1]) - (long)Unsafe.AsPointer(ref arr[0]));
}
/// <summary>
/// Is type blittable
/// </summary>
/// <typeparam name="T"></typeparam>
/// <returns></returns>
internal static bool IsBlittable<T>()
{
if (default(T) == null)
return false;
try
{
var tmp = new T[1];
var h = GCHandle.Alloc(tmp, GCHandleType.Pinned);
h.Free();
}
catch (Exception)
{
return false;
}
return true;
}
/// <summary>
/// Check if two byte arrays of given length are equal
/// </summary>
/// <param name="src"></param>
/// <param name="dst"></param>
/// <param name="length"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public unsafe static bool IsEqual(byte* src, byte* dst, int length)
{
for (int i = 0; i < length; i++)
{
if (*(src + i) != *(dst + i))
{
return false;
}
}
return true;
}
/// <summary>
/// Copy numBytes bytes from src to dest
/// </summary>
/// <param name="src"></param>
/// <param name="dest"></param>
/// <param name="numBytes"></param>
public unsafe static void Copy(byte* src, byte* dest, int numBytes)
{
for(int i = 0; i < numBytes; i++)
{
*(dest + i) = *(src + i);
}
}
/// <summary>
/// Get 64-bit hash code for a long value
/// </summary>
/// <param name="input"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static long GetHashCode(long input)
{
long local_rand = input;
long local_rand_hash = 8;
local_rand_hash = 40343 * local_rand_hash + ((local_rand) & 0xFFFF);
local_rand_hash = 40343 * local_rand_hash + ((local_rand >> 16) & 0xFFFF);
local_rand_hash = 40343 * local_rand_hash + ((local_rand >> 32) & 0xFFFF);
local_rand_hash = 40343 * local_rand_hash + (local_rand >> 48);
local_rand_hash = 40343 * local_rand_hash;
return (long)Rotr64((ulong)local_rand_hash, 45);
}
/// <summary>
/// Get 64-bit hash code for a byte array
/// </summary>
/// <param name="pbString"></param>
/// <param name="len"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe long HashBytes(byte* pbString, int len)
{
const long magicno = 40343;
char* pwString = (char*)pbString;
int cbBuf = len / 2;
ulong hashState = (ulong)len;
for (int i = 0; i < cbBuf; i++, pwString++)
hashState = magicno * hashState + *pwString;
if ((len & 1) > 0)
{
byte* pC = (byte*)pwString;
hashState = magicno * hashState + *pC;
}
return (long)Rotr64(magicno * hashState, 4);
}
/// <summary>
/// Compute XOR of all provided bytes
/// </summary>
/// <param name="src"></param>
/// <param name="length"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static unsafe ulong XorBytes(byte* src, int length)
{
ulong result = 0;
byte* curr = src;
byte* end = src + length;
while (curr + 4 * sizeof(ulong) <= end)
{
result ^= *(ulong*)curr;
result ^= *(1 + (ulong*)curr);
result ^= *(2 + (ulong*)curr);
result ^= *(3 + (ulong*)curr);
curr += 4 * sizeof(ulong);
}
while (curr + sizeof(ulong) <= end)
{
result ^= *(ulong*)curr;
curr += sizeof(ulong);
}
while (curr + 1 <= end)
{
result ^= *curr;
curr++;
}
return result;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong Rotr64(ulong x, int n)
{
return (((x) >> n) | ((x) << (64 - n)));
}
/// <summary>
/// Is power of 2
/// </summary>
/// <param name="x"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsPowerOfTwo(long x)
{
return (x > 0) && ((x & (x - 1)) == 0);
}
internal static readonly int[] MultiplyDeBruijnBitPosition2 = new int[32]
{
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
/// <summary>
/// Get log base 2
/// </summary>
/// <param name="x"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int GetLogBase2(int x)
{
return MultiplyDeBruijnBitPosition2[(uint)(x * 0x077CB531U) >> 27];
}
/// <summary>
/// Get log base 2
/// </summary>
/// <param name="value"></param>
/// <returns></returns>
public static int GetLogBase2(ulong value)
{
int i;
for (i = -1; value != 0; i++)
value >>= 1;
return (i == -1) ? 0 : i;
}
/// <summary>
/// Check if the value fits in 32 bits
/// </summary>
/// <param name="x"></param>
/// <returns></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool Is32Bit(long x)
{
return ((ulong)x < 4294967295ul);
}
/// <summary>
/// A 32-bit murmur3 implementation.
/// </summary>
/// <param name="h"></param>
/// <returns></returns>
internal static int Murmur3(int h)
{
uint a = (uint)h;
a ^= a >> 16;
a *= 0x85ebca6b;
a ^= a >> 13;
a *= 0xc2b2ae35;
a ^= a >> 16;
return (int)a;
}
/// <summary>
/// Updates the variable to newValue only if the current value is smaller than the new value.
/// </summary>
/// <param name="variable">The variable to possibly replace</param>
/// <param name="newValue">The value that replaces the variable if successful</param>
/// <param name="oldValue">The original value in the variable</param>
/// <returns>True if oldValue was less than newValue and the update was applied</returns>
public static bool MonotonicUpdate(ref long variable, long newValue, out long oldValue)
{
do
{
oldValue = variable;
if (oldValue >= newValue) return false;
} while (Interlocked.CompareExchange(ref variable, newValue, oldValue) != oldValue);
return true;
}
/// <summary>
/// Updates the variable to newValue only if the current value is smaller than the new value.
/// </summary>
/// <param name="variable">The variable to possibly replace</param>
/// <param name="newValue">The value that replaces the variable if successful</param>
/// <param name="oldValue">The original value in the variable</param>
/// <returns>True if oldValue was less than newValue and the update was applied</returns>
public static bool MonotonicUpdate(ref int variable, int newValue, out int oldValue)
{
do
{
oldValue = variable;
if (oldValue >= newValue) return false;
} while (Interlocked.CompareExchange(ref variable, newValue, oldValue) != oldValue);
return true;
}
}
}
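A small sketch of the intended use of MonotonicUpdate (illustrative): advancing a high-water mark from multiple threads so the address only ever moves forward, regardless of callback ordering:

long flushedUntil = 0;

void OnFlushComplete(long newAddress)
{
    // The CAS loop inside MonotonicUpdate rejects any move backwards.
    if (Utility.MonotonicUpdate(ref flushedUntil, newAddress, out long previous))
        Console.WriteLine($"flushedUntil advanced {previous} -> {newAddress}");
}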

@ -0,0 +1,54 @@
using FASTER.core;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;
using ZeroLevel.Services.FileSystem;
using ZeroLevel.Services.Serialization;
namespace ZeroLevel.Services.Microservices.Dump
{
public class DumpStorage<T>
{
IDevice device;
FasterLog log;
public DumpStorage()
{
var folder = Path.Combine(Configuration.BaseDirectory, "dump");
if (!Directory.Exists(folder))
{
Directory.CreateDirectory(folder);
}
device = Devices.CreateLogDevice(Path.Combine(folder, "dump.log"),
true, true, -1, false);
log = new FasterLog(new FasterLogSettings { LogDevice = device });
}
public void Dump(T value)
{
var packet = MessageSerializer.SerializeCompatible(value);
// Spin until the entry is accepted into the log's buffer
while (!log.TryEnqueue(packet, out _)) ;
log.Commit();
}
public async Task DumpAsync(T value)
{
var packet = MessageSerializer.SerializeCompatible(value);
await log.EnqueueAndWaitForCommitAsync(packet);
}
public IEnumerable<T> ReadAndTruncate()
{
byte[] result;
using (var iter = log.Scan(log.BeginAddress, log.TailAddress))
{
while (iter.GetNext(out result, out int length))
{
yield return MessageSerializer.DeserializeCompatible<T>(result);
}
log.TruncateUntil(iter.NextAddress);
}
}
}
}
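A hedged async usage sketch for the storage above; MyDto is a hypothetical stand-in for any type the ZeroLevel MessageSerializer can handle:

var storage = new DumpStorage<MyDto>(); // MyDto: hypothetical serializable DTO
await storage.DumpAsync(new MyDto());   // enqueue and await the log commit

Unlike Dump, which spins on TryEnqueue and commits synchronously after every record, DumpAsync awaits EnqueueAndWaitForCommitAsync, so the caller is not blocked while the log flushes.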

@ -36,7 +36,11 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="System.Buffers" Version="4.5.0" />
<PackageReference Include="System.Memory" Version="4.5.3" />
<PackageReference Include="System.Runtime.CompilerServices.Unsafe" Version="4.6.0" />
<PackageReference Include="System.ServiceProcess.ServiceController" Version="4.6.0" />
<PackageReference Include="System.Threading.Tasks.Extensions" Version="4.5.3" />
</ItemGroup>
<ItemGroup>
