From c5dfd30ee2028ac4de2f06f1ffd3e3183a417f55 Mon Sep 17 00:00:00 2001
From: "a.bozhenov"
Date: Thu, 31 Oct 2019 21:39:37 +0300
Subject: [PATCH] FASTER

Copy project 'https://github.com/microsoft/FASTER' to ZeroLevel
Append DumpStorage
---
 TestApp/Program.cs | 3 +
 ZeroLevel.UnitTests/DumpTests.cs | 38 +
 .../Models/TestSerializableDTO.cs | 41 +
 .../FASTER/Allocator/AllocatorBase.cs | 1565 +++++++++++
 .../FASTER/Allocator/AsyncIOContext.cs | 80 +
 .../Services/FASTER/Allocator/AtomicOwner.cs | 91 +
 .../FASTER/Allocator/BlittableAllocator.cs | 401 +++
 .../FASTER/Allocator/BlittableFrame.cs | 65 +
 .../FASTER/Allocator/BlittableScanIterator.cs | 238 ++
 .../Services/FASTER/Allocator/ErrorList.cs | 63 +
 .../FASTER/Allocator/GenericAllocator.cs | 968 +++++++
 .../Services/FASTER/Allocator/GenericFrame.cs | 67 +
 .../FASTER/Allocator/GenericScanIterator.cs | 255 ++
 .../FASTER/Allocator/IFasterScanIterator.cs | 69 +
 .../FASTER/Allocator/MallocFixedPageSize.cs | 656 +++++
 .../FASTER/Allocator/PendingFlushList.cs | 56 +
 .../Allocator/VarLenBlittableAllocator.cs | 504 ++++
 .../Allocator/VarLenBlittableScanIterator.cs | 228 ++
 ZeroLevel/Services/FASTER/Device/Devices.cs | 52 +
 ZeroLevel/Services/FASTER/Device/IDevice.cs | 161 ++
 .../FASTER/Device/LocalStorageDevice.cs | 303 +++
 .../Device/ManagedLocalStorageDevice.cs | 302 +++
 .../Services/FASTER/Device/NullDevice.cs | 88 +
 .../FASTER/Device/ShardedStorageDevice.cs | 312 +++
 .../FASTER/Device/StorageDeviceBase.cs | 279 ++
 .../FASTER/Device/TieredStorageDevice.cs | 176 ++
 .../Services/FASTER/Epochs/FastThreadLocal.cs | 81 +
 .../Services/FASTER/Epochs/LightEpoch.cs | 450 ++++
 .../FASTER/Index/Common/AddressInfo.cs | 95 +
 .../FASTER/Index/Common/CheckpointSettings.cs | 48 +
 .../Services/FASTER/Index/Common/Contexts.cs | 479 ++++
 .../FASTER/Index/Common/HeapContainer.cs | 72 +
 .../FASTER/Index/Common/LogSettings.cs | 185 ++
 .../FASTER/Index/Common/RecordInfo.cs | 243 ++
 .../FASTER/Index/FASTER/Extensions.cs | 72 +
 .../Services/FASTER/Index/FASTER/FASTER.cs | 488 ++++
 .../FASTER/Index/FASTER/FASTERBase.cs | 794 ++++++
 .../FASTER/Index/FASTER/FASTERImpl.cs | 2332 +++++++++++++++++
 .../FASTER/Index/FASTER/FASTERThread.cs | 364 +++
 .../FASTER/Index/FASTER/LogAccessor.cs | 347 +++
 .../Index/FasterLog/CommitFailureException.cs | 25 +
 .../FASTER/Index/FasterLog/CommitInfo.cs | 51 +
 .../FASTER/Index/FasterLog/FasterLog.cs | 941 +++++++
 .../Index/FasterLog/FasterLogIterator.cs | 425 +++
 .../Index/FasterLog/FasterLogRecoveryInfo.cs | 160 ++
 .../Index/FasterLog/FasterLogSettings.cs | 99 +
 .../Index/FasterLog/ILogCommitManager.cs | 27 +
 .../Index/FasterLog/IReadOnlySpanBatch.cs | 26 +
 .../Index/FasterLog/LocalLogCommitManager.cs | 64 +
 .../FASTER/Index/Interfaces/FunctionsBase.cs | 73 +
 .../Interfaces/IFasterEqualityComparer.cs | 26 +
 .../FASTER/Index/Interfaces/IFasterKV.cs | 202 ++
 .../FASTER/Index/Interfaces/IFunctions.cs | 115 +
 .../Index/Interfaces/IObjectSerializer.cs | 111 +
 .../FASTER/Index/Recovery/Checkpoint.cs | 729 ++++++
 .../Index/Recovery/DirectoryConfiguration.cs | 134 +
 .../Index/Recovery/ICheckpointManager.cs | 111 +
 .../FASTER/Index/Recovery/IndexCheckpoint.cs | 133 +
 .../FASTER/Index/Recovery/IndexRecovery.cs | 144 +
 .../Index/Recovery/LocalCheckpointManager.cs | 206 ++
 .../FASTER/Index/Recovery/Recovery.cs | 500 ++++
 ZeroLevel/Services/FASTER/Readme.txt | 1 +
 .../FASTER/Utilities/AsyncResultTypes.cs | 90 +
 .../Services/FASTER/Utilities/BufferPool.cs | 224 ++
 .../Utilities/FasterEqualityComparer.cs | 28 +
 .../Services/FASTER/Utilities/Native32.cs | 332 +++
 .../FASTER/Utilities/PageAsyncResultTypes.cs | 140 +
 .../Utilities/SafeConcurrentDictionary.cs | 232 ++
 .../FASTER/Utilities/StateTransitions.cs | 73 +
 ZeroLevel/Services/FASTER/Utilities/Status.cs | 45 +
 .../Services/FASTER/Utilities/Utility.cs | 296 +++
 .../Microservices/Dump/DumpStorage.cs | 54 +
 ZeroLevel/ZeroLevel.csproj | 4 +
 73 files changed, 18932 insertions(+)
 create mode 100644 ZeroLevel.UnitTests/DumpTests.cs
 create mode 100644 ZeroLevel.UnitTests/Models/TestSerializableDTO.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/AllocatorBase.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/AsyncIOContext.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/AtomicOwner.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/BlittableAllocator.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/BlittableFrame.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/BlittableScanIterator.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/ErrorList.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/GenericAllocator.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/GenericFrame.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/GenericScanIterator.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/IFasterScanIterator.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/MallocFixedPageSize.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/PendingFlushList.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/VarLenBlittableAllocator.cs
 create mode 100644 ZeroLevel/Services/FASTER/Allocator/VarLenBlittableScanIterator.cs
 create mode 100644 ZeroLevel/Services/FASTER/Device/Devices.cs
 create mode 100644 ZeroLevel/Services/FASTER/Device/IDevice.cs
 create mode 100644 ZeroLevel/Services/FASTER/Device/LocalStorageDevice.cs
 create mode 100644 ZeroLevel/Services/FASTER/Device/ManagedLocalStorageDevice.cs
 create mode 100644 ZeroLevel/Services/FASTER/Device/NullDevice.cs
 create mode 100644 ZeroLevel/Services/FASTER/Device/ShardedStorageDevice.cs
 create mode 100644 ZeroLevel/Services/FASTER/Device/StorageDeviceBase.cs
 create mode 100644 ZeroLevel/Services/FASTER/Device/TieredStorageDevice.cs
 create mode 100644 ZeroLevel/Services/FASTER/Epochs/FastThreadLocal.cs
 create mode 100644 ZeroLevel/Services/FASTER/Epochs/LightEpoch.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Common/AddressInfo.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Common/CheckpointSettings.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Common/Contexts.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Common/HeapContainer.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Common/LogSettings.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Common/RecordInfo.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FASTER/Extensions.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FASTER/FASTER.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FASTER/FASTERBase.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FASTER/FASTERImpl.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FASTER/FASTERThread.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FASTER/LogAccessor.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FasterLog/CommitFailureException.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FasterLog/CommitInfo.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FasterLog/FasterLog.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogIterator.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogRecoveryInfo.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogSettings.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FasterLog/ILogCommitManager.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FasterLog/IReadOnlySpanBatch.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/FasterLog/LocalLogCommitManager.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Interfaces/FunctionsBase.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Interfaces/IFasterEqualityComparer.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Interfaces/IFasterKV.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Interfaces/IFunctions.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Interfaces/IObjectSerializer.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Recovery/Checkpoint.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Recovery/DirectoryConfiguration.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Recovery/ICheckpointManager.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Recovery/IndexCheckpoint.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Recovery/IndexRecovery.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Recovery/LocalCheckpointManager.cs
 create mode 100644 ZeroLevel/Services/FASTER/Index/Recovery/Recovery.cs
 create mode 100644 ZeroLevel/Services/FASTER/Readme.txt
 create mode 100644 ZeroLevel/Services/FASTER/Utilities/AsyncResultTypes.cs
 create mode 100644 ZeroLevel/Services/FASTER/Utilities/BufferPool.cs
 create mode 100644 ZeroLevel/Services/FASTER/Utilities/FasterEqualityComparer.cs
 create mode 100644 ZeroLevel/Services/FASTER/Utilities/Native32.cs
 create mode 100644 ZeroLevel/Services/FASTER/Utilities/PageAsyncResultTypes.cs
 create mode 100644 ZeroLevel/Services/FASTER/Utilities/SafeConcurrentDictionary.cs
 create mode 100644 ZeroLevel/Services/FASTER/Utilities/StateTransitions.cs
 create mode 100644 ZeroLevel/Services/FASTER/Utilities/Status.cs
 create mode 100644 ZeroLevel/Services/FASTER/Utilities/Utility.cs
 create mode 100644 ZeroLevel/Services/Microservices/Dump/DumpStorage.cs

diff --git a/TestApp/Program.cs b/TestApp/Program.cs
index 1dfcc35..46e21db 100644
--- a/TestApp/Program.cs
+++ b/TestApp/Program.cs
@@ -6,6 +6,9 @@
 namespace TestApp
 {
     private static void Main(string[] args)
     {
+
+
+
         Configuration.Save(Configuration.ReadFromApplicationConfig());
         Bootstrap.Startup(args, () => Configuration.ReadSetFromIniFile("config.ini"))

diff --git a/ZeroLevel.UnitTests/DumpTests.cs b/ZeroLevel.UnitTests/DumpTests.cs
new file mode 100644
index 0000000..236574e
--- /dev/null
+++ b/ZeroLevel.UnitTests/DumpTests.cs
@@ -0,0 +1,38 @@
+using System;
+using System.Linq;
+using Xunit;
+using ZeroLevel.Services.Microservices.Dump;
+using ZeroLevel.UnitTests.Models;
+
+namespace ZeroLevel.UnitTests
+{
+    public class DumpTests
+    {
+        [Fact]
+        public void DumpStorageTest()
+        {
+            // Arrange
+            var storage = new DumpStorage();
+            var arr = new TestSerializableDTO[] {
+                new TestSerializableDTO { Id = 0, Title = "#1", Timestamp = DateTime.UtcNow.Ticks },
+                new TestSerializableDTO { Id = 1, Title = "#2", Timestamp = DateTime.UtcNow.Ticks },
+                new TestSerializableDTO { Id = 2, Title = "#3", Timestamp = DateTime.UtcNow.Ticks }
+            };
+
+            // Act
+            storage.Dump(arr[0]);
+            storage.Dump(arr[1]);
+            storage.Dump(arr[2]);
+
+            // Assert
+            int index = 0;
+            foreach (var entry in storage.ReadAndTruncate())
+            {
+                Assert.True(arr[index].Equals(entry));
+                index++;
+            }
+
+            Assert.True(0 ==
storage.ReadAndTruncate().ToArray().Length); + } + } +} diff --git a/ZeroLevel.UnitTests/Models/TestSerializableDTO.cs b/ZeroLevel.UnitTests/Models/TestSerializableDTO.cs new file mode 100644 index 0000000..6c53e6c --- /dev/null +++ b/ZeroLevel.UnitTests/Models/TestSerializableDTO.cs @@ -0,0 +1,41 @@ +using System; +using ZeroLevel.Services.Serialization; + +namespace ZeroLevel.UnitTests.Models +{ + public class TestSerializableDTO + : IBinarySerializable, IEquatable + { + public long Id { get; set; } + public string Title { get; set; } + public long Timestamp { get; set; } + + public void Deserialize(IBinaryReader reader) + { + this.Id = reader.ReadLong(); + this.Title = reader.ReadString(); + this.Timestamp = reader.ReadLong(); + } + + public override bool Equals(object obj) + { + return this.Equals(obj as TestSerializableDTO); + } + + public bool Equals(TestSerializableDTO other) + { + if (other == null) return false; + if (this.Id != other.Id) return false; + if (this.Timestamp != other.Timestamp) return false; + if (string.Compare(this.Title, other.Title, false) != 0) return false; + return true; + } + + public void Serialize(IBinaryWriter writer) + { + writer.WriteLong(this.Id); + writer.WriteString(this.Title); + writer.WriteLong(this.Timestamp); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Allocator/AllocatorBase.cs b/ZeroLevel/Services/FASTER/Allocator/AllocatorBase.cs new file mode 100644 index 0000000..b26c966 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/AllocatorBase.cs @@ -0,0 +1,1565 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace FASTER.core +{ + internal enum PMMFlushStatus : int { Flushed, InProgress }; + + internal enum PMMCloseStatus : int { Closed, Open }; + + [StructLayout(LayoutKind.Explicit)] + internal struct FullPageStatus + { + [FieldOffset(0)] + public long LastFlushedUntilAddress; + [FieldOffset(8)] + public long LastClosedUntilAddress; + } + + [StructLayout(LayoutKind.Explicit)] + internal struct PageOffset + { + [FieldOffset(0)] + public int Offset; + [FieldOffset(4)] + public int Page; + [FieldOffset(0)] + public long PageAndOffset; + } + + /// + /// Base class for hybrid log memory allocator + /// + /// + /// + public unsafe abstract partial class AllocatorBase : IDisposable + where Key : new() + where Value : new() + { + /// + /// Epoch information + /// + protected readonly LightEpoch epoch; + private readonly bool ownedEpoch; + + /// + /// Comparer + /// + protected readonly IFasterEqualityComparer comparer; + + #region Protected size definitions + /// + /// Buffer size + /// + internal readonly int BufferSize; + /// + /// Log page size + /// + internal readonly int LogPageSizeBits; + + /// + /// Page size + /// + internal readonly int PageSize; + /// + /// Page size mask + /// + internal readonly int PageSizeMask; + /// + /// Buffer size mask + /// + protected readonly int BufferSizeMask; + /// + /// Aligned page size in bytes + /// + protected readonly int AlignedPageSizeBytes; + + /// + /// Total hybrid log size (bits) + /// + protected readonly int LogTotalSizeBits; + /// + /// Total hybrid log size (bytes) + /// + protected readonly long LogTotalSizeBytes; + + /// + /// Segment size in bits + /// + protected readonly int LogSegmentSizeBits; + /// + /// Segment size + /// + protected readonly long SegmentSize; + /// + /// 
Segment buffer size + /// + protected readonly int SegmentBufferSize; + + /// + /// HeadOffset lag (from tail) + /// + protected readonly bool HeadOffsetExtraLag; + + /// + /// HeadOFfset lag address + /// + protected readonly long HeadOffsetLagAddress; + + /// + /// Log mutable fraction + /// + protected readonly double LogMutableFraction; + /// + /// ReadOnlyOffset lag (from tail) + /// + protected readonly long ReadOnlyLagAddress; + + #endregion + + #region Public addresses + /// + /// Read-only address + /// + public long ReadOnlyAddress; + + /// + /// Safe read-only address + /// + public long SafeReadOnlyAddress; + + /// + /// Head address + /// + public long HeadAddress; + + /// + /// Safe head address + /// + public long SafeHeadAddress; + + /// + /// Flushed until address + /// + public long FlushedUntilAddress; + + /// + /// Flushed until address + /// + public long ClosedUntilAddress; + + /// + /// Begin address + /// + public long BeginAddress; + + #endregion + + #region Protected device info + /// + /// Device + /// + protected readonly IDevice device; + /// + /// Sector size + /// + protected readonly int sectorSize; + #endregion + + #region Private page metadata + + // Array that indicates the status of each buffer page + internal readonly FullPageStatus[] PageStatusIndicator; + internal readonly PendingFlushList[] PendingFlush; + + /// + /// Global address of the current tail (next element to be allocated from the circular buffer) + /// + private PageOffset TailPageOffset; + + /// + /// Number of pending reads + /// + private int numPendingReads = 0; + #endregion + + /// + /// Buffer pool + /// + protected SectorAlignedBufferPool bufferPool; + + /// + /// Read cache + /// + protected readonly bool ReadCache = false; + + /// + /// Read cache eviction callback + /// + protected readonly Action EvictCallback = null; + + /// + /// Flush callback + /// + protected readonly Action FlushCallback = null; + + /// + /// Error handling + /// + private readonly ErrorList errorList = new ErrorList(); + + /// + /// Observer for records entering read-only region + /// + internal IObserver> OnReadOnlyObserver; + + #region Abstract methods + /// + /// Initialize + /// + public abstract void Initialize(); + /// + /// Get start logical address + /// + /// + /// + public abstract long GetStartLogicalAddress(long page); + /// + /// Get first valid logical address + /// + /// + /// + public abstract long GetFirstValidLogicalAddress(long page); + /// + /// Get physical address + /// + /// + /// + public abstract long GetPhysicalAddress(long newLogicalAddress); + /// + /// Get address info + /// + /// + /// + public abstract ref RecordInfo GetInfo(long physicalAddress); + + /// + /// Get info from byte pointer + /// + /// + /// + public abstract ref RecordInfo GetInfoFromBytePointer(byte* ptr); + + /// + /// Get key + /// + /// + /// + public abstract ref Key GetKey(long physicalAddress); + /// + /// Get value + /// + /// + /// + public abstract ref Value GetValue(long physicalAddress); + /// + /// Get address info for key + /// + /// + /// + public abstract AddressInfo* GetKeyAddressInfo(long physicalAddress); + /// + /// Get address info for value + /// + /// + /// + public abstract AddressInfo* GetValueAddressInfo(long physicalAddress); + + /// + /// Get record size + /// + /// + /// + public abstract int GetRecordSize(long physicalAddress); + + + /// + /// Get number of bytes required + /// + /// + /// + /// + public virtual int GetRequiredRecordSize(long physicalAddress, int availableBytes) 
=> GetAverageRecordSize(); + + /// + /// Get average record size + /// + /// + public abstract int GetAverageRecordSize(); + /// + /// Get initial record size + /// + /// + /// + /// + /// + public abstract int GetInitialRecordSize(ref Key key, ref Input input); + /// + /// Get record size + /// + /// + /// + /// + public abstract int GetRecordSize(ref Key key, ref Value value); + + /// + /// Allocate page + /// + /// + internal abstract void AllocatePage(int index); + /// + /// Whether page is allocated + /// + /// + /// + protected abstract bool IsAllocated(int pageIndex); + /// + /// Populate page + /// + /// + /// + /// + internal abstract void PopulatePage(byte* src, int required_bytes, long destinationPage); + /// + /// Write async to device + /// + /// + /// + /// + /// + /// + /// + /// + /// + protected abstract void WriteAsyncToDevice(long startPage, long flushPage, int pageSize, IOCompletionCallback callback, PageAsyncFlushResult result, IDevice device, IDevice objectLogDevice); + /// + /// Read objects to memory (async) + /// + /// + /// + /// + /// + /// + protected abstract void AsyncReadRecordObjectsToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext context, SectorAlignedMemory result = default(SectorAlignedMemory)); + /// + /// Read page (async) + /// + /// + /// + /// + /// + /// + /// + /// + /// + protected abstract void ReadAsync(ulong alignedSourceAddress, int destinationPageIndex, uint aligned_read_length, IOCompletionCallback callback, PageAsyncReadResult asyncResult, IDevice device, IDevice objlogDevice); + /// + /// Clear page + /// + /// Page number to be cleared + /// Offset to clear from (if partial clear) + protected abstract void ClearPage(long page, int offset = 0); + /// + /// Write page (async) + /// + /// + /// + /// + /// + protected abstract void WriteAsync(long flushPage, IOCompletionCallback callback, PageAsyncFlushResult asyncResult); + /// + /// Retrieve full record + /// + /// + /// + /// + protected abstract bool RetrievedFullRecord(byte* record, ref AsyncIOContext ctx); + + /// + /// Retrieve value from context + /// + /// + /// + public virtual ref Key GetContextRecordKey(ref AsyncIOContext ctx) => ref ctx.key; + + /// + /// Retrieve value from context + /// + /// + /// + public virtual ref Value GetContextRecordValue(ref AsyncIOContext ctx) => ref ctx.value; + + /// + /// Get heap container for pending key + /// + /// + /// + public abstract IHeapContainer GetKeyContainer(ref Key key); + + /// + /// Get heap container for pending value + /// + /// + /// + public abstract IHeapContainer GetValueContainer(ref Value value); + + /// + /// Copy value to context + /// + /// + /// + public virtual void PutContext(ref AsyncIOContext ctx, ref Value value) => ctx.value = value; + + /// + /// Whether key has objects + /// + /// + public abstract bool KeyHasObjects(); + + /// + /// Whether value has objects + /// + /// + public abstract bool ValueHasObjects(); + + /// + /// Get segment offsets + /// + /// + public abstract long[] GetSegmentOffsets(); + + /// + /// Pull-based scan interface for HLOG + /// + /// + /// + /// + /// + public abstract IFasterScanIterator Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode = ScanBufferingMode.DoublePageBuffering); + + #endregion + + + /// + /// Instantiate base allocator + /// + /// + /// + /// + /// + /// + public AllocatorBase(LogSettings settings, IFasterEqualityComparer comparer, Action evictCallback, LightEpoch epoch, Action flushCallback) + { + if 
(evictCallback != null) + { + ReadCache = true; + EvictCallback = evictCallback; + } + FlushCallback = flushCallback; + + this.comparer = comparer; + if (epoch == null) + { + this.epoch = new LightEpoch(); + ownedEpoch = true; + } + else + this.epoch = epoch; + + settings.LogDevice.Initialize(1L << settings.SegmentSizeBits, epoch); + settings.ObjectLogDevice?.Initialize(1L << settings.SegmentSizeBits, epoch); + + // Page size + LogPageSizeBits = settings.PageSizeBits; + PageSize = 1 << LogPageSizeBits; + PageSizeMask = PageSize - 1; + + // Total HLOG size + LogTotalSizeBits = settings.MemorySizeBits; + LogTotalSizeBytes = 1L << LogTotalSizeBits; + BufferSize = (int)(LogTotalSizeBytes / (1L << LogPageSizeBits)); + BufferSizeMask = BufferSize - 1; + + // HeadOffset lag (from tail). + var headOffsetLagSize = BufferSize - 1; // (ReadCache ? ReadCacheHeadOffsetLagNumPages : HeadOffsetLagNumPages); + if (BufferSize > 1 && HeadOffsetExtraLag) headOffsetLagSize--; + + HeadOffsetLagAddress = (long)headOffsetLagSize << LogPageSizeBits; + + // ReadOnlyOffset lag (from tail). This should not exceed HeadOffset lag. + LogMutableFraction = settings.MutableFraction; + ReadOnlyLagAddress = Math.Min((long)(LogMutableFraction * BufferSize) << LogPageSizeBits, HeadOffsetLagAddress); + + // Segment size + LogSegmentSizeBits = settings.SegmentSizeBits; + SegmentSize = 1 << LogSegmentSizeBits; + SegmentBufferSize = 1 + (LogTotalSizeBytes / SegmentSize < 1 ? 1 : (int)(LogTotalSizeBytes / SegmentSize)); + + if (SegmentSize < PageSize) + throw new Exception("Segment must be at least of page size"); + + if (BufferSize < 1) + { + throw new Exception("Log buffer must be of size at least 1 page"); + } + + PageStatusIndicator = new FullPageStatus[BufferSize]; + PendingFlush = new PendingFlushList[BufferSize]; + for (int i = 0; i < BufferSize; i++) + PendingFlush[i] = new PendingFlushList(); + + device = settings.LogDevice; + sectorSize = (int)device.SectorSize; + AlignedPageSizeBytes = ((PageSize + (sectorSize - 1)) & ~(sectorSize - 1)); + } + + /// + /// Initialize allocator + /// + /// + protected void Initialize(long firstValidAddress) + { + Debug.Assert(firstValidAddress <= PageSize); + + bufferPool = new SectorAlignedBufferPool(1, sectorSize); + + long tailPage = firstValidAddress >> LogPageSizeBits; + int tailPageIndex = (int)(tailPage % BufferSize); + AllocatePage(tailPageIndex); + + // Allocate next page as well + int nextPageIndex = (int)(tailPage + 1) % BufferSize; + if ((!IsAllocated(nextPageIndex))) + { + AllocatePage(nextPageIndex); + } + + SafeReadOnlyAddress = firstValidAddress; + ReadOnlyAddress = firstValidAddress; + SafeHeadAddress = firstValidAddress; + HeadAddress = firstValidAddress; + ClosedUntilAddress = firstValidAddress; + FlushedUntilAddress = firstValidAddress; + BeginAddress = firstValidAddress; + + TailPageOffset.Page = (int)(firstValidAddress >> LogPageSizeBits); + TailPageOffset.Offset = (int)(firstValidAddress & PageSizeMask); + } + + /// + /// Acquire thread + /// + public void Acquire() + { + if (ownedEpoch) + epoch.Acquire(); + } + + /// + /// Release thread + /// + public void Release() + { + if (ownedEpoch) + epoch.Release(); + } + + /// + /// Dispose allocator + /// + public virtual void Dispose() + { + TailPageOffset.Page = 0; + TailPageOffset.Offset = 0; + SafeReadOnlyAddress = 0; + ReadOnlyAddress = 0; + SafeHeadAddress = 0; + HeadAddress = 0; + BeginAddress = 1; + + if (ownedEpoch) + epoch.Dispose(); + bufferPool.Free(); + + OnReadOnlyObserver?.OnCompleted(); + } + + /// + /// 
Delete in-memory portion of the log + /// + internal abstract void DeleteFromMemory(); + + /// + /// Segment size + /// + /// + public long GetSegmentSize() + { + return SegmentSize; + } + + /// + /// Get tail address + /// + /// + public long GetTailAddress() + { + var local = TailPageOffset; + if (local.Offset >= PageSize) + { + local.Page++; + local.Offset = 0; + } + return ((long)local.Page << LogPageSizeBits) | (uint)local.Offset; + } + + /// + /// Get page + /// + /// + /// + public long GetPage(long logicalAddress) + { + return (logicalAddress >> LogPageSizeBits); + } + + /// + /// Get page index for page + /// + /// + /// + public int GetPageIndexForPage(long page) + { + return (int)(page % BufferSize); + } + + /// + /// Get page index for address + /// + /// + /// + public int GetPageIndexForAddress(long address) + { + return (int)((address >> LogPageSizeBits) % BufferSize); + } + + /// + /// Get capacity (number of pages) + /// + /// + public int GetCapacityNumPages() + { + return BufferSize; + } + + + /// + /// Get page size + /// + /// + public long GetPageSize() + { + return PageSize; + } + + /// + /// Get offset in page + /// + /// + /// + public long GetOffsetInPage(long address) + { + return address & PageSizeMask; + } + + /// + /// Get sector size for main hlog device + /// + /// + public int GetDeviceSectorSize() + { + return sectorSize; + } + + /// + /// Try allocate, no thread spinning allowed + /// May return 0 in case of inability to allocate + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long TryAllocate(int numSlots = 1) + { + if (numSlots > PageSize) + throw new Exception("Entry does not fit on page"); + + PageOffset localTailPageOffset = default(PageOffset); + + // Necessary to check because threads keep retrying and we do not + // want to overflow offset more than once per thread + if (TailPageOffset.Offset > PageSize) + return 0; + + // Determine insertion index. + // ReSharper disable once CSharpWarnings::CS0420 +#pragma warning disable 420 + localTailPageOffset.PageAndOffset = Interlocked.Add(ref TailPageOffset.PageAndOffset, numSlots); +#pragma warning restore 420 + + int page = localTailPageOffset.Page; + int offset = localTailPageOffset.Offset - numSlots; + + #region HANDLE PAGE OVERFLOW + if (localTailPageOffset.Offset > PageSize) + { + if (offset > PageSize) + { + return 0; + } + + // The thread that "makes" the offset incorrect + // is the one that is elected to fix it and + // shift read-only/head. 
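// [Editor's note -- annotation added for readability, not part of the patched sources.]
// TryAllocate never spins: when the page overflows it returns 0 and leaves retrying to the
// caller, which is why only one thread is elected here to shift the tail to the next page.
// A minimal caller-side sketch (the names `hlog`, `epoch` and `recordSize` are assumptions
// for illustration, not identifiers from this patch):
//
//     long logicalAddress;
//     while ((logicalAddress = hlog.TryAllocate(recordSize)) == 0)
//     {
//         epoch.ProtectAndDrain();   // let pending epoch actions (flush/close pages) run
//         Thread.Yield();
//     }
//
// The FasterLog append path included later in this patch uses a loop of roughly this shape.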
+ + long shiftAddress = ((long)(localTailPageOffset.Page + 1)) << LogPageSizeBits; + PageAlignedShiftReadOnlyAddress(shiftAddress); + PageAlignedShiftHeadAddress(shiftAddress); + + if (CannotAllocate(localTailPageOffset.Page + 1)) + { + // We should not allocate the next page; reset to end of page + // so that next attempt can retry + localTailPageOffset.Offset = PageSize; + Interlocked.Exchange(ref TailPageOffset.PageAndOffset, localTailPageOffset.PageAndOffset); + return 0; + } + + // Allocate next page in advance, if needed + int nextPageIndex = (localTailPageOffset.Page + 2) % BufferSize; + if ((!IsAllocated(nextPageIndex))) + { + AllocatePage(nextPageIndex); + } + + localTailPageOffset.Page++; + localTailPageOffset.Offset = 0; + TailPageOffset = localTailPageOffset; + + return 0; + } + #endregion + + return (((long)page) << LogPageSizeBits) | ((long)offset); + } + + private bool CannotAllocate(int page) + { + return + (page >= BufferSize + (ClosedUntilAddress >> LogPageSizeBits)); + } + + /// + /// Used by applications to make the current state of the database immutable quickly + /// + /// + public bool ShiftReadOnlyToTail(out long tailAddress) + { + tailAddress = GetTailAddress(); + long localTailAddress = tailAddress; + long currentReadOnlyOffset = ReadOnlyAddress; + if (Utility.MonotonicUpdate(ref ReadOnlyAddress, tailAddress, out long oldReadOnlyOffset)) + { + epoch.BumpCurrentEpoch(() => OnPagesMarkedReadOnly(localTailAddress)); + return true; + } + return false; + } + + /// + /// Used by applications to move read-only forward + /// + /// + public bool ShiftReadOnlyAddress(long newReadOnlyAddress) + { + if (Utility.MonotonicUpdate(ref ReadOnlyAddress, newReadOnlyAddress, out long oldReadOnlyOffset)) + { + epoch.BumpCurrentEpoch(() => OnPagesMarkedReadOnly(newReadOnlyAddress)); + return true; + } + return false; + } + + /// + /// Shift begin address + /// + /// + public void ShiftBeginAddress(long newBeginAddress) + { + // First update the begin address + var b = Utility.MonotonicUpdate(ref BeginAddress, newBeginAddress, out long oldBeginAddress); + b = b && (oldBeginAddress >> LogSegmentSizeBits != newBeginAddress >> LogSegmentSizeBits); + + // Then the head address + var h = Utility.MonotonicUpdate(ref HeadAddress, newBeginAddress, out long old); + + // Finally the read-only address + var r = Utility.MonotonicUpdate(ref ReadOnlyAddress, newBeginAddress, out old); + + if (h || r || b) + { + epoch.Resume(); + // Clean up until begin address + epoch.BumpCurrentEpoch(() => + { + if (r) + { + Utility.MonotonicUpdate(ref SafeReadOnlyAddress, newBeginAddress, out long _old); + Utility.MonotonicUpdate(ref FlushedUntilAddress, newBeginAddress, out _old); + } + if (h) OnPagesClosed(newBeginAddress); + + if (b) TruncateUntilAddress(newBeginAddress); + }); + epoch.Suspend(); + } + } + + /// + /// Wraps when an allocator potentially has to interact with multiple devices + /// + /// + protected virtual void TruncateUntilAddress(long toAddress) + { + device.TruncateUntilAddress(toAddress); + } + + /// + /// Seal: make sure there are no longer any threads writing to the page + /// Flush: send page to secondary store + /// + /// + public void OnPagesMarkedReadOnly(long newSafeReadOnlyAddress) + { + if (Utility.MonotonicUpdate(ref SafeReadOnlyAddress, newSafeReadOnlyAddress, out long oldSafeReadOnlyAddress)) + { + Debug.WriteLine("SafeReadOnly shifted from {0:X} to {1:X}", oldSafeReadOnlyAddress, newSafeReadOnlyAddress); + OnReadOnlyObserver?.OnNext(Scan(oldSafeReadOnlyAddress, 
newSafeReadOnlyAddress, ScanBufferingMode.NoBuffering)); + AsyncFlushPages(oldSafeReadOnlyAddress, newSafeReadOnlyAddress); + } + } + + /// + /// Action to be performed for when all threads have + /// agreed that a page range is closed. + /// + /// + public void OnPagesClosed(long newSafeHeadAddress) + { + if (Utility.MonotonicUpdate(ref SafeHeadAddress, newSafeHeadAddress, out long oldSafeHeadAddress)) + { + Debug.WriteLine("SafeHeadOffset shifted from {0:X} to {1:X}", oldSafeHeadAddress, newSafeHeadAddress); + + for (long closePageAddress = oldSafeHeadAddress & ~PageSizeMask; closePageAddress < newSafeHeadAddress; closePageAddress += PageSize) + { + if (newSafeHeadAddress < closePageAddress + PageSize) + { + // Partial page - do not close + return; + } + + int closePage = (int)(closePageAddress >> LogPageSizeBits); + int closePageIndex = closePage % BufferSize; + + if (!IsAllocated(closePageIndex)) + AllocatePage(closePageIndex); + else + ClearPage(closePage); + Utility.MonotonicUpdate(ref PageStatusIndicator[closePageIndex].LastClosedUntilAddress, closePageAddress + PageSize, out _); + ShiftClosedUntilAddress(); + if (ClosedUntilAddress > FlushedUntilAddress) + { + throw new Exception($"Closed address {ClosedUntilAddress} exceeds flushed address {FlushedUntilAddress}"); + } + } + } + } + + private void DebugPrintAddresses(long closePageAddress) + { + var _flush = FlushedUntilAddress; + var _readonly = ReadOnlyAddress; + var _safereadonly = SafeReadOnlyAddress; + var _tail = GetTailAddress(); + var _head = HeadAddress; + var _safehead = SafeHeadAddress; + + Console.WriteLine("ClosePageAddress: {0}.{1}", GetPage(closePageAddress), GetOffsetInPage(closePageAddress)); + Console.WriteLine("FlushedUntil: {0}.{1}", GetPage(_flush), GetOffsetInPage(_flush)); + Console.WriteLine("Tail: {0}.{1}", GetPage(_tail), GetOffsetInPage(_tail)); + Console.WriteLine("Head: {0}.{1}", GetPage(_head), GetOffsetInPage(_head)); + Console.WriteLine("SafeHead: {0}.{1}", GetPage(_safehead), GetOffsetInPage(_safehead)); + Console.WriteLine("ReadOnly: {0}.{1}", GetPage(_readonly), GetOffsetInPage(_readonly)); + Console.WriteLine("SafeReadOnly: {0}.{1}", GetPage(_safereadonly), GetOffsetInPage(_safereadonly)); + } + + /// + /// Called every time a new tail page is allocated. Here the read-only is + /// shifted only to page boundaries unlike ShiftReadOnlyToTail where shifting + /// can happen to any fine-grained address. + /// + /// + private void PageAlignedShiftReadOnlyAddress(long currentTailAddress) + { + long currentReadOnlyAddress = ReadOnlyAddress; + long pageAlignedTailAddress = currentTailAddress & ~PageSizeMask; + long desiredReadOnlyAddress = (pageAlignedTailAddress - ReadOnlyLagAddress); + if (Utility.MonotonicUpdate(ref ReadOnlyAddress, desiredReadOnlyAddress, out long oldReadOnlyAddress)) + { + Debug.WriteLine("Allocate: Moving read-only offset from {0:X} to {1:X}", oldReadOnlyAddress, desiredReadOnlyAddress); + epoch.BumpCurrentEpoch(() => OnPagesMarkedReadOnly(desiredReadOnlyAddress)); + } + } + + /// + /// Called whenever a new tail page is allocated or when the user is checking for a failed memory allocation + /// Tries to shift head address based on the head offset lag size. 
+ /// + /// + private void PageAlignedShiftHeadAddress(long currentTailAddress) + { + //obtain local values of variables that can change + long currentHeadAddress = HeadAddress; + long currentFlushedUntilAddress = FlushedUntilAddress; + long pageAlignedTailAddress = currentTailAddress & ~PageSizeMask; + long desiredHeadAddress = (pageAlignedTailAddress - HeadOffsetLagAddress); + + long newHeadAddress = desiredHeadAddress; + if (currentFlushedUntilAddress < newHeadAddress) + { + newHeadAddress = currentFlushedUntilAddress; + } + newHeadAddress = newHeadAddress & ~PageSizeMask; + + if (ReadCache && (newHeadAddress > HeadAddress)) + EvictCallback(HeadAddress, newHeadAddress); + + if (Utility.MonotonicUpdate(ref HeadAddress, newHeadAddress, out long oldHeadAddress)) + { + Debug.WriteLine("Allocate: Moving head offset from {0:X} to {1:X}", oldHeadAddress, newHeadAddress); + epoch.BumpCurrentEpoch(() => OnPagesClosed(newHeadAddress)); + } + } + + /// + /// Tries to shift head address to specified value + /// + /// + public long ShiftHeadAddress(long desiredHeadAddress) + { + //obtain local values of variables that can change + long currentFlushedUntilAddress = FlushedUntilAddress; + + long newHeadAddress = desiredHeadAddress; + if (currentFlushedUntilAddress < newHeadAddress) + { + newHeadAddress = currentFlushedUntilAddress; + } + + if (ReadCache && (newHeadAddress > HeadAddress)) + EvictCallback(HeadAddress, newHeadAddress); + + if (Utility.MonotonicUpdate(ref HeadAddress, newHeadAddress, out long oldHeadAddress)) + { + Debug.WriteLine("Allocate: Moving head offset from {0:X} to {1:X}", oldHeadAddress, newHeadAddress); + epoch.BumpCurrentEpoch(() => OnPagesClosed(newHeadAddress)); + } + return newHeadAddress; + } + + /// + /// Every async flush callback tries to update the flushed until address to the latest value possible + /// Is there a better way to do this with enabling fine-grained addresses (not necessarily at page boundaries)? 
+ /// + protected void ShiftFlushedUntilAddress() + { + long currentFlushedUntilAddress = FlushedUntilAddress; + long page = GetPage(currentFlushedUntilAddress); + + bool update = false; + long pageLastFlushedAddress = PageStatusIndicator[page % BufferSize].LastFlushedUntilAddress; + while (pageLastFlushedAddress >= currentFlushedUntilAddress && currentFlushedUntilAddress >= (page << LogPageSizeBits)) + { + currentFlushedUntilAddress = pageLastFlushedAddress; + update = true; + page++; + pageLastFlushedAddress = PageStatusIndicator[page % BufferSize].LastFlushedUntilAddress; + } + + if (update) + { + if (Utility.MonotonicUpdate(ref FlushedUntilAddress, currentFlushedUntilAddress, out long oldFlushedUntilAddress)) + { + uint errorCode = 0; + if (errorList.Count > 0) + { + errorCode = errorList.CheckAndWait(oldFlushedUntilAddress, currentFlushedUntilAddress); + } + FlushCallback?.Invoke( + new CommitInfo + { + BeginAddress = BeginAddress, + FromAddress = oldFlushedUntilAddress, + UntilAddress = currentFlushedUntilAddress, + ErrorCode = errorCode + }); + + if (errorList.Count > 0) + { + errorList.RemoveUntil(currentFlushedUntilAddress); + } + } + } + } + + /// + /// Shift ClosedUntil address + /// + protected void ShiftClosedUntilAddress() + { + long currentClosedUntilAddress = ClosedUntilAddress; + long page = GetPage(currentClosedUntilAddress); + + bool update = false; + long pageLastClosedAddress = PageStatusIndicator[page % BufferSize].LastClosedUntilAddress; + while (pageLastClosedAddress >= currentClosedUntilAddress && currentClosedUntilAddress >= (page << LogPageSizeBits)) + { + currentClosedUntilAddress = pageLastClosedAddress; + update = true; + page++; + pageLastClosedAddress = PageStatusIndicator[(int)(page % BufferSize)].LastClosedUntilAddress; + } + + if (update) + { + Utility.MonotonicUpdate(ref ClosedUntilAddress, currentClosedUntilAddress, out long oldClosedUntilAddress); + } + } + + /// + /// Reset for recovery + /// + /// + /// + /// + public void RecoveryReset(long tailAddress, long headAddress, long beginAddress) + { + long tailPage = GetPage(tailAddress); + long offsetInPage = GetOffsetInPage(tailAddress); + TailPageOffset.Page = (int)tailPage; + TailPageOffset.Offset = (int)offsetInPage; + + // allocate next page as well - this is an invariant in the allocator! + var pageIndex = (TailPageOffset.Page % BufferSize); + var nextPageIndex = (pageIndex + 1) % BufferSize; + if (tailAddress > 0) + if (!IsAllocated(nextPageIndex)) + AllocatePage(nextPageIndex); + + BeginAddress = beginAddress; + HeadAddress = headAddress; + SafeHeadAddress = headAddress; + ClosedUntilAddress = headAddress; + FlushedUntilAddress = tailAddress; + ReadOnlyAddress = tailAddress; + SafeReadOnlyAddress = tailAddress; + + // for the last page which contains tailoffset, it must be open + pageIndex = GetPageIndexForAddress(tailAddress); + + // clear the last page starting from tail address + ClearPage(pageIndex, (int)GetOffsetInPage(tailAddress)); + + // Printing debug info + Debug.WriteLine("******* Recovered HybridLog Stats *******"); + Debug.WriteLine("Head Address: {0}", HeadAddress); + Debug.WriteLine("Safe Head Address: {0}", SafeHeadAddress); + Debug.WriteLine("ReadOnly Address: {0}", ReadOnlyAddress); + Debug.WriteLine("Safe ReadOnly Address: {0}", SafeReadOnlyAddress); + Debug.WriteLine("Tail Address: {0}", tailAddress); + } + + /// + /// Invoked by users to obtain a record from disk. It uses sector aligned memory to read + /// the record efficiently into memory. 
+ /// + /// + /// + /// + /// + /// + internal void AsyncReadRecordToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext context, SectorAlignedMemory result = default(SectorAlignedMemory)) + { + ulong fileOffset = (ulong)(AlignedPageSizeBytes * (fromLogical >> LogPageSizeBits) + (fromLogical & PageSizeMask)); + ulong alignedFileOffset = (ulong)(((long)fileOffset / sectorSize) * sectorSize); + + uint alignedReadLength = (uint)((long)fileOffset + numBytes - (long)alignedFileOffset); + alignedReadLength = (uint)((alignedReadLength + (sectorSize - 1)) & ~(sectorSize - 1)); + + var record = bufferPool.Get((int)alignedReadLength); + record.valid_offset = (int)(fileOffset - alignedFileOffset); + record.available_bytes = (int)(alignedReadLength - (fileOffset - alignedFileOffset)); + record.required_bytes = numBytes; + + var asyncResult = default(AsyncGetFromDiskResult>); + asyncResult.context = context; + asyncResult.context.record = record; + device.ReadAsync(alignedFileOffset, + (IntPtr)asyncResult.context.record.aligned_pointer, + alignedReadLength, + callback, + asyncResult); + } + + /// + /// Read record to memory - simple version + /// + /// + /// + /// + /// + internal void AsyncReadRecordToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, ref SimpleReadContext context) + { + ulong fileOffset = (ulong)(AlignedPageSizeBytes * (fromLogical >> LogPageSizeBits) + (fromLogical & PageSizeMask)); + ulong alignedFileOffset = (ulong)(((long)fileOffset / sectorSize) * sectorSize); + + uint alignedReadLength = (uint)((long)fileOffset + numBytes - (long)alignedFileOffset); + alignedReadLength = (uint)((alignedReadLength + (sectorSize - 1)) & ~(sectorSize - 1)); + + context.record = bufferPool.Get((int)alignedReadLength); + context.record.valid_offset = (int)(fileOffset - alignedFileOffset); + context.record.available_bytes = (int)(alignedReadLength - (fileOffset - alignedFileOffset)); + context.record.required_bytes = numBytes; + + device.ReadAsync(alignedFileOffset, + (IntPtr)context.record.aligned_pointer, + alignedReadLength, + callback, + context); + } + + /// + /// Read pages from specified device + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + public void AsyncReadPagesFromDevice( + long readPageStart, + int numPages, + long untilAddress, + IOCompletionCallback callback, + TContext context, + long devicePageOffset = 0, + IDevice logDevice = null, IDevice objectLogDevice = null) + { + AsyncReadPagesFromDevice(readPageStart, numPages, untilAddress, callback, context, + out _, devicePageOffset, logDevice, objectLogDevice); + } + + /// + /// Read pages from specified device + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + private void AsyncReadPagesFromDevice( + long readPageStart, + int numPages, + long untilAddress, + IOCompletionCallback callback, + TContext context, + out CountdownEvent completed, + long devicePageOffset = 0, + IDevice device = null, IDevice objectLogDevice = null) + { + var usedDevice = device; + IDevice usedObjlogDevice = objectLogDevice; + + if (device == null) + { + usedDevice = this.device; + } + + completed = new CountdownEvent(numPages); + for (long readPage = readPageStart; readPage < (readPageStart + numPages); readPage++) + { + int pageIndex = (int)(readPage % BufferSize); + if (!IsAllocated(pageIndex)) + { + // Allocate a new page + AllocatePage(pageIndex); + } + else + { + ClearPage(readPage); + } + var asyncResult = new PageAsyncReadResult() + { + page = readPage, + context = 
context, + handle = completed, + maxPtr = PageSize + }; + + ulong offsetInFile = (ulong)(AlignedPageSizeBytes * readPage); + uint readLength = (uint)AlignedPageSizeBytes; + long adjustedUntilAddress = (AlignedPageSizeBytes * (untilAddress >> LogPageSizeBits) + (untilAddress & PageSizeMask)); + + if (adjustedUntilAddress > 0 && ((adjustedUntilAddress - (long)offsetInFile) < PageSize)) + { + readLength = (uint)(adjustedUntilAddress - (long)offsetInFile); + asyncResult.maxPtr = readLength; + readLength = (uint)((readLength + (sectorSize - 1)) & ~(sectorSize - 1)); + } + + if (device != null) + offsetInFile = (ulong)(AlignedPageSizeBytes * (readPage - devicePageOffset)); + + ReadAsync(offsetInFile, pageIndex, readLength, callback, asyncResult, usedDevice, usedObjlogDevice); + } + } + + /// + /// Flush page range to disk + /// Called when all threads have agreed that a page range is sealed. + /// + /// + /// + public void AsyncFlushPages(long fromAddress, long untilAddress) + { + long startPage = fromAddress >> LogPageSizeBits; + long endPage = untilAddress >> LogPageSizeBits; + int numPages = (int)(endPage - startPage); + + long offsetInStartPage = GetOffsetInPage(fromAddress); + long offsetInEndPage = GetOffsetInPage(untilAddress); + + // Extra (partial) page being flushed + if (offsetInEndPage > 0) + numPages++; + + /* Request asynchronous writes to the device. If waitForPendingFlushComplete + * is set, then a CountDownEvent is set in the callback handle. + */ + for (long flushPage = startPage; flushPage < (startPage + numPages); flushPage++) + { + long pageStartAddress = flushPage << LogPageSizeBits; + long pageEndAddress = (flushPage + 1) << LogPageSizeBits; + + var asyncResult = new PageAsyncFlushResult + { + page = flushPage, + count = 1, + partial = false, + fromAddress = pageStartAddress, + untilAddress = pageEndAddress + }; + if ( + ((fromAddress > pageStartAddress) && (fromAddress < pageEndAddress)) || + ((untilAddress > pageStartAddress) && (untilAddress < pageEndAddress)) + ) + { + asyncResult.partial = true; + + if (untilAddress < pageEndAddress) + asyncResult.untilAddress = untilAddress; + + if (fromAddress > pageStartAddress) + asyncResult.fromAddress = fromAddress; + } + + // Partial page starting point, need to wait until the + // ongoing adjacent flush is completed to ensure correctness + if (GetOffsetInPage(asyncResult.fromAddress) > 0) + { + // Enqueue work in shared queue + var index = GetPageIndexForAddress(asyncResult.fromAddress); + PendingFlush[index].Add(asyncResult); + if (PendingFlush[index].RemoveAdjacent(FlushedUntilAddress, out PageAsyncFlushResult request)) + { + WriteAsync(request.fromAddress >> LogPageSizeBits, AsyncFlushPageCallback, request); + } + } + else + WriteAsync(flushPage, AsyncFlushPageCallback, asyncResult); + } + } + + /// + /// Flush pages asynchronously + /// + /// + /// + /// + /// + /// + public void AsyncFlushPages( + long flushPageStart, + int numPages, + IOCompletionCallback callback, + TContext context) + { + for (long flushPage = flushPageStart; flushPage < (flushPageStart + numPages); flushPage++) + { + int pageIndex = GetPageIndexForPage(flushPage); + var asyncResult = new PageAsyncFlushResult() + { + page = flushPage, + context = context, + count = 1, + partial = false, + untilAddress = (flushPage + 1) << LogPageSizeBits + }; + + WriteAsync(flushPage, callback, asyncResult); + } + } + + /// + /// Flush pages from startPage (inclusive) to endPage (exclusive) + /// to specified log device and obj device + /// + /// + /// + /// + /// + 
/// + /// + public void AsyncFlushPagesToDevice(long startPage, long endPage, long endLogicalAddress, IDevice device, IDevice objectLogDevice, out CountdownEvent completed) + { + int totalNumPages = (int)(endPage - startPage); + completed = new CountdownEvent(totalNumPages); + + for (long flushPage = startPage; flushPage < endPage; flushPage++) + { + var asyncResult = new PageAsyncFlushResult + { + handle = completed, + count = 1 + }; + + var pageSize = PageSize; + + if (flushPage == endPage - 1) + pageSize = (int)(endLogicalAddress - (flushPage << LogPageSizeBits)); + + // Intended destination is flushPage + WriteAsyncToDevice(startPage, flushPage, pageSize, AsyncFlushPageToDeviceCallback, asyncResult, device, objectLogDevice); + } + } + + /// + /// Async get from disk + /// + /// + /// + /// + /// + public void AsyncGetFromDisk(long fromLogical, + int numBytes, + AsyncIOContext context, + SectorAlignedMemory result = default(SectorAlignedMemory)) + { + if (epoch.IsProtected()) // Do not spin for unprotected IO threads + { + while (numPendingReads > 120) + { + Thread.Yield(); + epoch.ProtectAndDrain(); + } + } + Interlocked.Increment(ref numPendingReads); + + if (result == null) + AsyncReadRecordToMemory(fromLogical, numBytes, AsyncGetFromDiskCallback, context, result); + else + AsyncReadRecordObjectsToMemory(fromLogical, numBytes, AsyncGetFromDiskCallback, context, result); + } + + private void AsyncGetFromDiskCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + var result = (AsyncGetFromDiskResult>)Overlapped.Unpack(overlap).AsyncResult; + Interlocked.Decrement(ref numPendingReads); + + var ctx = result.context; + + var record = ctx.record.GetValidPointer(); + int requiredBytes = GetRequiredRecordSize((long)record, ctx.record.available_bytes); + if (ctx.record.available_bytes >= requiredBytes) + { + // We have the complete record. + if (RetrievedFullRecord(record, ref ctx)) + { + if (comparer.Equals(ref ctx.request_key.Get(), ref GetContextRecordKey(ref ctx))) + { + // The keys are same, so I/O is complete + // ctx.record = result.record; + ctx.callbackQueue.Add(ctx); + } + else + { + var oldAddress = ctx.logicalAddress; + + // Keys are not same. 
I/O is not complete + ctx.logicalAddress = GetInfoFromBytePointer(record).PreviousAddress; + if (ctx.logicalAddress >= BeginAddress) + { + ctx.record.Return(); + ctx.record = ctx.objBuffer = default(SectorAlignedMemory); + AsyncGetFromDisk(ctx.logicalAddress, requiredBytes, ctx); + } + else + { + ctx.callbackQueue.Add(ctx); + } + } + } + } + else + { + ctx.record.Return(); + AsyncGetFromDisk(ctx.logicalAddress, requiredBytes, ctx); + } + + Overlapped.Free(overlap); + } + + // static DateTime last = DateTime.Now; + + /// + /// IOCompletion callback for page flush + /// + /// + /// + /// + private void AsyncFlushPageCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + /* + if (DateTime.Now - last > TimeSpan.FromSeconds(7)) + { + last = DateTime.Now; + errorCode = 1; + Console.WriteLine("Disk error"); + }*/ + + + // Set the page status to flushed + PageAsyncFlushResult result = (PageAsyncFlushResult)Overlapped.Unpack(overlap).AsyncResult; + + if (Interlocked.Decrement(ref result.count) == 0) + { + if (errorCode != 0) + { + errorList.Add(result.fromAddress); + } + Utility.MonotonicUpdate(ref PageStatusIndicator[result.page % BufferSize].LastFlushedUntilAddress, result.untilAddress, out _); + ShiftFlushedUntilAddress(); + result.Free(); + } + + var _flush = FlushedUntilAddress; + if (GetOffsetInPage(_flush) > 0 && PendingFlush[GetPage(_flush) % BufferSize].RemoveAdjacent(_flush, out PageAsyncFlushResult request)) + { + WriteAsync(request.fromAddress >> LogPageSizeBits, AsyncFlushPageCallback, request); + } + + Overlapped.Free(overlap); + } + + /// + /// IOCompletion callback for page flush + /// + /// + /// + /// + private void AsyncFlushPageToDeviceCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + PageAsyncFlushResult result = (PageAsyncFlushResult)Overlapped.Unpack(overlap).AsyncResult; + + if (Interlocked.Decrement(ref result.count) == 0) + { + result.Free(); + } + Overlapped.Free(overlap); + } + + /// + /// Shallow copy + /// + /// + /// + public virtual void ShallowCopy(ref Key src, ref Key dst) + { + dst = src; + } + + /// + /// Shallow copy + /// + /// + /// + public virtual void ShallowCopy(ref Value src, ref Value dst) + { + dst = src; + } + + private string PrettyPrint(long address) + { + return $"{GetPage(address)}:{GetOffsetInPage(address)}"; + } + } +} diff --git a/ZeroLevel/Services/FASTER/Allocator/AsyncIOContext.cs b/ZeroLevel/Services/FASTER/Allocator/AsyncIOContext.cs new file mode 100644 index 0000000..f0ce5fa --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/AsyncIOContext.cs @@ -0,0 +1,80 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
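// [Editor's note -- annotation added for readability, not part of the patched sources.]
// A short recap of the address arithmetic defined in AllocatorBase above, since the files
// that follow lean on it: a logical address packs a page number and an in-page offset, and
// pages live in a circular in-memory buffer indexed modulo BufferSize. In terms of the
// members defined above:
//
//     long page   = logicalAddress >> LogPageSizeBits;     // GetPage(logicalAddress)
//     long offset = logicalAddress & PageSizeMask;          // GetOffsetInPage(logicalAddress)
//     int  index  = (int)(page % BufferSize);               // GetPageIndexForPage(page)
//     long addr   = (page << LogPageSizeBits) | offset;     // recompose the logical address
//
// BeginAddress, HeadAddress, ReadOnlyAddress and the tail are all plain longs in this space
// and only ever move forward, via Utility.MonotonicUpdate.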
+ +using System; +using System.Collections.Concurrent; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Async IO context for PMM + /// + public unsafe struct AsyncIOContext + { + /// + /// Id + /// + public long id; + + /// + /// Key + /// + public IHeapContainer request_key; + + /// + /// Retrieved key + /// + public Key key; + + /// + /// Retrieved value + /// + public Value value; + + /// + /// Logical address + /// + public long logicalAddress; + + /// + /// Record buffer + /// + public SectorAlignedMemory record; + + /// + /// Object buffer + /// + public SectorAlignedMemory objBuffer; + + /// + /// Callback queue + /// + public BlockingCollection> callbackQueue; + + /// + /// Dispose + /// + public void Dispose() + { + // Do not dispose request_key as it is a shallow copy + // of the key in pendingContext + record.Return(); + } + } + + internal class SimpleReadContext : IAsyncResult + { + public long logicalAddress; + public SectorAlignedMemory record; + public SemaphoreSlim completedRead; + + public object AsyncState => throw new NotImplementedException(); + + public WaitHandle AsyncWaitHandle => throw new NotImplementedException(); + + public bool CompletedSynchronously => throw new NotImplementedException(); + + public bool IsCompleted => throw new NotImplementedException(); + } +} diff --git a/ZeroLevel/Services/FASTER/Allocator/AtomicOwner.cs b/ZeroLevel/Services/FASTER/Allocator/AtomicOwner.cs new file mode 100644 index 0000000..ad7de38 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/AtomicOwner.cs @@ -0,0 +1,91 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System.Threading; +using System.Runtime.InteropServices; +using System; + +namespace FASTER.core +{ + [StructLayout(LayoutKind.Explicit)] + struct AtomicOwner + { + [FieldOffset(0)] + int owner; + [FieldOffset(4)] + int count; + [FieldOffset(0)] + long atomic; + + /// + /// Enqueue token + /// true: success + caller is new owner + /// false: success + someone else is owner + /// + /// + public bool Enqueue() + { + while (true) + { + var older = this; + var newer = older; + newer.count++; + if (older.owner == 0) + newer.owner = 1; + + if (Interlocked.CompareExchange(ref this.atomic, newer.atomic, older.atomic) == older.atomic) + { + return older.owner == 0; + } + } + } + + /// + /// Dequeue token (caller is/remains owner) + /// true: successful dequeue + /// false: failed dequeue + /// + /// + public bool Dequeue() + { + while (true) + { + var older = this; + var newer = older; + newer.count--; + + if (Interlocked.CompareExchange(ref this.atomic, newer.atomic, older.atomic) == older.atomic) + { + return newer.count > 0; + } + } + } + + /// + /// Release queue ownership + /// true: successful release + /// false: failed release + /// + /// + public bool Release() + { + while (true) + { + var older = this; + var newer = older; + + if (newer.count > 0) + return false; + + if (newer.owner == 0) + throw new Exception("Invalid release by non-owner thread"); + newer.owner = 0; + + if (Interlocked.CompareExchange(ref this.atomic, newer.atomic, older.atomic) == older.atomic) + { + return true; + } + } + } + } +} diff --git a/ZeroLevel/Services/FASTER/Allocator/BlittableAllocator.cs b/ZeroLevel/Services/FASTER/Allocator/BlittableAllocator.cs new file mode 100644 index 0000000..a164e71 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/BlittableAllocator.cs @@ -0,0 +1,401 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
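// [Editor's note -- annotation added for readability, not part of the patched sources.]
// The AtomicOwner struct that ends just above packs an owner flag and a pending-work count
// into one 64-bit word so both can be updated with a single CompareExchange. Pieced together
// from its comments, the intended protocol looks roughly like this (a sketch; the helper
// `ProcessOnePendingItem` is a placeholder, not code from this patch):
//
//     if (owner.Enqueue())               // count++; true => this thread became the owner
//     {
//         do
//         {
//             ProcessOnePendingItem();   // drain work on behalf of all enqueuers
//         }
//         while (owner.Dequeue());       // count--; true while more work remains queued
//     }
//     // Threads that get false from Enqueue() return immediately: the owner drains their
//     // work. Release() is the explicit hand-off; it fails while work is still queued and
//     // throws if called by a thread that does not own the queue.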
+// Licensed under the MIT license. + +using System; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Runtime.InteropServices; + +#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member + +namespace FASTER.core +{ + public unsafe sealed class BlittableAllocator : AllocatorBase + where Key : new() + where Value : new() + { + // Circular buffer definition + private byte[][] values; + private GCHandle[] handles; + private long[] pointers; + private readonly GCHandle ptrHandle; + private readonly long* nativePointers; + + // Record sizes + private static readonly int recordSize = Utility.GetSize(default(Record)); + private static readonly int keySize = Utility.GetSize(default(Key)); + private static readonly int valueSize = Utility.GetSize(default(Value)); + + public BlittableAllocator(LogSettings settings, IFasterEqualityComparer comparer, Action evictCallback = null, LightEpoch epoch = null, Action flushCallback = null) + : base(settings, comparer, evictCallback, epoch, flushCallback) + { + values = new byte[BufferSize][]; + handles = new GCHandle[BufferSize]; + pointers = new long[BufferSize]; + + ptrHandle = GCHandle.Alloc(pointers, GCHandleType.Pinned); + nativePointers = (long*)ptrHandle.AddrOfPinnedObject(); + } + + public override void Initialize() + { + Initialize(Constants.kFirstValidAddress); + } + + public override ref RecordInfo GetInfo(long physicalAddress) + { + return ref Unsafe.AsRef((void*)physicalAddress); + } + + public override ref RecordInfo GetInfoFromBytePointer(byte* ptr) + { + return ref Unsafe.AsRef(ptr); + } + + public override ref Key GetKey(long physicalAddress) + { + return ref Unsafe.AsRef((byte*)physicalAddress + RecordInfo.GetLength()); + } + + public override ref Value GetValue(long physicalAddress) + { + return ref Unsafe.AsRef((byte*)physicalAddress + RecordInfo.GetLength() + keySize); + } + + public override int GetRecordSize(long physicalAddress) + { + return recordSize; + } + + public override int GetAverageRecordSize() + { + return recordSize; + } + + public override int GetInitialRecordSize(ref Key key, ref Input input) + { + return recordSize; + } + + public override int GetRecordSize(ref Key key, ref Value value) + { + return recordSize; + } + + /// + /// Dispose memory allocator + /// + public override void Dispose() + { + if (values != null) + { + for (int i = 0; i < values.Length; i++) + { + if (handles[i].IsAllocated) + handles[i].Free(); + values[i] = null; + } + } + handles = null; + pointers = null; + values = null; + base.Dispose(); + } + + public override AddressInfo* GetKeyAddressInfo(long physicalAddress) + { + return (AddressInfo*)((byte*)physicalAddress + RecordInfo.GetLength()); + } + + public override AddressInfo* GetValueAddressInfo(long physicalAddress) + { + return (AddressInfo*)((byte*)physicalAddress + RecordInfo.GetLength() + keySize); + } + + /// + /// Allocate memory page, pinned in memory, and in sector aligned form, if possible + /// + /// + internal override void AllocatePage(int index) + { + var adjustedSize = PageSize + 2 * sectorSize; + byte[] tmp = new byte[adjustedSize]; + Array.Clear(tmp, 0, adjustedSize); + + handles[index] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + long p = (long)handles[index].AddrOfPinnedObject(); + pointers[index] = (p + (sectorSize - 1)) & ~(sectorSize - 1); + values[index] = tmp; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override long GetPhysicalAddress(long logicalAddress) + { + // Offset within page + 
int offset = (int)(logicalAddress & ((1L << LogPageSizeBits) - 1)); + + // Index of page within the circular buffer + int pageIndex = (int)((logicalAddress >> LogPageSizeBits) & (BufferSize - 1)); + return *(nativePointers + pageIndex) + offset; + } + + protected override bool IsAllocated(int pageIndex) + { + return values[pageIndex] != null; + } + + protected override void WriteAsync(long flushPage, IOCompletionCallback callback, PageAsyncFlushResult asyncResult) + { + WriteAsync((IntPtr)pointers[flushPage % BufferSize], + (ulong)(AlignedPageSizeBytes * flushPage), + (uint)AlignedPageSizeBytes, + callback, + asyncResult, device); + } + + protected override void WriteAsyncToDevice + (long startPage, long flushPage, int pageSize, IOCompletionCallback callback, + PageAsyncFlushResult asyncResult, IDevice device, IDevice objectLogDevice) + { + var alignedPageSize = (pageSize + (sectorSize - 1)) & ~(sectorSize - 1); + + WriteAsync((IntPtr)pointers[flushPage % BufferSize], + (ulong)(AlignedPageSizeBytes * (flushPage - startPage)), + (uint)alignedPageSize, callback, asyncResult, + device); + } + + /// + /// Get start logical address + /// + /// + /// + public override long GetStartLogicalAddress(long page) + { + return page << LogPageSizeBits; + } + + + /// + /// Get first valid logical address + /// + /// + /// + public override long GetFirstValidLogicalAddress(long page) + { + if (page == 0) + return (page << LogPageSizeBits) + Constants.kFirstValidAddress; + + return page << LogPageSizeBits; + } + + protected override void ClearPage(long page, int offset) + { + if (offset == 0) + Array.Clear(values[page % BufferSize], offset, values[page % BufferSize].Length - offset); + else + { + // Adjust array offset for cache alignment + offset += (int)(pointers[page % BufferSize] - (long)handles[page % BufferSize].AddrOfPinnedObject()); + Array.Clear(values[page % BufferSize], offset, values[page % BufferSize].Length - offset); + } + } + + /// + /// Delete in-memory portion of the log + /// + internal override void DeleteFromMemory() + { + for (int i = 0; i < values.Length; i++) + { + if (handles[i].IsAllocated) + handles[i].Free(); + values[i] = null; + } + handles = null; + pointers = null; + values = null; + } + + + private void WriteAsync(IntPtr alignedSourceAddress, ulong alignedDestinationAddress, uint numBytesToWrite, + IOCompletionCallback callback, PageAsyncFlushResult asyncResult, + IDevice device) + { + if (asyncResult.partial) + { + // Write only required bytes within the page + int aligned_start = (int)((asyncResult.fromAddress - (asyncResult.page << LogPageSizeBits))); + aligned_start = (aligned_start / sectorSize) * sectorSize; + + int aligned_end = (int)((asyncResult.untilAddress - (asyncResult.page << LogPageSizeBits))); + aligned_end = ((aligned_end + (sectorSize - 1)) & ~(sectorSize - 1)); + + numBytesToWrite = (uint)(aligned_end - aligned_start); + device.WriteAsync(alignedSourceAddress + aligned_start, alignedDestinationAddress + (ulong)aligned_start, numBytesToWrite, callback, asyncResult); + } + else + { + device.WriteAsync(alignedSourceAddress, alignedDestinationAddress, + numBytesToWrite, callback, asyncResult); + } + } + + protected override void ReadAsync( + ulong alignedSourceAddress, int destinationPageIndex, uint aligned_read_length, + IOCompletionCallback callback, PageAsyncReadResult asyncResult, IDevice device, IDevice objlogDevice) + { + device.ReadAsync(alignedSourceAddress, (IntPtr)pointers[destinationPageIndex], + aligned_read_length, callback, asyncResult); + } + + 
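The two pieces of address arithmetic above come up repeatedly in this allocator: pinned pages are over-allocated by two sector sizes and the pinned pointer is rounded up to the next sector boundary, while GetPhysicalAddress splits a logical address into an in-page offset and a circular-buffer slot by shifting and masking. A minimal stand-alone sketch of that power-of-two math, using illustrative constants (the real values come from LogSettings):

    using System;
    using System.Runtime.InteropServices;

    static class AddressMathSketch
    {
        // Illustrative constants; FASTER derives the real ones from LogSettings.
        const int LogPageSizeBits = 22;                        // 4 MB pages
        const long PageSizeMask = (1L << LogPageSizeBits) - 1;
        const int BufferSize = 16;                             // circular buffer slots (power of two)
        const int SectorSize = 512;

        // Round a raw pinned pointer up to the next sector boundary.
        static long AlignUp(long p) => (p + (SectorSize - 1)) & ~(long)(SectorSize - 1);

        static void Main()
        {
            // Over-allocate by two sectors so a fully aligned page window always fits.
            var page = new byte[(1 << LogPageSizeBits) + 2 * SectorSize];
            var handle = GCHandle.Alloc(page, GCHandleType.Pinned);
            long alignedBase = AlignUp((long)handle.AddrOfPinnedObject());

            // Split a logical address into (circular-buffer slot, in-page offset).
            long logicalAddress = (3L << LogPageSizeBits) + 4096;
            long offset = logicalAddress & PageSizeMask;                            // 4096
            long slot = (logicalAddress >> LogPageSizeBits) & (BufferSize - 1);     // 3

            Console.WriteLine($"slot={slot} offset={offset} alignedBase=0x{alignedBase:X}");
            handle.Free();
        }
    }
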
/// + /// Invoked by users to obtain a record from disk. It uses sector aligned memory to read + /// the record efficiently into memory. + /// + /// + /// + /// + /// + /// + protected override void AsyncReadRecordObjectsToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext context, SectorAlignedMemory result = default(SectorAlignedMemory)) + { + throw new InvalidOperationException("AsyncReadRecordObjectsToMemory invalid for BlittableAllocator"); + } + + /// + /// Retrieve objects from object log + /// + /// + /// + /// + protected override bool RetrievedFullRecord(byte* record, ref AsyncIOContext ctx) + { + ShallowCopy(ref GetKey((long)record), ref ctx.key); + ShallowCopy(ref GetValue((long)record), ref ctx.value); + return true; + } + + /// + /// Whether KVS has keys to serialize/deserialize + /// + /// + public override bool KeyHasObjects() + { + return false; + } + + /// + /// Whether KVS has values to serialize/deserialize + /// + /// + public override bool ValueHasObjects() + { + return false; + } + + public override IHeapContainer GetKeyContainer(ref Key key) => new StandardHeapContainer(ref key); + public override IHeapContainer GetValueContainer(ref Value value) => new StandardHeapContainer(ref value); + + public override long[] GetSegmentOffsets() + { + return null; + } + + internal override void PopulatePage(byte* src, int required_bytes, long destinationPage) + { + throw new Exception("BlittableAllocator memory pages are sector aligned - use direct copy"); + // Buffer.MemoryCopy(src, (void*)pointers[destinationPage % BufferSize], required_bytes, required_bytes); + } + + /// + /// Iterator interface for scanning FASTER log + /// + /// + /// + /// + /// + public override IFasterScanIterator Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode) + { + return new BlittableScanIterator(this, beginAddress, endAddress, scanBufferingMode); + } + + + /// + /// Read pages from specified device + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + internal void AsyncReadPagesFromDeviceToFrame( + long readPageStart, + int numPages, + long untilAddress, + IOCompletionCallback callback, + TContext context, + BlittableFrame frame, + out CountdownEvent completed, + long devicePageOffset = 0, + IDevice device = null, + IDevice objectLogDevice = null, + CancellationTokenSource cts = null) + { + var usedDevice = device; + IDevice usedObjlogDevice = objectLogDevice; + + if (device == null) + { + usedDevice = this.device; + } + + completed = new CountdownEvent(numPages); + for (long readPage = readPageStart; readPage < (readPageStart + numPages); readPage++) + { + int pageIndex = (int)(readPage % frame.frameSize); + if (frame.frame[pageIndex] == null) + { + frame.Allocate(pageIndex); + } + else + { + frame.Clear(pageIndex); + } + var asyncResult = new PageAsyncReadResult() + { + page = readPage, + context = context, + handle = completed, + frame = frame, + cts = cts + }; + + ulong offsetInFile = (ulong)(AlignedPageSizeBytes * readPage); + + uint readLength = (uint)AlignedPageSizeBytes; + long adjustedUntilAddress = (AlignedPageSizeBytes * (untilAddress >> LogPageSizeBits) + (untilAddress & PageSizeMask)); + + if (adjustedUntilAddress > 0 && ((adjustedUntilAddress - (long)offsetInFile) < PageSize)) + { + readLength = (uint)(adjustedUntilAddress - (long)offsetInFile); + readLength = (uint)((readLength + (sectorSize - 1)) & ~(sectorSize - 1)); + } + + if (device != null) + offsetInFile = (ulong)(AlignedPageSizeBytes * 
(readPage - devicePageOffset)); + + usedDevice.ReadAsync(offsetInFile, (IntPtr)frame.pointers[pageIndex], readLength, callback, asyncResult); + } + } + } +} + + diff --git a/ZeroLevel/Services/FASTER/Allocator/BlittableFrame.cs b/ZeroLevel/Services/FASTER/Allocator/BlittableFrame.cs new file mode 100644 index 0000000..98a123f --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/BlittableFrame.cs @@ -0,0 +1,65 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Runtime.InteropServices; + +namespace FASTER.core +{ + /// + /// A frame is an in-memory circular buffer of log pages + /// + internal sealed class BlittableFrame : IDisposable + { + public readonly int frameSize, pageSize, sectorSize; + public readonly byte[][] frame; + public GCHandle[] handles; + public long[] pointers; + + public BlittableFrame(int frameSize, int pageSize, int sectorSize) + { + this.frameSize = frameSize; + this.pageSize = pageSize; + this.sectorSize = sectorSize; + + frame = new byte[frameSize][]; + handles = new GCHandle[frameSize]; + pointers = new long[frameSize]; + } + + public void Allocate(int index) + { + var adjustedSize = pageSize + 2 * sectorSize; + byte[] tmp = new byte[adjustedSize]; + Array.Clear(tmp, 0, adjustedSize); + + handles[index] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + long p = (long)handles[index].AddrOfPinnedObject(); + pointers[index] = (p + (sectorSize - 1)) & ~(sectorSize - 1); + frame[index] = tmp; + } + + public void Clear(int pageIndex) + { + Array.Clear(frame[pageIndex], 0, frame[pageIndex].Length); + } + + public long GetPhysicalAddress(long frameNumber, long offset) + { + return pointers[frameNumber % frameSize] + offset; + } + + public void Dispose() + { + for (int i = 0; i < frameSize; i++) + { + if (handles[i] != default(GCHandle)) + handles[i].Free(); + frame[i] = null; + pointers[i] = 0; + } + } + } +} + + diff --git a/ZeroLevel/Services/FASTER/Allocator/BlittableScanIterator.cs b/ZeroLevel/Services/FASTER/Allocator/BlittableScanIterator.cs new file mode 100644 index 0000000..05cb1ae --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/BlittableScanIterator.cs @@ -0,0 +1,238 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
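The BlittableFrame above is the staging buffer that the scan iterator defined next reads cold pages into: log page n lands in slot n % frameSize, and a record inside the frame is addressed as that slot's pinned pointer plus the in-page offset. A rough illustration of the wrap-around, assuming a two-slot frame (the DoublePageBuffering case):

    // Hypothetical helper mirroring BlittableFrame's slot selection; two slots
    // correspond to ScanBufferingMode.DoublePageBuffering.
    static class FrameIndexSketch
    {
        const int FrameSize = 2;

        // Frame slot that holds a given log page.
        static long SlotOf(long logPage) => logPage % FrameSize;

        static void Demo()
        {
            // Pages 0,1,2,3 map to slots 0,1,0,1: loading page 2 overwrites page 0,
            // which is why the iterator prefetches page n+1 while it consumes page n.
            for (long page = 0; page < 4; page++)
                System.Console.WriteLine($"log page {page} -> frame slot {SlotOf(page)}");
        }
    }
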
+ +using System; +using System.Threading; +using System.Diagnostics; + +namespace FASTER.core +{ + /// + /// Scan iterator for hybrid log + /// + public class BlittableScanIterator : IFasterScanIterator + where Key : new() + where Value : new() + { + private readonly int frameSize; + private readonly BlittableAllocator hlog; + private readonly long beginAddress, endAddress; + private readonly BlittableFrame frame; + private readonly CountdownEvent[] loaded; + + private bool first = true; + private long currentAddress, nextAddress; + private long currentPhysicalAddress; + + /// + /// Current address + /// + public long CurrentAddress => currentAddress; + + /// + /// Constructor + /// + /// + /// + /// + /// + public unsafe BlittableScanIterator(BlittableAllocator hlog, long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode) + { + this.hlog = hlog; + + if (beginAddress == 0) + beginAddress = hlog.GetFirstValidLogicalAddress(0); + + this.beginAddress = beginAddress; + this.endAddress = endAddress; + currentAddress = -1; + nextAddress = beginAddress; + + if (scanBufferingMode == ScanBufferingMode.SinglePageBuffering) + frameSize = 1; + else if (scanBufferingMode == ScanBufferingMode.DoublePageBuffering) + frameSize = 2; + else if (scanBufferingMode == ScanBufferingMode.NoBuffering) + { + frameSize = 0; + return; + } + + frame = new BlittableFrame(frameSize, hlog.PageSize, hlog.GetDeviceSectorSize()); + loaded = new CountdownEvent[frameSize]; + + // Only load addresses flushed to disk + if (nextAddress < hlog.HeadAddress) + { + var frameNumber = (nextAddress >> hlog.LogPageSizeBits) % frameSize; + hlog.AsyncReadPagesFromDeviceToFrame + (nextAddress >> hlog.LogPageSizeBits, + 1, endAddress, AsyncReadPagesCallback, Empty.Default, + frame, out loaded[frameNumber]); + } + } + + /// + /// Gets reference to current key + /// + /// + public ref Key GetKey() + { + return ref hlog.GetKey(currentPhysicalAddress); + } + + /// + /// Gets reference to current value + /// + /// + public ref Value GetValue() + { + return ref hlog.GetValue(currentPhysicalAddress); + } + + /// + /// Get next record + /// + /// + /// True if record found, false if end of scan + public bool GetNext(out RecordInfo recordInfo) + { + recordInfo = default(RecordInfo); + + currentAddress = nextAddress; + while (true) + { + // Check for boundary conditions + if (currentAddress >= endAddress) + { + return false; + } + + if (currentAddress < hlog.BeginAddress) + { + throw new Exception("Iterator address is less than log BeginAddress " + hlog.BeginAddress); + } + + if (frameSize == 0 && currentAddress < hlog.HeadAddress) + { + throw new Exception("Iterator address is less than log HeadAddress in memory-scan mode"); + } + + var currentPage = currentAddress >> hlog.LogPageSizeBits; + var offset = currentAddress & hlog.PageSizeMask; + + if (currentAddress < hlog.HeadAddress) + BufferAndLoad(currentAddress, currentPage, currentPage % frameSize); + + var physicalAddress = default(long); + if (currentAddress >= hlog.HeadAddress) + physicalAddress = hlog.GetPhysicalAddress(currentAddress); + else + physicalAddress = frame.GetPhysicalAddress(currentPage % frameSize, offset); + + // Check if record fits on page, if not skip to next page + var recordSize = hlog.GetRecordSize(physicalAddress); + if ((currentAddress & hlog.PageSizeMask) + recordSize > hlog.PageSize) + { + currentAddress = (1 + (currentAddress >> hlog.LogPageSizeBits)) << hlog.LogPageSizeBits; + continue; + } + + ref var info = ref hlog.GetInfo(physicalAddress); + if 
(info.Invalid || info.IsNull()) + { + currentAddress += recordSize; + continue; + } + + currentPhysicalAddress = physicalAddress; + recordInfo = info; + nextAddress = currentAddress + recordSize; + return true; + } + } + + /// + /// Get next record in iterator + /// + /// + /// + /// + /// + public bool GetNext(out RecordInfo recordInfo, out Key key, out Value value) + { + key = default(Key); + value = default(Value); + + if (GetNext(out recordInfo)) + { + key = GetKey(); + value = GetValue(); + return true; + } + + return false; + } + + /// + /// Dispose the iterator + /// + public void Dispose() + { + frame?.Dispose(); + } + + private unsafe void BufferAndLoad(long currentAddress, long currentPage, long currentFrame) + { + if (first || (currentAddress & hlog.PageSizeMask) == 0) + { + // Prefetch pages based on buffering mode + if (frameSize == 1) + { + if (!first) + { + hlog.AsyncReadPagesFromDeviceToFrame(currentAddress >> hlog.LogPageSizeBits, 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[currentFrame]); + } + } + else + { + var endPage = endAddress >> hlog.LogPageSizeBits; + if ((endPage > currentPage) && + ((endPage > currentPage + 1) || ((endAddress & hlog.PageSizeMask) != 0))) + { + hlog.AsyncReadPagesFromDeviceToFrame(1 + (currentAddress >> hlog.LogPageSizeBits), 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[(currentPage + 1) % frameSize]); + } + } + first = false; + } + loaded[currentFrame].Wait(); + } + + private unsafe void AsyncReadPagesCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + var result = (PageAsyncReadResult)Overlapped.Unpack(overlap).AsyncResult; + + if (result.freeBuffer1 != null) + { + hlog.PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, result.page); + result.freeBuffer1.Return(); + result.freeBuffer1 = null; + } + + if (result.handle != null) + { + result.handle.Signal(); + } + + Interlocked.MemoryBarrier(); + Overlapped.Free(overlap); + } + } +} + + diff --git a/ZeroLevel/Services/FASTER/Allocator/ErrorList.cs b/ZeroLevel/Services/FASTER/Allocator/ErrorList.cs new file mode 100644 index 0000000..59d8b48 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/ErrorList.cs @@ -0,0 +1,63 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
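Both concrete scan iterators in this patch implement the same GetNext contract, so draining a log range reduces to a simple loop over the IFasterScanIterator interface defined further down. A hedged usage sketch (the iterator instance would come from the allocator's Scan method shown earlier; CountLiveRecords is a hypothetical helper):

    using FASTER.core;

    static class ScanUsageSketch
    {
        // Hypothetical helper: walk an iterator to its end, counting non-tombstoned records.
        // Tombstones are returned by GetNext, so callers filter them explicitly.
        static long CountLiveRecords<Key, Value>(IFasterScanIterator<Key, Value> iterator)
        {
            long live = 0;
            while (iterator.GetNext(out RecordInfo info, out Key key, out Value value))
            {
                if (!info.Tombstone)
                    live++;
            }
            return live;
        }
    }
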
+ +using System.Collections.Generic; +using System.Threading; + +namespace FASTER.core +{ + class ErrorList + { + private readonly List errorList; + + public ErrorList() => errorList = new List(); + + public void Add(long address) + { + lock (errorList) + errorList.Add(address); + } + + public uint CheckAndWait(long oldFlushedUntilAddress, long currentFlushedUntilAddress) + { + bool done = false; + uint errorCode = 0; + while (!done) + { + done = true; + lock (errorList) + { + for (int i = 0; i < errorList.Count; i++) + { + if (errorList[i] >= oldFlushedUntilAddress && errorList[i] < currentFlushedUntilAddress) + { + errorCode = 1; + } + else if (errorList[i] < oldFlushedUntilAddress) + { + done = false; // spin barrier for other threads during exception + Thread.Yield(); + } + } + } + } + return errorCode; + } + + public void RemoveUntil(long currentFlushedUntilAddress) + { + lock (errorList) + { + for (int i = 0; i < errorList.Count; i++) + { + if (errorList[i] < currentFlushedUntilAddress) + { + errorList.RemoveAt(i); + } + } + } + + } + public int Count => errorList.Count; + } +} diff --git a/ZeroLevel/Services/FASTER/Allocator/GenericAllocator.cs b/ZeroLevel/Services/FASTER/Allocator/GenericAllocator.cs new file mode 100644 index 0000000..23b747b --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/GenericAllocator.cs @@ -0,0 +1,968 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Collections.Generic; +using System.IO; +using System.Diagnostics; +using System.Runtime.InteropServices; + +#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member + +namespace FASTER.core +{ + [StructLayout(LayoutKind.Sequential, Pack=1)] + public struct Record + { + public RecordInfo info; + public Key key; + public Value value; + } + + + public unsafe sealed class GenericAllocator : AllocatorBase + where Key : new() + where Value : new() + { + // Circular buffer definition + internal Record[][] values; + + // Object log related variables + private readonly IDevice objectLogDevice; + // Size of object chunks beign written to storage + private const int ObjectBlockSize = 100 * (1 << 20); + // Tail offsets per segment, in object log + public readonly long[] segmentOffsets; + // Record sizes + private static readonly int recordSize = Utility.GetSize(default(Record)); + private readonly SerializerSettings SerializerSettings; + private readonly bool keyBlittable = Utility.IsBlittable(); + private readonly bool valueBlittable = Utility.IsBlittable(); + + public GenericAllocator(LogSettings settings, SerializerSettings serializerSettings, IFasterEqualityComparer comparer, Action evictCallback = null, LightEpoch epoch = null, Action flushCallback = null) + : base(settings, comparer, evictCallback, epoch, flushCallback) + { + SerializerSettings = serializerSettings; + + if ((!keyBlittable) && (settings.LogDevice as NullDevice == null) && ((SerializerSettings == null) || (SerializerSettings.keySerializer == null))) + { + throw new Exception("Key is not blittable, but no serializer specified via SerializerSettings"); + } + + if ((!valueBlittable) && (settings.LogDevice as NullDevice == null) && ((SerializerSettings == null) || (SerializerSettings.valueSerializer == null))) + { + throw new Exception("Value is not blittable, but no serializer specified via SerializerSettings"); + } + + values = new Record[BufferSize][]; + segmentOffsets 
= new long[SegmentBufferSize]; + + objectLogDevice = settings.ObjectLogDevice; + + if ((settings.LogDevice as NullDevice == null) && (KeyHasObjects() || ValueHasObjects())) + { + if (objectLogDevice == null) + throw new Exception("Objects in key/value, but object log not provided during creation of FASTER instance"); + } + } + + public override void Initialize() + { + Initialize(recordSize); + } + + /// + /// Get start logical address + /// + /// + /// + public override long GetStartLogicalAddress(long page) + { + return page << LogPageSizeBits; + } + + /// + /// Get first valid logical address + /// + /// + /// + public override long GetFirstValidLogicalAddress(long page) + { + if (page == 0) + return (page << LogPageSizeBits) + recordSize; + + return page << LogPageSizeBits; + } + + public override ref RecordInfo GetInfo(long physicalAddress) + { + // Offset within page + int offset = (int)(physicalAddress & PageSizeMask); + + // Index of page within the circular buffer + int pageIndex = (int)((physicalAddress >> LogPageSizeBits) & BufferSizeMask); + + return ref values[pageIndex][offset/recordSize].info; + } + + public override ref RecordInfo GetInfoFromBytePointer(byte* ptr) + { + return ref Unsafe.AsRef>(ptr).info; + } + + + public override ref Key GetKey(long physicalAddress) + { + // Offset within page + int offset = (int)(physicalAddress & PageSizeMask); + + // Index of page within the circular buffer + int pageIndex = (int)((physicalAddress >> LogPageSizeBits) & BufferSizeMask); + + return ref values[pageIndex][offset / recordSize].key; + } + + public override ref Value GetValue(long physicalAddress) + { + // Offset within page + int offset = (int)(physicalAddress & PageSizeMask); + + // Index of page within the circular buffer + int pageIndex = (int)((physicalAddress >> LogPageSizeBits) & BufferSizeMask); + + return ref values[pageIndex][offset / recordSize].value; + } + + public override int GetRecordSize(long physicalAddress) + { + return recordSize; + } + + public override int GetAverageRecordSize() + { + return recordSize; + } + + public override int GetInitialRecordSize(ref Key key, ref Input input) + { + return recordSize; + } + + public override int GetRecordSize(ref Key key, ref Value value) + { + return recordSize; + } + + /// + /// Dispose memory allocator + /// + public override void Dispose() + { + if (values != null) + { + for (int i = 0; i < values.Length; i++) + { + values[i] = null; + } + values = null; + } + base.Dispose(); + } + + /// + /// Delete in-memory portion of the log + /// + internal override void DeleteFromMemory() + { + for (int i = 0; i < values.Length; i++) + { + values[i] = null; + } + values = null; + } + + public override AddressInfo* GetKeyAddressInfo(long physicalAddress) + { + return (AddressInfo*)Unsafe.AsPointer(ref Unsafe.AsRef>((byte*)physicalAddress).key); + } + + public override AddressInfo* GetValueAddressInfo(long physicalAddress) + { + return (AddressInfo*)Unsafe.AsPointer(ref Unsafe.AsRef>((byte*)physicalAddress).value); + } + + /// + /// Allocate memory page, pinned in memory, and in sector aligned form, if possible + /// + /// + internal override void AllocatePage(int index) + { + values[index] = AllocatePage(); + } + + internal Record[] AllocatePage() + { + Record[] tmp; + if (PageSize % recordSize == 0) + tmp = new Record[PageSize / recordSize]; + else + tmp = new Record[1 + (PageSize / recordSize)]; + Array.Clear(tmp, 0, tmp.Length); + return tmp; + } + + public override long GetPhysicalAddress(long logicalAddress) + { + return 
logicalAddress; + } + + protected override bool IsAllocated(int pageIndex) + { + return values[pageIndex] != null; + } + + protected override void TruncateUntilAddress(long toAddress) + { + base.TruncateUntilAddress(toAddress); + objectLogDevice.TruncateUntilAddress(toAddress); + } + + protected override void WriteAsync(long flushPage, IOCompletionCallback callback, PageAsyncFlushResult asyncResult) + { + WriteAsync(flushPage, + (ulong)(AlignedPageSizeBytes * flushPage), + (uint)PageSize, + callback, + asyncResult, device, objectLogDevice); + } + + protected override void WriteAsyncToDevice + (long startPage, long flushPage, int pageSize, IOCompletionCallback callback, + PageAsyncFlushResult asyncResult, IDevice device, IDevice objectLogDevice) + { + // We are writing to separate device, so use fresh segment offsets + WriteAsync(flushPage, + (ulong)(AlignedPageSizeBytes * (flushPage - startPage)), + (uint)pageSize, callback, asyncResult, + device, objectLogDevice, flushPage, new long[SegmentBufferSize]); + } + + + + protected override void ClearPage(long page, int offset) + { + Array.Clear(values[page % BufferSize], offset / recordSize, values[page % BufferSize].Length - offset / recordSize); + + // Close segments + var thisCloseSegment = page >> (LogSegmentSizeBits - LogPageSizeBits); + var nextCloseSegment = (page + 1) >> (LogSegmentSizeBits - LogPageSizeBits); + + if (thisCloseSegment != nextCloseSegment) + { + // We are clearing the last page in current segment + segmentOffsets[thisCloseSegment % SegmentBufferSize] = 0; + } + } + + private void WriteAsync(long flushPage, ulong alignedDestinationAddress, uint numBytesToWrite, + IOCompletionCallback callback, PageAsyncFlushResult asyncResult, + IDevice device, IDevice objlogDevice, long intendedDestinationPage = -1, long[] localSegmentOffsets = null) + { + // Short circuit if we are using a null device + if (device as NullDevice != null) + { + device.WriteAsync(IntPtr.Zero, 0, 0, numBytesToWrite, callback, asyncResult); + return; + } + + int start = 0, aligned_start = 0, end = (int)numBytesToWrite; + if (asyncResult.partial) + { + start = (int)((asyncResult.fromAddress - (asyncResult.page << LogPageSizeBits))); + aligned_start = (start / sectorSize) * sectorSize; + end = (int)((asyncResult.untilAddress - (asyncResult.page << LogPageSizeBits))); + } + + // Check if user did not override with special segment offsets + if (localSegmentOffsets == null) localSegmentOffsets = segmentOffsets; + + var src = values[flushPage % BufferSize]; + var buffer = bufferPool.Get((int)numBytesToWrite); + + if (aligned_start < start && (KeyHasObjects() || ValueHasObjects())) + { + // Do not read back the invalid header of page 0 + if ((flushPage > 0) || (start > GetFirstValidLogicalAddress(flushPage))) + { + // Get the overlapping HLOG from disk as we wrote it with + // object pointers previously. 
This avoids object reserialization + PageAsyncReadResult result = + new PageAsyncReadResult + { + handle = new CountdownEvent(1) + }; + device.ReadAsync(alignedDestinationAddress + (ulong)aligned_start, (IntPtr)buffer.aligned_pointer + aligned_start, + (uint)sectorSize, AsyncReadPageCallback, result); + result.handle.Wait(); + } + fixed (RecordInfo* pin = &src[0].info) + { + Debug.Assert(buffer.aligned_pointer + numBytesToWrite <= (byte*)buffer.handle.AddrOfPinnedObject() + buffer.buffer.Length); + + Buffer.MemoryCopy((void*)((long)Unsafe.AsPointer(ref src[0]) + start), buffer.aligned_pointer + start, + numBytesToWrite - start, numBytesToWrite - start); + } + } + else + { + fixed (RecordInfo* pin = &src[0].info) + { + Debug.Assert(buffer.aligned_pointer + numBytesToWrite <= (byte*)buffer.handle.AddrOfPinnedObject() + buffer.buffer.Length); + + Buffer.MemoryCopy((void*)((long)Unsafe.AsPointer(ref src[0]) + aligned_start), buffer.aligned_pointer + aligned_start, + numBytesToWrite - aligned_start, numBytesToWrite - aligned_start); + } + } + + long ptr = (long)buffer.aligned_pointer; + List addr = new List(); + asyncResult.freeBuffer1 = buffer; + + MemoryStream ms = new MemoryStream(); + IObjectSerializer keySerializer = null; + IObjectSerializer valueSerializer = null; + + if (KeyHasObjects()) + { + keySerializer = SerializerSettings.keySerializer(); + keySerializer.BeginSerialize(ms); + } + if (ValueHasObjects()) + { + valueSerializer = SerializerSettings.valueSerializer(); + valueSerializer.BeginSerialize(ms); + } + + + for (int i=start/recordSize; iAddress = pos; + key_address->Size = (int)(ms.Position - pos); + addr.Add((long)key_address); + } + + if (ValueHasObjects() && !src[i].info.Tombstone) + { + long pos = ms.Position; + valueSerializer.Serialize(ref src[i].value); + var value_address = GetValueAddressInfo((long)(buffer.aligned_pointer + i * recordSize)); + value_address->Address = pos; + value_address->Size = (int)(ms.Position - pos); + addr.Add((long)value_address); + } + } + + if (ms.Position > ObjectBlockSize || i == (end / recordSize) - 1) + { + var memoryStreamLength = (int)ms.Position; + + var _objBuffer = bufferPool.Get(memoryStreamLength); + + asyncResult.done = new AutoResetEvent(false); + + var _alignedLength = (memoryStreamLength + (sectorSize - 1)) & ~(sectorSize - 1); + + var _objAddr = Interlocked.Add(ref localSegmentOffsets[(long)(alignedDestinationAddress >> LogSegmentSizeBits) % SegmentBufferSize], _alignedLength) - _alignedLength; + + if (KeyHasObjects()) + keySerializer.EndSerialize(); + if (ValueHasObjects()) + valueSerializer.EndSerialize(); + + ms.Close(); + + fixed (void* src_ = ms.GetBuffer()) + Buffer.MemoryCopy(src_, _objBuffer.aligned_pointer, memoryStreamLength, memoryStreamLength); + + foreach (var address in addr) + ((AddressInfo*)address)->Address += _objAddr; + + + if (i < (end / recordSize) - 1) + { + ms = new MemoryStream(); + if (KeyHasObjects()) + keySerializer.BeginSerialize(ms); + if (ValueHasObjects()) + valueSerializer.BeginSerialize(ms); + + objlogDevice.WriteAsync( + (IntPtr)_objBuffer.aligned_pointer, + (int)(alignedDestinationAddress >> LogSegmentSizeBits), + (ulong)_objAddr, (uint)_alignedLength, AsyncFlushPartialObjectLogCallback, asyncResult); + + // Wait for write to complete before resuming next write + asyncResult.done.WaitOne(); + _objBuffer.Return(); + } + else + { + // need to write both page and object cache + Interlocked.Increment(ref asyncResult.count); + + asyncResult.freeBuffer2 = _objBuffer; + objlogDevice.WriteAsync( + 
(IntPtr)_objBuffer.aligned_pointer, + (int)(alignedDestinationAddress >> LogSegmentSizeBits), + (ulong)_objAddr, (uint)_alignedLength, callback, asyncResult); + } + } + } + + if (asyncResult.partial) + { + var aligned_end = (int)((asyncResult.untilAddress - (asyncResult.page << LogPageSizeBits))); + aligned_end = ((aligned_end + (sectorSize - 1)) & ~(sectorSize - 1)); + numBytesToWrite = (uint)(aligned_end - aligned_start); + } + + var alignedNumBytesToWrite = (uint)((numBytesToWrite + (sectorSize - 1)) & ~(sectorSize - 1)); + + // Finally write the hlog page + device.WriteAsync((IntPtr)buffer.aligned_pointer + aligned_start, alignedDestinationAddress + (ulong)aligned_start, + alignedNumBytesToWrite, callback, asyncResult); + } + + private void AsyncReadPageCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + // Set the page status to flushed + var result = (PageAsyncReadResult)Overlapped.Unpack(overlap).AsyncResult; + + result.handle.Signal(); + + Overlapped.Free(overlap); + } + + protected override void ReadAsync( + ulong alignedSourceAddress, int destinationPageIndex, uint aligned_read_length, + IOCompletionCallback callback, PageAsyncReadResult asyncResult, IDevice device, IDevice objlogDevice) + { + asyncResult.freeBuffer1 = bufferPool.Get((int)aligned_read_length); + asyncResult.freeBuffer1.required_bytes = (int)aligned_read_length; + + if (!(KeyHasObjects() || ValueHasObjects())) + { + device.ReadAsync(alignedSourceAddress, (IntPtr)asyncResult.freeBuffer1.aligned_pointer, + aligned_read_length, callback, asyncResult); + return; + } + + asyncResult.callback = callback; + + if (objlogDevice == null) + { + Debug.Assert(objectLogDevice != null); + objlogDevice = objectLogDevice; + } + asyncResult.objlogDevice = objlogDevice; + + device.ReadAsync(alignedSourceAddress, (IntPtr)asyncResult.freeBuffer1.aligned_pointer, + aligned_read_length, AsyncReadPageWithObjectsCallback, asyncResult); + } + + + /// + /// IOCompletion callback for page flush + /// + /// + /// + /// + private void AsyncFlushPartialObjectLogCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + // Set the page status to flushed + PageAsyncFlushResult result = (PageAsyncFlushResult)Overlapped.Unpack(overlap).AsyncResult; + result.done.Set(); + + Overlapped.Free(overlap); + } + + private void AsyncReadPageWithObjectsCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + PageAsyncReadResult result = (PageAsyncReadResult)Overlapped.Unpack(overlap).AsyncResult; + + Record[] src; + + // We are reading into a frame + if (result.frame != null) + { + var frame = (GenericFrame)result.frame; + src = frame.GetPage(result.page % frame.frameSize); + } + else + src = values[result.page % BufferSize]; + + + // Deserialize all objects until untilptr + if (result.resumePtr < result.untilPtr) + { + MemoryStream ms = new MemoryStream(result.freeBuffer2.buffer); + ms.Seek(result.freeBuffer2.offset, SeekOrigin.Begin); + Deserialize(result.freeBuffer1.GetValidPointer(), result.resumePtr, result.untilPtr, src, ms); + ms.Dispose(); + + result.freeBuffer2.Return(); + result.freeBuffer2 = null; + result.resumePtr = result.untilPtr; + } + + 
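    // Control-flow note: this callback re-issues itself. Each pass deserializes the
    // object-log chunk fetched so far, then asks GetObjectInfo for the next window of
    // at most ObjectBlockSize bytes and reads it from the object log, repeating until
    // untilPtr reaches maxPtr, at which point the original page-read callback fires.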
// If we have processed entire page, return + if (result.untilPtr >= result.maxPtr) + { + result.Free(); + + // Call the "real" page read callback + result.callback(errorCode, numBytes, overlap); + return; + } + + // We will be re-issuing I/O, so free current overlap + Overlapped.Free(overlap); + + // We will now be able to process all records until (but not including) untilPtr + GetObjectInfo(result.freeBuffer1.GetValidPointer(), ref result.untilPtr, result.maxPtr, ObjectBlockSize, out long startptr, out long size); + + // Object log fragment should be aligned by construction + Debug.Assert(startptr % sectorSize == 0); + + if (size > int.MaxValue) + throw new Exception("Unable to read object page, total size greater than 2GB: " + size); + + var alignedLength = (size + (sectorSize - 1)) & ~(sectorSize - 1); + var objBuffer = bufferPool.Get((int)alignedLength); + result.freeBuffer2 = objBuffer; + + // Request objects from objlog + result.objlogDevice.ReadAsync( + (int)(result.page >> (LogSegmentSizeBits - LogPageSizeBits)), + (ulong)startptr, + (IntPtr)objBuffer.aligned_pointer, (uint)alignedLength, AsyncReadPageWithObjectsCallback, result); + } + + /// + /// Invoked by users to obtain a record from disk. It uses sector aligned memory to read + /// the record efficiently into memory. + /// + /// + /// + /// + /// + /// + protected override void AsyncReadRecordObjectsToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext context, SectorAlignedMemory result = default(SectorAlignedMemory)) + { + ulong fileOffset = (ulong)(AlignedPageSizeBytes * (fromLogical >> LogPageSizeBits) + (fromLogical & PageSizeMask)); + ulong alignedFileOffset = (ulong)(((long)fileOffset / sectorSize) * sectorSize); + + uint alignedReadLength = (uint)((long)fileOffset + numBytes - (long)alignedFileOffset); + alignedReadLength = (uint)((alignedReadLength + (sectorSize - 1)) & ~(sectorSize - 1)); + + var record = bufferPool.Get((int)alignedReadLength); + record.valid_offset = (int)(fileOffset - alignedFileOffset); + record.available_bytes = (int)(alignedReadLength - (fileOffset - alignedFileOffset)); + record.required_bytes = numBytes; + + var asyncResult = default(AsyncGetFromDiskResult>); + asyncResult.context = context; + asyncResult.context.record = result; + asyncResult.context.objBuffer = record; + objectLogDevice.ReadAsync( + (int)(context.logicalAddress >> LogSegmentSizeBits), + alignedFileOffset, + (IntPtr)asyncResult.context.objBuffer.aligned_pointer, + alignedReadLength, + callback, + asyncResult); + } + + /// + /// Read pages from specified device + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + internal void AsyncReadPagesFromDeviceToFrame( + long readPageStart, + int numPages, + long untilAddress, + IOCompletionCallback callback, + TContext context, + GenericFrame frame, + out CountdownEvent completed, + long devicePageOffset = 0, + IDevice device = null, IDevice objectLogDevice = null) + { + var usedDevice = device; + IDevice usedObjlogDevice = objectLogDevice; + + if (device == null) + { + usedDevice = this.device; + } + + completed = new CountdownEvent(numPages); + for (long readPage = readPageStart; readPage < (readPageStart + numPages); readPage++) + { + int pageIndex = (int)(readPage % frame.frameSize); + if (frame.GetPage(pageIndex) == null) + { + frame.Allocate(pageIndex); + } + else + { + frame.Clear(pageIndex); + } + var asyncResult = new PageAsyncReadResult() + { + page = readPage, + context = context, + handle = completed, + maxPtr = 
PageSize, + frame = frame, + }; + + ulong offsetInFile = (ulong)(AlignedPageSizeBytes * readPage); + uint readLength = (uint)AlignedPageSizeBytes; + long adjustedUntilAddress = (AlignedPageSizeBytes * (untilAddress >> LogPageSizeBits) + (untilAddress & PageSizeMask)); + + if (adjustedUntilAddress > 0 && ((adjustedUntilAddress - (long)offsetInFile) < PageSize)) + { + readLength = (uint)(adjustedUntilAddress - (long)offsetInFile); + asyncResult.maxPtr = readLength; + readLength = (uint)((readLength + (sectorSize - 1)) & ~(sectorSize - 1)); + } + + if (device != null) + offsetInFile = (ulong)(AlignedPageSizeBytes * (readPage - devicePageOffset)); + + ReadAsync(offsetInFile, pageIndex, readLength, callback, asyncResult, usedDevice, usedObjlogDevice); + } + } + + + #region Page handlers for objects + /// + /// Deseialize part of page from stream + /// + /// + /// From pointer + /// Until pointer + /// + /// Stream + public void Deserialize(byte *raw, long ptr, long untilptr, Record[] src, Stream stream) + { + IObjectSerializer keySerializer = null; + IObjectSerializer valueSerializer = null; + + long streamStartPos = stream.Position; + long start_addr = -1; + if (KeyHasObjects()) + { + keySerializer = SerializerSettings.keySerializer(); + keySerializer.BeginDeserialize(stream); + } + if (ValueHasObjects()) + { + valueSerializer = SerializerSettings.valueSerializer(); + valueSerializer.BeginDeserialize(stream); + } + + while (ptr < untilptr) + { + ref Record record = ref Unsafe.AsRef>(raw + ptr); + src[ptr / recordSize].info = record.info; + + if (!record.info.Invalid) + { + if (KeyHasObjects()) + { + var key_addr = GetKeyAddressInfo((long)raw + ptr); + if (start_addr == -1) start_addr = key_addr->Address; + if (stream.Position != streamStartPos + key_addr->Address - start_addr) + { + stream.Seek(streamStartPos + key_addr->Address - start_addr, SeekOrigin.Begin); + } + + src[ptr / recordSize].key = new Key(); + keySerializer.Deserialize(ref src[ptr/recordSize].key); + } + else + { + src[ptr / recordSize].key = record.key; + } + + if (!record.info.Tombstone) + { + if (ValueHasObjects()) + { + var value_addr = GetValueAddressInfo((long)raw + ptr); + if (start_addr == -1) start_addr = value_addr->Address; + if (stream.Position != streamStartPos + value_addr->Address - start_addr) + { + stream.Seek(streamStartPos + value_addr->Address - start_addr, SeekOrigin.Begin); + } + + src[ptr / recordSize].value = new Value(); + valueSerializer.Deserialize(ref src[ptr / recordSize].value); + } + else + { + src[ptr / recordSize].value = record.value; + } + } + } + ptr += GetRecordSize(ptr); + } + if (KeyHasObjects()) + { + keySerializer.EndDeserialize(); + } + if (ValueHasObjects()) + { + valueSerializer.EndDeserialize(); + } + } + + /// + /// Get location and range of object log addresses for specified log page + /// + /// + /// + /// + /// + /// + /// + public void GetObjectInfo(byte* raw, ref long ptr, long untilptr, int objectBlockSize, out long startptr, out long size) + { + long minObjAddress = long.MaxValue; + long maxObjAddress = long.MinValue; + + while (ptr < untilptr) + { + ref Record record = ref Unsafe.AsRef>(raw + ptr); + + if (!record.info.Invalid) + { + if (KeyHasObjects()) + { + var key_addr = GetKeyAddressInfo((long)raw + ptr); + var addr = key_addr->Address; + + // If object pointer is greater than kObjectSize from starting object pointer + if (minObjAddress != long.MaxValue && (addr - minObjAddress > objectBlockSize)) + { + break; + } + + if (addr < minObjAddress) minObjAddress = addr; + 
addr += key_addr->Size; + if (addr > maxObjAddress) maxObjAddress = addr; + } + + + if (ValueHasObjects() && !record.info.Tombstone) + { + var value_addr = GetValueAddressInfo((long)raw + ptr); + var addr = value_addr->Address; + + // If object pointer is greater than kObjectSize from starting object pointer + if (minObjAddress != long.MaxValue && (addr - minObjAddress > objectBlockSize)) + { + break; + } + + if (addr < minObjAddress) minObjAddress = addr; + addr += value_addr->Size; + if (addr > maxObjAddress) maxObjAddress = addr; + } + } + ptr += GetRecordSize(ptr); + } + + // Handle the case where no objects are to be written + if (minObjAddress == long.MaxValue && maxObjAddress == long.MinValue) + { + minObjAddress = 0; + maxObjAddress = 0; + } + + startptr = minObjAddress; + size = maxObjAddress - minObjAddress; + } + + /// + /// Retrieve objects from object log + /// + /// + /// + /// + protected override bool RetrievedFullRecord(byte* record, ref AsyncIOContext ctx) + { + if (!KeyHasObjects()) + { + ShallowCopy(ref Unsafe.AsRef>(record).key, ref ctx.key); + } + if (!ValueHasObjects()) + { + ShallowCopy(ref Unsafe.AsRef>(record).value, ref ctx.value); + } + + if (!(KeyHasObjects() || ValueHasObjects())) + return true; + + if (ctx.objBuffer == null) + { + // Issue IO for objects + long startAddress = -1; + long endAddress = -1; + if (KeyHasObjects()) + { + var x = GetKeyAddressInfo((long)record); + startAddress = x->Address; + endAddress = x->Address + x->Size; + } + + if (ValueHasObjects() && !GetInfoFromBytePointer(record).Tombstone) + { + var x = GetValueAddressInfo((long)record); + if (startAddress == -1) + startAddress = x->Address; + endAddress = x->Address + x->Size; + } + + // We are limited to a 2GB size per key-value + if (endAddress-startAddress > int.MaxValue) + throw new Exception("Size of key-value exceeds max of 2GB: " + (endAddress - startAddress)); + + AsyncGetFromDisk(startAddress, (int)(endAddress - startAddress), ctx, ctx.record); + return false; + } + + // Parse the key and value objects + MemoryStream ms = new MemoryStream(ctx.objBuffer.buffer); + ms.Seek(ctx.objBuffer.offset + ctx.objBuffer.valid_offset, SeekOrigin.Begin); + + if (KeyHasObjects()) + { + ctx.key = new Key(); + + var keySerializer = SerializerSettings.keySerializer(); + keySerializer.BeginDeserialize(ms); + keySerializer.Deserialize(ref ctx.key); + keySerializer.EndDeserialize(); + } + + if (ValueHasObjects() && !GetInfoFromBytePointer(record).Tombstone) + { + ctx.value = new Value(); + + var valueSerializer = SerializerSettings.valueSerializer(); + valueSerializer.BeginDeserialize(ms); + valueSerializer.Deserialize(ref ctx.value); + valueSerializer.EndDeserialize(); + } + + ctx.objBuffer.Return(); + return true; + } + + /// + /// Whether KVS has keys to serialize/deserialize + /// + /// + public override bool KeyHasObjects() + { + return SerializerSettings.keySerializer != null; + } + + /// + /// Whether KVS has values to serialize/deserialize + /// + /// + public override bool ValueHasObjects() + { + return SerializerSettings.valueSerializer != null; + } + #endregion + + public override IHeapContainer GetKeyContainer(ref Key key) => new StandardHeapContainer(ref key); + public override IHeapContainer GetValueContainer(ref Value value) => new StandardHeapContainer(ref value); + + public override long[] GetSegmentOffsets() + { + return segmentOffsets; + } + + internal override void PopulatePage(byte* src, int required_bytes, long destinationPage) + { + PopulatePage(src, required_bytes, ref 
values[destinationPage % BufferSize]); + } + + internal void PopulatePageFrame(byte* src, int required_bytes, Record[] frame) + { + PopulatePage(src, required_bytes, ref frame); + } + + internal void PopulatePage(byte* src, int required_bytes, ref Record[] destinationPage) + { + fixed (RecordInfo* pin = &destinationPage[0].info) + { + Debug.Assert(required_bytes <= recordSize * destinationPage.Length); + + Buffer.MemoryCopy(src, Unsafe.AsPointer(ref destinationPage[0]), required_bytes, required_bytes); + } + } + + /// + /// Iterator interface for scanning FASTER log + /// + /// + /// + /// + /// + public override IFasterScanIterator Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode) + { + return new GenericScanIterator(this, beginAddress, endAddress, scanBufferingMode); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Allocator/GenericFrame.cs b/ZeroLevel/Services/FASTER/Allocator/GenericFrame.cs new file mode 100644 index 0000000..acb502b --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/GenericFrame.cs @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; + +namespace FASTER.core +{ + /// + /// A frame is an in-memory circular buffer of log pages + /// + internal sealed class GenericFrame : IDisposable + { + private readonly Record[][] frame; + public readonly int frameSize, pageSize; + private readonly int recordSize = Utility.GetSize(default(Record)); + + public GenericFrame(int frameSize, int pageSize) + { + this.frameSize = frameSize; + this.pageSize = pageSize; + frame = new Record[frameSize][]; + } + + public void Allocate(int index) + { + Record[] tmp; + if (pageSize % recordSize == 0) + tmp = new Record[pageSize / recordSize]; + else + tmp = new Record[1 + (pageSize / recordSize)]; + Array.Clear(tmp, 0, tmp.Length); + frame[index] = tmp; + } + + public void Clear(int pageIndex) + { + Array.Clear(frame[pageIndex], 0, frame[pageIndex].Length); + } + + public ref Key GetKey(long frameNumber, long offset) + { + return ref frame[frameNumber][offset].key; + } + + public ref Value GetValue(long frameNumber, long offset) + { + return ref frame[frameNumber][offset].value; + } + + public ref RecordInfo GetInfo(long frameNumber, long offset) + { + return ref frame[frameNumber][offset].info; + } + + public ref Record[] GetPage(long frameNumber) + { + return ref frame[frameNumber]; + } + + public void Dispose() + { + Array.Clear(frame, 0, frame.Length); + } + } +} + + diff --git a/ZeroLevel/Services/FASTER/Allocator/GenericScanIterator.cs b/ZeroLevel/Services/FASTER/Allocator/GenericScanIterator.cs new file mode 100644 index 0000000..8cb7e1e --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/GenericScanIterator.cs @@ -0,0 +1,255 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
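GenericAllocator above refuses to construct when a non-blittable Key or Value arrives without a serializer factory, and its write and read paths drive that serializer through the Begin/Serialize/End and Begin/Deserialize/End sequences. A rough sketch of a matching serializer, assuming IObjectSerializer<T> exposes exactly the six members exercised above (the Payload type and PayloadSerializer name are hypothetical):

    using System.IO;
    using FASTER.core;

    // Hypothetical object payload with a parameterless constructor, since the allocator
    // news up instances before calling Deserialize.
    public class Payload
    {
        public int Id;
        public string Name = string.Empty;
    }

    // Sketch of a serializer driven by GenericAllocator: BeginSerialize/BeginDeserialize
    // receive the stream, Serialize/Deserialize handle one object, End* closes the pass.
    public class PayloadSerializer : IObjectSerializer<Payload>
    {
        private BinaryWriter writer;
        private BinaryReader reader;

        public void BeginSerialize(Stream stream) => writer = new BinaryWriter(stream);
        public void Serialize(ref Payload obj)
        {
            writer.Write(obj.Id);
            writer.Write(obj.Name ?? string.Empty);
        }
        public void EndSerialize() => writer.Flush();

        public void BeginDeserialize(Stream stream) => reader = new BinaryReader(stream);
        public void Deserialize(ref Payload obj)
        {
            obj.Id = reader.ReadInt32();
            obj.Name = reader.ReadString();
        }
        public void EndDeserialize() { }
    }
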
+ +using System; +using System.Threading; +using System.Diagnostics; + +namespace FASTER.core +{ + /// + /// Scan iterator for hybrid log + /// + public class GenericScanIterator : IFasterScanIterator + where Key : new() + where Value : new() + { + private readonly int frameSize; + private readonly GenericAllocator hlog; + private readonly long beginAddress, endAddress; + private readonly GenericFrame frame; + private readonly CountdownEvent[] loaded; + private readonly int recordSize; + + private bool first = true; + private long currentAddress, nextAddress; + private Key currentKey; + private Value currentValue; + + /// + /// Current address + /// + public long CurrentAddress => currentAddress; + + /// + /// Constructor + /// + /// + /// + /// + /// + public unsafe GenericScanIterator(GenericAllocator hlog, long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode) + { + this.hlog = hlog; + + if (beginAddress == 0) + beginAddress = hlog.GetFirstValidLogicalAddress(0); + + this.beginAddress = beginAddress; + this.endAddress = endAddress; + + recordSize = hlog.GetRecordSize(0); + currentAddress = -1; + nextAddress = beginAddress; + + if (scanBufferingMode == ScanBufferingMode.SinglePageBuffering) + frameSize = 1; + else if (scanBufferingMode == ScanBufferingMode.DoublePageBuffering) + frameSize = 2; + else if (scanBufferingMode == ScanBufferingMode.NoBuffering) + { + frameSize = 0; + return; + } + + frame = new GenericFrame(frameSize, hlog.PageSize); + loaded = new CountdownEvent[frameSize]; + + // Only load addresses flushed to disk + if (nextAddress < hlog.HeadAddress) + { + var frameNumber = (nextAddress >> hlog.LogPageSizeBits) % frameSize; + hlog.AsyncReadPagesFromDeviceToFrame + (nextAddress >> hlog.LogPageSizeBits, + 1, endAddress, AsyncReadPagesCallback, Empty.Default, + frame, out loaded[frameNumber]); + } + } + + /// + /// Gets reference to current key + /// + /// + public ref Key GetKey() + { + return ref currentKey; + } + + /// + /// Gets reference to current value + /// + /// + public ref Value GetValue() + { + return ref currentValue; + } + + /// + /// Get next record in iterator + /// + /// + /// + public bool GetNext(out RecordInfo recordInfo) + { + recordInfo = default(RecordInfo); + currentKey = default(Key); + currentValue = default(Value); + + currentAddress = nextAddress; + while (true) + { + // Check for boundary conditions + if (currentAddress >= endAddress) + { + return false; + } + + if (currentAddress < hlog.BeginAddress) + { + throw new Exception("Iterator address is less than log BeginAddress " + hlog.BeginAddress); + } + + if (frameSize == 0 && currentAddress < hlog.HeadAddress) + { + throw new Exception("Iterator address is less than log HeadAddress in memory-scan mode"); + } + + var currentPage = currentAddress >> hlog.LogPageSizeBits; + + var offset = (currentAddress & hlog.PageSizeMask) / recordSize; + + if (currentAddress < hlog.HeadAddress) + BufferAndLoad(currentAddress, currentPage, currentPage % frameSize); + + // Check if record fits on page, if not skip to next page + if ((currentAddress & hlog.PageSizeMask) + recordSize > hlog.PageSize) + { + currentAddress = (1 + (currentAddress >> hlog.LogPageSizeBits)) << hlog.LogPageSizeBits; + continue; + } + + if (currentAddress >= hlog.HeadAddress) + { + // Read record from cached page memory + nextAddress = currentAddress + recordSize; + + var page = currentPage % hlog.BufferSize; + + if (hlog.values[page][offset].info.Invalid) + continue; + + recordInfo = hlog.values[page][offset].info; + 
currentKey = hlog.values[page][offset].key; + currentValue = hlog.values[page][offset].value; + return true; + } + + nextAddress = currentAddress + recordSize; + + var currentFrame = currentPage % frameSize; + + if (frame.GetInfo(currentFrame, offset).Invalid) + continue; + + recordInfo = frame.GetInfo(currentFrame, offset); + currentKey = frame.GetKey(currentFrame, offset); + currentValue = frame.GetValue(currentFrame, offset); + return true; + } + } + + /// + /// Get next record using iterator + /// + /// + /// + /// + /// + public bool GetNext(out RecordInfo recordInfo, out Key key, out Value value) + { + key = default(Key); + value = default(Value); + + if (GetNext(out recordInfo)) + { + key = currentKey; + value = currentValue; + return true; + } + + return false; + } + + private unsafe void BufferAndLoad(long currentAddress, long currentPage, long currentFrame) + { + if (first || (currentAddress & hlog.PageSizeMask) == 0) + { + // Prefetch pages based on buffering mode + if (frameSize == 1) + { + if (!first) + { + hlog.AsyncReadPagesFromDeviceToFrame(currentAddress >> hlog.LogPageSizeBits, 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[currentFrame]); + } + } + else + { + var endPage = endAddress >> hlog.LogPageSizeBits; + if ((endPage > currentPage) && + ((endPage > currentPage + 1) || ((endAddress & hlog.PageSizeMask) != 0))) + { + hlog.AsyncReadPagesFromDeviceToFrame(1 + (currentAddress >> hlog.LogPageSizeBits), 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[(currentPage + 1) % frameSize]); + } + } + first = false; + } + loaded[currentFrame].Wait(); + } + + /// + /// Dispose iterator + /// + public void Dispose() + { + if (loaded != null) + for (int i = 0; i < frameSize; i++) + loaded[i]?.Wait(); + + frame?.Dispose(); + } + + private unsafe void AsyncReadPagesCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + var result = (PageAsyncReadResult)Overlapped.Unpack(overlap).AsyncResult; + + if (result.freeBuffer1 != null) + { + hlog.PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, ref frame.GetPage(result.page % frame.frameSize)); + result.freeBuffer1.Return(); + } + + if (result.handle != null) + { + result.handle.Signal(); + } + + Interlocked.MemoryBarrier(); + Overlapped.Free(overlap); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Allocator/IFasterScanIterator.cs b/ZeroLevel/Services/FASTER/Allocator/IFasterScanIterator.cs new file mode 100644 index 0000000..4275ee4 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/IFasterScanIterator.cs @@ -0,0 +1,69 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
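Both iterators size their on-disk staging frame from the ScanBufferingMode declared just below: one page for SinglePageBuffering, two for DoublePageBuffering, and none for NoBuffering, which is why a NoBuffering scan throws once it drops below HeadAddress. A small sketch of choosing a mode from where the scan range starts (headAddress here stands in for whatever the surrounding store reports as its in-memory boundary):

    using FASTER.core;

    static class BufferingModeSketch
    {
        // Pick a buffering mode for a scan starting at 'begin'; 'headAddress' marks the
        // boundary below which records live only on disk.
        static ScanBufferingMode ChooseMode(long begin, long headAddress)
        {
            // Entirely in memory: no staging frame is needed.
            if (begin >= headAddress)
                return ScanBufferingMode.NoBuffering;

            // Disk pages involved: double buffering lets the next page load while
            // the current one is being consumed.
            return ScanBufferingMode.DoublePageBuffering;
        }
    }
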
+ +using System; + +namespace FASTER.core +{ + /// + /// Scan buffering mode + /// + public enum ScanBufferingMode + { + /// + /// Buffer only current page being scanned + /// + SinglePageBuffering, + + /// + /// Buffer current and next page in scan sequence + /// + DoublePageBuffering, + + /// + /// Do not buffer - with this mode, you can only scan records already in main memory + /// + NoBuffering + } + + /// + /// Scan iterator interface for FASTER log + /// + /// + /// + public interface IFasterScanIterator : IDisposable + { + /// + /// Gets reference to current key + /// + /// + ref Key GetKey(); + + /// + /// Gets reference to current value + /// + /// + ref Value GetValue(); + + /// + /// Get next record + /// + /// + /// True if record found, false if end of scan + bool GetNext(out RecordInfo recordInfo); + + /// + /// Get next record + /// + /// + /// + /// + /// True if record found, false if end of scan + bool GetNext(out RecordInfo recordInfo, out Key key, out Value value); + + /// + /// Current address + /// + long CurrentAddress { get; } + } +} diff --git a/ZeroLevel/Services/FASTER/Allocator/MallocFixedPageSize.cs b/ZeroLevel/Services/FASTER/Allocator/MallocFixedPageSize.cs new file mode 100644 index 0000000..846551c --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/MallocFixedPageSize.cs @@ -0,0 +1,656 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma warning disable 0162 + +#define CALLOC + +using System; +using System.Collections.Generic; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Memory allocator for objects + /// + /// + public unsafe class MallocFixedPageSize : IDisposable + { + private const bool ForceUnpinnedAllocation = false; + + private const int PageSizeBits = 16; + private const int PageSize = 1 << PageSizeBits; + private const int PageSizeMask = PageSize - 1; + private const int LevelSizeBits = 12; + private const int LevelSize = 1 << LevelSizeBits; + private const int LevelSizeMask = LevelSize - 1; + + private T[][] values = new T[LevelSize][]; + private GCHandle[] handles = new GCHandle[LevelSize]; + private IntPtr[] pointers = new IntPtr[LevelSize]; + + private T[] values0; + private readonly GCHandle handles0; + private readonly IntPtr pointers0; + private readonly int RecordSize; + private readonly int AlignedPageSize; + + private volatile int writeCacheLevel; + + private volatile int count; + + private readonly bool IsPinned; + private readonly bool ReturnPhysicalAddress; + + private CountdownEvent checkpointEvent; + + private readonly LightEpoch epoch; + private readonly bool ownedEpoch; + + private FastThreadLocal> freeList; + + /// + /// Create new instance + /// + /// + /// + public MallocFixedPageSize(bool returnPhysicalAddress = false, LightEpoch epoch = null) + { + freeList = new FastThreadLocal>(); + if (epoch == null) + { + this.epoch = new LightEpoch(); + ownedEpoch = true; + } + else + this.epoch = epoch; + + values[0] = new T[PageSize]; + +#if !(CALLOC) + Array.Clear(values[0], 0, PageSize); +#endif + ReturnPhysicalAddress = returnPhysicalAddress; + + if (ForceUnpinnedAllocation) + { + IsPinned = false; + ReturnPhysicalAddress = false; + } + else + { + IsPinned = true; + if (default(T) == null) + { + IsPinned = false; + ReturnPhysicalAddress = false; + } + else + { + // The surefire way to check if a type is blittable + // it to try GCHandle.Alloc with a handle type of Pinned. 
+ // If it throws an exception, we know the type is not blittable. + try + { + handles[0] = GCHandle.Alloc(values[0], GCHandleType.Pinned); + pointers[0] = handles[0].AddrOfPinnedObject(); + handles0 = handles[0]; + pointers0 = pointers[0]; + RecordSize = Marshal.SizeOf(values[0][0]); + AlignedPageSize = RecordSize * PageSize; + } + catch (Exception) + { + IsPinned = false; + ReturnPhysicalAddress = false; + } + } + } + + values0 = values[0]; + writeCacheLevel = -1; + Interlocked.MemoryBarrier(); + + BulkAllocate(); // null pointer + } + + /// + /// Get physical address + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public long GetPhysicalAddress(long address) + { + if (ReturnPhysicalAddress) + { + return address; + } + else + { + return + (long)pointers[address >> PageSizeBits] + + (long)(address & PageSizeMask) * RecordSize; + } + } + + /// + /// Get object + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public ref T Get(long index) + { + if (this.ReturnPhysicalAddress) + throw new Exception("Physical pointer returned by allocator: de-reference pointer to get records instead of calling Get"); + + return ref values + [index >> PageSizeBits] + [index & PageSizeMask]; + } + + + /// + /// Set object + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Set(long index, ref T value) + { + if (this.ReturnPhysicalAddress) + throw new Exception("Physical pointer returned by allocator: de-reference pointer to set records instead of calling Set (otherwise, set ForceUnpinnedAllocation to true)"); + + values + [index >> PageSizeBits] + [index & PageSizeMask] + = value; + } + + + + /// + /// Free object + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void FreeAtEpoch(long pointer, int removed_epoch = -1) + { + if (!ReturnPhysicalAddress) + { + values[pointer >> PageSizeBits][pointer & PageSizeMask] = default(T); + } + + freeList.InitializeThread(); + + if (freeList.Value == null) + freeList.Value = new Queue(); + freeList.Value.Enqueue(new FreeItem { removed_item = pointer, removal_epoch = removed_epoch }); + } + + private const int kAllocateChunkSize = 16; + + /// + /// Warning: cannot mix 'n' match use of + /// Allocate and BulkAllocate + /// + /// + public long BulkAllocate() + { + // Determine insertion index. + // ReSharper disable once CSharpWarnings::CS0420 +#pragma warning disable 420 + int index = Interlocked.Add(ref count, kAllocateChunkSize) - kAllocateChunkSize; +#pragma warning restore 420 + + int offset = index & PageSizeMask; + int baseAddr = index >> PageSizeBits; + + // Handle indexes in first batch specially because they do not use write cache. + if (baseAddr == 0) + { + // If index 0, then allocate space for next level. + if (index == 0) + { + var tmp = new T[PageSize]; +#if !(CALLOC) + Array.Clear(tmp, 0, PageSize); +#endif + + if (IsPinned) + { + handles[1] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + pointers[1] = handles[1].AddrOfPinnedObject(); + } + values[1] = tmp; + Interlocked.MemoryBarrier(); + } + + // Return location. + if (ReturnPhysicalAddress) + return (((long)pointers0) + index * RecordSize); + else + return index; + } + + // See if write cache contains corresponding array. + var cache = writeCacheLevel; + T[] array; + + if (cache != -1) + { + // Write cache is correct array only if index is within [arrayCapacity, 2*arrayCapacity). + if (cache == baseAddr) + { + // Return location. 
+ if (ReturnPhysicalAddress) + return ((long)pointers[baseAddr]) + (long)offset * RecordSize; + else + return index; + } + } + + // Write cache did not work, so get level information from index. + // int level = GetLevelFromIndex(index); + + // Spin-wait until level has an allocated array. + var spinner = new SpinWait(); + while (true) + { + array = values[baseAddr]; + if (array != null) + { + break; + } + spinner.SpinOnce(); + } + + // Perform extra actions if inserting at offset 0 of level. + if (offset == 0) + { + // Update write cache to point to current level. + writeCacheLevel = baseAddr; + Interlocked.MemoryBarrier(); + + // Allocate for next page + int newBaseAddr = baseAddr + 1; + var tmp = new T[PageSize]; + +#if !(CALLOC) + Array.Clear(tmp, 0, PageSize); +#endif + + if (IsPinned) + { + handles[newBaseAddr] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + pointers[newBaseAddr] = handles[newBaseAddr].AddrOfPinnedObject(); + } + values[newBaseAddr] = tmp; + + Interlocked.MemoryBarrier(); + } + + // Return location. + if (ReturnPhysicalAddress) + return ((long)pointers[baseAddr]) + (long)offset * RecordSize; + else + return index; + } + + /// + /// Allocate + /// + /// + public long Allocate() + { + freeList.InitializeThread(); + if (freeList.Value == null) + { + freeList.Value = new Queue(); + } + if (freeList.Value.Count > 0) + { + if (freeList.Value.Peek().removal_epoch <= epoch.SafeToReclaimEpoch) + return freeList.Value.Dequeue().removed_item; + + //if (freeList.Count % 64 == 0) + // LightEpoch.Instance.BumpCurrentEpoch(); + } + + // Determine insertion index. + // ReSharper disable once CSharpWarnings::CS0420 +#pragma warning disable 420 + int index = Interlocked.Increment(ref count) - 1; +#pragma warning restore 420 + + int offset = index & PageSizeMask; + int baseAddr = index >> PageSizeBits; + + // Handle indexes in first batch specially because they do not use write cache. + if (baseAddr == 0) + { + // If index 0, then allocate space for next level. + if (index == 0) + { + var tmp = new T[PageSize]; + +#if !(CALLOC) + Array.Clear(tmp, 0, PageSize); +#endif + + if (IsPinned) + { + handles[1] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + pointers[1] = handles[1].AddrOfPinnedObject(); + } + values[1] = tmp; + Interlocked.MemoryBarrier(); + } + + // Return location. + if (ReturnPhysicalAddress) + return ((long)pointers0) + index * RecordSize; + else + return index; + } + + // See if write cache contains corresponding array. + var cache = writeCacheLevel; + T[] array; + + if (cache != -1) + { + // Write cache is correct array only if index is within [arrayCapacity, 2*arrayCapacity). + if (cache == baseAddr) + { + // Return location. + if (ReturnPhysicalAddress) + return ((long)pointers[baseAddr]) + (long)offset * RecordSize; + else + return index; + } + } + + // Write cache did not work, so get level information from index. + // int level = GetLevelFromIndex(index); + + // Spin-wait until level has an allocated array. + var spinner = new SpinWait(); + while (true) + { + array = values[baseAddr]; + if (array != null) + { + break; + } + spinner.SpinOnce(); + } + + // Perform extra actions if inserting at offset 0 of level. + if (offset == 0) + { + // Update write cache to point to current level. 
+ writeCacheLevel = baseAddr; + Interlocked.MemoryBarrier(); + + // Allocate for next page + int newBaseAddr = baseAddr + 1; + var tmp = new T[PageSize]; + +#if !(CALLOC) + Array.Clear(tmp, 0, PageSize); +#endif + + if (IsPinned) + { + handles[newBaseAddr] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + pointers[newBaseAddr] = handles[newBaseAddr].AddrOfPinnedObject(); + } + values[newBaseAddr] = tmp; + + Interlocked.MemoryBarrier(); + } + + // Return location. + if (ReturnPhysicalAddress) + return ((long)pointers[baseAddr]) + (long)offset * RecordSize; + else + return index; + } + + /// + /// Acquire thread + /// + public void Acquire() + { + if (ownedEpoch) + epoch.Acquire(); + freeList.InitializeThread(); + } + + /// + /// Release thread + /// + public void Release() + { + if (ownedEpoch) + epoch.Release(); + freeList.DisposeThread(); + } + + /// + /// Dispose + /// + public void Dispose() + { + for (int i = 0; i < values.Length; i++) + { + if (IsPinned && (handles[i].IsAllocated)) handles[i].Free(); + values[i] = null; + } + handles = null; + pointers = null; + values = null; + values0 = null; + count = 0; + if (ownedEpoch) + epoch.Dispose(); + freeList.Dispose(); + } + + + #region Checkpoint + + /// + /// Public facing persistence API + /// + /// + /// + /// + public void TakeCheckpoint(IDevice device, ulong start_offset, out ulong numBytes) + { + BeginCheckpoint(device, start_offset, out numBytes); + } + + /// + /// Is checkpoint complete + /// + /// + /// + public bool IsCheckpointCompleted(bool waitUntilComplete = false) + { + bool completed = checkpointEvent.IsSet; + if (!completed && waitUntilComplete) + { + checkpointEvent.Wait(); + return true; + } + return completed; + } + + + internal void BeginCheckpoint(IDevice device, ulong offset, out ulong numBytesWritten) + { + int localCount = count; + int recordsCountInLastLevel = localCount & PageSizeMask; + int numCompleteLevels = localCount >> PageSizeBits; + int numLevels = numCompleteLevels + (recordsCountInLastLevel > 0 ? 1 : 0); + checkpointEvent = new CountdownEvent(numLevels); + + uint alignedPageSize = PageSize * (uint)RecordSize; + uint lastLevelSize = (uint)recordsCountInLastLevel * (uint)RecordSize; + + + int sectorSize = (int)device.SectorSize; + numBytesWritten = 0; + for (int i = 0; i < numLevels; i++) + { + OverflowPagesFlushAsyncResult result = default(OverflowPagesFlushAsyncResult); + uint writeSize = (uint)((i == numCompleteLevels) ? 
(lastLevelSize + (sectorSize - 1)) & ~(sectorSize - 1) : alignedPageSize); + + device.WriteAsync(pointers[i], offset + numBytesWritten, writeSize, AsyncFlushCallback, result); + numBytesWritten += writeSize; + } + } + + private void AsyncFlushCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + try + { + if (errorCode != 0) + { + System.Diagnostics.Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + } + catch (Exception ex) + { + System.Diagnostics.Trace.TraceError("Completion Callback error, {0}", ex.Message); + } + finally + { + checkpointEvent.Signal(); + Overlapped.Free(overlap); + } + } + + /// + /// Max valid address + /// + /// + public int GetMaxValidAddress() + { + return count; + } + + /// + /// Get page size + /// + /// + public int GetPageSize() + { + return PageSize; + } + #endregion + + #region Recover + /// + /// Recover + /// + /// + /// + /// + /// + public void Recover(IDevice device, ulong offset, int buckets, ulong numBytes) + { + BeginRecovery(device, offset, buckets, numBytes, out ulong numBytesRead); + } + + /// + /// Check if recovery complete + /// + /// + /// + public bool IsRecoveryCompleted(bool waitUntilComplete = false) + { + bool completed = (numLevelsToBeRecovered == 0); + if (!completed && waitUntilComplete) + { + while (numLevelsToBeRecovered != 0) + { + Thread.Sleep(10); + } + } + return completed; + } + + // Implementation of asynchronous recovery + private int numLevelsToBeRecovered; + + internal void BeginRecovery(IDevice device, + ulong offset, + int buckets, + ulong numBytesToRead, + out ulong numBytesRead) + { + // Allocate as many records in memory + while (count < buckets) + { + Allocate(); + } + + int numRecords = (int)numBytesToRead / RecordSize; + int recordsCountInLastLevel = numRecords & PageSizeMask; + int numCompleteLevels = numRecords >> PageSizeBits; + int numLevels = numCompleteLevels + (recordsCountInLastLevel > 0 ? 1 : 0); + + numLevelsToBeRecovered = numLevels; + + numBytesRead = 0; + uint alignedPageSize = (uint)PageSize * (uint)RecordSize; + uint lastLevelSize = (uint)recordsCountInLastLevel * (uint)RecordSize; + for (int i = 0; i < numLevels; i++) + { + //read a full page + uint length = (uint)PageSize * (uint)RecordSize; ; + OverflowPagesReadAsyncResult result = default(OverflowPagesReadAsyncResult); + device.ReadAsync(offset + numBytesRead, pointers[i], length, AsyncPageReadCallback, result); + numBytesRead += (i == numCompleteLevels) ? lastLevelSize : alignedPageSize; + } + } + + private void AsyncPageReadCallback( + uint errorCode, + uint numBytes, + NativeOverlapped* overlap) + { + try + { + if (errorCode != 0) + { + System.Diagnostics.Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + } + catch (Exception ex) + { + System.Diagnostics.Trace.TraceError("Completion Callback error, {0}", ex.Message); + } + finally + { + Interlocked.Decrement(ref numLevelsToBeRecovered); + Overlapped.Free(overlap); + } + } + #endregion + } + + internal struct FreeItem + { + public long removed_item; + public int removal_epoch; + } +} diff --git a/ZeroLevel/Services/FASTER/Allocator/PendingFlushList.cs b/ZeroLevel/Services/FASTER/Allocator/PendingFlushList.cs new file mode 100644 index 0000000..0896481 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/PendingFlushList.cs @@ -0,0 +1,56 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
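+
+// A hedged orientation note for this file: PendingFlushList is a small fixed-capacity slot
+// array that parks page-flush results which could not be issued immediately. A minimal usage
+// sketch follows; "res" and "address" are illustrative names assumed to come from the
+// allocator's flush path, not part of this file:
+//
+//   var pending = new PendingFlushList();
+//   pending.Add(res);                                  // CAS into the first free slot, rescanning a bounded
+//                                                      // number of times before throwing
+//   if (pending.RemoveAdjacent(address, out var req))  // pull the pending flush adjacent to this address
+//   {
+//       // the caller re-issues the flush described by "req"
+//   }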
+ +using System; +using System.Threading; + +namespace FASTER.core +{ + class PendingFlushList + { + const int maxSize = 8; + const int maxRetries = 10; + public PageAsyncFlushResult[] list; + + public PendingFlushList() + { + list = new PageAsyncFlushResult[maxSize]; + } + + public void Add(PageAsyncFlushResult t) + { + int retries = 0; + do + { + for (int i = 0; i < maxSize; i++) + { + if (list[i] == default) + { + if (Interlocked.CompareExchange(ref list[i], t, default) == default) + { + return; + } + } + } + } while (retries++ < maxRetries); + throw new Exception("Unable to add item to list"); + } + + public bool RemoveAdjacent(long address, out PageAsyncFlushResult request) + { + for (int i=0; i : AllocatorBase + where Key : new() + where Value : new() + { + public const int kRecordAlignment = 8; // RecordInfo has a long field, so it should be aligned to 8-bytes + + // Circular buffer definition + private byte[][] values; + private GCHandle[] handles; + private long[] pointers; + private readonly GCHandle ptrHandle; + private readonly long* nativePointers; + private readonly bool fixedSizeKey; + private readonly bool fixedSizeValue; + + internal readonly IVariableLengthStruct KeyLength; + internal readonly IVariableLengthStruct ValueLength; + + public VariableLengthBlittableAllocator(LogSettings settings, VariableLengthStructSettings vlSettings, IFasterEqualityComparer comparer, Action evictCallback = null, LightEpoch epoch = null, Action flushCallback = null) + : base(settings, comparer, evictCallback, epoch, flushCallback) + { + values = new byte[BufferSize][]; + handles = new GCHandle[BufferSize]; + pointers = new long[BufferSize]; + + ptrHandle = GCHandle.Alloc(pointers, GCHandleType.Pinned); + nativePointers = (long*)ptrHandle.AddrOfPinnedObject(); + + KeyLength = vlSettings.keyLength; + ValueLength = vlSettings.valueLength; + + if (KeyLength == null) + { + fixedSizeKey = true; + KeyLength = new FixedLengthStruct(); + } + + if (ValueLength == null) + { + fixedSizeValue = true; + ValueLength = new FixedLengthStruct(); + } + } + + public override void Initialize() + { + Initialize(Constants.kFirstValidAddress); + } + + public override ref RecordInfo GetInfo(long physicalAddress) + { + return ref Unsafe.AsRef((void*)physicalAddress); + } + + public override ref RecordInfo GetInfoFromBytePointer(byte* ptr) + { + return ref Unsafe.AsRef(ptr); + } + + public override ref Key GetKey(long physicalAddress) + { + return ref Unsafe.AsRef((byte*)physicalAddress + RecordInfo.GetLength()); + } + + public override ref Value GetValue(long physicalAddress) + { + return ref Unsafe.AsRef((byte*)physicalAddress + RecordInfo.GetLength() + KeySize(physicalAddress)); + } + + private int KeySize(long physicalAddress) + { + return KeyLength.GetLength(ref GetKey(physicalAddress)); + } + + private int ValueSize(long physicalAddress) + { + return ValueLength.GetLength(ref GetValue(physicalAddress)); + } + + public override int GetRecordSize(long physicalAddress) + { + ref var recordInfo = ref GetInfo(physicalAddress); + if (recordInfo.IsNull()) + return RecordInfo.GetLength(); + + var size = RecordInfo.GetLength() + KeySize(physicalAddress) + ValueSize(physicalAddress); + size = (size + kRecordAlignment - 1) & (~(kRecordAlignment - 1)); + return size; + } + + public override int GetRequiredRecordSize(long physicalAddress, int availableBytes) + { + // We need at least [record size] + [average key size] + [average value size] + var reqBytes = GetAverageRecordSize(); + if (availableBytes < reqBytes) + { + 
return reqBytes; + } + + // We need at least [record size] + [actual key size] + [average value size] + reqBytes = RecordInfo.GetLength() + KeySize(physicalAddress) + ValueLength.GetAverageLength(); + if (availableBytes < reqBytes) + { + return reqBytes; + } + + // We need at least [record size] + [actual key size] + [actual value size] + reqBytes = RecordInfo.GetLength() + KeySize(physicalAddress) + ValueSize(physicalAddress); + reqBytes = (reqBytes + kRecordAlignment - 1) & (~(kRecordAlignment - 1)); + return reqBytes; + } + + public override int GetAverageRecordSize() + { + return RecordInfo.GetLength() + + kRecordAlignment + + KeyLength.GetAverageLength() + + ValueLength.GetAverageLength(); + } + + public override int GetInitialRecordSize(ref Key key, ref TInput input) + { + var actualSize = RecordInfo.GetLength() + + KeyLength.GetLength(ref key) + + ValueLength.GetInitialLength(ref input); + + return (actualSize + kRecordAlignment - 1) & (~(kRecordAlignment - 1)); + } + + public override int GetRecordSize(ref Key key, ref Value value) + { + var actualSize = RecordInfo.GetLength() + + KeyLength.GetLength(ref key) + + ValueLength.GetLength(ref value); + + return (actualSize + kRecordAlignment - 1) & (~(kRecordAlignment - 1)); + } + + public override void ShallowCopy(ref Key src, ref Key dst) + { + Buffer.MemoryCopy( + Unsafe.AsPointer(ref src), + Unsafe.AsPointer(ref dst), + KeyLength.GetLength(ref src), + KeyLength.GetLength(ref src)); + } + + public override void ShallowCopy(ref Value src, ref Value dst) + { + Buffer.MemoryCopy( + Unsafe.AsPointer(ref src), + Unsafe.AsPointer(ref dst), + ValueLength.GetLength(ref src), + ValueLength.GetLength(ref src)); + } + + /// + /// Dispose memory allocator + /// + public override void Dispose() + { + if (values != null) + { + for (int i = 0; i < values.Length; i++) + { + if (handles[i].IsAllocated) + handles[i].Free(); + values[i] = null; + } + } + handles = null; + pointers = null; + values = null; + base.Dispose(); + } + + public override AddressInfo* GetKeyAddressInfo(long physicalAddress) + { + throw new NotSupportedException(); + } + + public override AddressInfo* GetValueAddressInfo(long physicalAddress) + { + throw new NotSupportedException(); + } + + /// + /// Allocate memory page, pinned in memory, and in sector aligned form, if possible + /// + /// + internal override void AllocatePage(int index) + { + var adjustedSize = PageSize + 2 * sectorSize; + byte[] tmp = new byte[adjustedSize]; + Array.Clear(tmp, 0, adjustedSize); + + handles[index] = GCHandle.Alloc(tmp, GCHandleType.Pinned); + long p = (long)handles[index].AddrOfPinnedObject(); + pointers[index] = (p + (sectorSize - 1)) & ~(sectorSize - 1); + values[index] = tmp; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public override long GetPhysicalAddress(long logicalAddress) + { + // Offset within page + int offset = (int)(logicalAddress & ((1L << LogPageSizeBits) - 1)); + + // Index of page within the circular buffer + int pageIndex = (int)((logicalAddress >> LogPageSizeBits) & (BufferSize - 1)); + return *(nativePointers + pageIndex) + offset; + } + + protected override bool IsAllocated(int pageIndex) + { + return values[pageIndex] != null; + } + + protected override void WriteAsync(long flushPage, IOCompletionCallback callback, PageAsyncFlushResult asyncResult) + { + WriteAsync((IntPtr)pointers[flushPage % BufferSize], + (ulong)(AlignedPageSizeBytes * flushPage), + (uint)AlignedPageSizeBytes, + callback, + asyncResult, device); + } + + protected override void 
WriteAsyncToDevice + (long startPage, long flushPage, int pageSize, IOCompletionCallback callback, + PageAsyncFlushResult asyncResult, IDevice device, IDevice objectLogDevice) + { + var alignedPageSize = (pageSize + (sectorSize - 1)) & ~(sectorSize - 1); + + WriteAsync((IntPtr)pointers[flushPage % BufferSize], + (ulong)(AlignedPageSizeBytes * (flushPage - startPage)), + (uint)alignedPageSize, callback, asyncResult, + device); + } + + /// + /// Get start logical address + /// + /// + /// + public override long GetStartLogicalAddress(long page) + { + return page << LogPageSizeBits; + } + + + /// + /// Get first valid logical address + /// + /// + /// + public override long GetFirstValidLogicalAddress(long page) + { + if (page == 0) + return (page << LogPageSizeBits) + Constants.kFirstValidAddress; + + return page << LogPageSizeBits; + } + + protected override void ClearPage(long page, int offset) + { + if (offset == 0) + Array.Clear(values[page % BufferSize], offset, values[page % BufferSize].Length - offset); + else + { + // Adjust array offset for cache alignment + offset += (int)(pointers[page % BufferSize] - (long)handles[page % BufferSize].AddrOfPinnedObject()); + Array.Clear(values[page % BufferSize], offset, values[page % BufferSize].Length - offset); + } + } + + /// + /// Delete in-memory portion of the log + /// + internal override void DeleteFromMemory() + { + for (int i = 0; i < values.Length; i++) + { + if (handles[i].IsAllocated) + handles[i].Free(); + values[i] = null; + } + handles = null; + pointers = null; + values = null; + } + + + private void WriteAsync(IntPtr alignedSourceAddress, ulong alignedDestinationAddress, uint numBytesToWrite, + IOCompletionCallback callback, PageAsyncFlushResult asyncResult, + IDevice device) + { + if (asyncResult.partial) + { + // Write only required bytes within the page + int aligned_start = (int)((asyncResult.fromAddress - (asyncResult.page << LogPageSizeBits))); + aligned_start = (aligned_start / sectorSize) * sectorSize; + + int aligned_end = (int)((asyncResult.untilAddress - (asyncResult.page << LogPageSizeBits))); + aligned_end = ((aligned_end + (sectorSize - 1)) & ~(sectorSize - 1)); + + numBytesToWrite = (uint)(aligned_end - aligned_start); + device.WriteAsync(alignedSourceAddress + aligned_start, alignedDestinationAddress + (ulong)aligned_start, numBytesToWrite, callback, asyncResult); + } + else + { + device.WriteAsync(alignedSourceAddress, alignedDestinationAddress, + numBytesToWrite, callback, asyncResult); + } + } + + protected override void ReadAsync( + ulong alignedSourceAddress, int destinationPageIndex, uint aligned_read_length, + IOCompletionCallback callback, PageAsyncReadResult asyncResult, IDevice device, IDevice objlogDevice) + { + device.ReadAsync(alignedSourceAddress, (IntPtr)pointers[destinationPageIndex], + aligned_read_length, callback, asyncResult); + } + + /// + /// Invoked by users to obtain a record from disk. It uses sector aligned memory to read + /// the record efficiently into memory. 
+ /// + /// + /// + /// + /// + /// + protected override void AsyncReadRecordObjectsToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext context, SectorAlignedMemory result = default(SectorAlignedMemory)) + { + throw new InvalidOperationException("AsyncReadRecordObjectsToMemory invalid for BlittableAllocator"); + } + + /// + /// Retrieve objects from object log + /// + /// + /// + /// + protected override bool RetrievedFullRecord(byte* record, ref AsyncIOContext ctx) + { + return true; + } + + + public override ref Key GetContextRecordKey(ref AsyncIOContext ctx) + { + return ref GetKey((long)ctx.record.GetValidPointer()); + } + + public override ref Value GetContextRecordValue(ref AsyncIOContext ctx) + { + return ref GetValue((long)ctx.record.GetValidPointer()); + } + + public override IHeapContainer GetKeyContainer(ref Key key) + { + if (fixedSizeKey) return new StandardHeapContainer(ref key); + else return new VarLenHeapContainer(ref key, KeyLength, bufferPool); + } + + public override IHeapContainer GetValueContainer(ref Value value) + { + if (fixedSizeValue) return new StandardHeapContainer(ref value); + else return new VarLenHeapContainer(ref value, ValueLength, bufferPool); + } + + /// + /// Whether KVS has keys to serialize/deserialize + /// + /// + public override bool KeyHasObjects() + { + return false; + } + + /// + /// Whether KVS has values to serialize/deserialize + /// + /// + public override bool ValueHasObjects() + { + return false; + } + + public override long[] GetSegmentOffsets() + { + return null; + } + + internal override void PopulatePage(byte* src, int required_bytes, long destinationPage) + { + throw new Exception("BlittableAllocator memory pages are sector aligned - use direct copy"); + // Buffer.MemoryCopy(src, (void*)pointers[destinationPage % BufferSize], required_bytes, required_bytes); + } + + /// + /// Iterator interface for scanning FASTER log + /// + /// + /// + /// + /// + public override IFasterScanIterator Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode) + { + return new VariableLengthBlittableScanIterator(this, beginAddress, endAddress, scanBufferingMode); + } + + + /// + /// Read pages from specified device + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + internal void AsyncReadPagesFromDeviceToFrame( + long readPageStart, + int numPages, + long untilAddress, + IOCompletionCallback callback, + TContext context, + BlittableFrame frame, + out CountdownEvent completed, + long devicePageOffset = 0, + IDevice device = null, IDevice objectLogDevice = null) + { + var usedDevice = device; + IDevice usedObjlogDevice = objectLogDevice; + + if (device == null) + { + usedDevice = this.device; + } + + completed = new CountdownEvent(numPages); + for (long readPage = readPageStart; readPage < (readPageStart + numPages); readPage++) + { + int pageIndex = (int)(readPage % frame.frameSize); + if (frame.frame[pageIndex] == null) + { + frame.Allocate(pageIndex); + } + else + { + frame.Clear(pageIndex); + } + var asyncResult = new PageAsyncReadResult() + { + page = readPage, + context = context, + handle = completed, + frame = frame + }; + + ulong offsetInFile = (ulong)(AlignedPageSizeBytes * readPage); + + uint readLength = (uint)AlignedPageSizeBytes; + long adjustedUntilAddress = (AlignedPageSizeBytes * (untilAddress >> LogPageSizeBits) + (untilAddress & PageSizeMask)); + + if (adjustedUntilAddress > 0 && ((adjustedUntilAddress - (long)offsetInFile) < PageSize)) + { + readLength = 
(uint)(adjustedUntilAddress - (long)offsetInFile); + readLength = (uint)((readLength + (sectorSize - 1)) & ~(sectorSize - 1)); + } + + if (device != null) + offsetInFile = (ulong)(AlignedPageSizeBytes * (readPage - devicePageOffset)); + + usedDevice.ReadAsync(offsetInFile, (IntPtr)frame.pointers[pageIndex], readLength, callback, asyncResult); + } + } + } +} + + diff --git a/ZeroLevel/Services/FASTER/Allocator/VarLenBlittableScanIterator.cs b/ZeroLevel/Services/FASTER/Allocator/VarLenBlittableScanIterator.cs new file mode 100644 index 0000000..bce1991 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Allocator/VarLenBlittableScanIterator.cs @@ -0,0 +1,228 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Threading; +using System.Diagnostics; + +namespace FASTER.core +{ + /// + /// Scan iterator for hybrid log + /// + public class VariableLengthBlittableScanIterator : IFasterScanIterator + where Key : new() + where Value : new() + { + private readonly int frameSize; + private readonly VariableLengthBlittableAllocator hlog; + private readonly long beginAddress, endAddress; + private readonly BlittableFrame frame; + private readonly CountdownEvent[] loaded; + + private bool first = true; + private long currentAddress, nextAddress; + private long currentPhysicalAddress; + + /// + /// Current address + /// + public long CurrentAddress => currentAddress; + + /// + /// Constructor + /// + /// + /// + /// + /// + public unsafe VariableLengthBlittableScanIterator(VariableLengthBlittableAllocator hlog, long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode) + { + this.hlog = hlog; + + if (beginAddress == 0) + beginAddress = hlog.GetFirstValidLogicalAddress(0); + + this.beginAddress = beginAddress; + this.endAddress = endAddress; + currentAddress = -1; + nextAddress = beginAddress; + + if (scanBufferingMode == ScanBufferingMode.SinglePageBuffering) + frameSize = 1; + else if (scanBufferingMode == ScanBufferingMode.DoublePageBuffering) + frameSize = 2; + else if (scanBufferingMode == ScanBufferingMode.NoBuffering) + { + frameSize = 0; + return; + } + + frame = new BlittableFrame(frameSize, hlog.PageSize, hlog.GetDeviceSectorSize()); + loaded = new CountdownEvent[frameSize]; + + // Only load addresses flushed to disk + if (nextAddress < hlog.HeadAddress) + { + var frameNumber = (nextAddress >> hlog.LogPageSizeBits) % frameSize; + hlog.AsyncReadPagesFromDeviceToFrame + (nextAddress >> hlog.LogPageSizeBits, + 1, endAddress, AsyncReadPagesCallback, Empty.Default, + frame, out loaded[frameNumber]); + } + } + + /// + /// Gets reference to current key + /// + /// + public ref Key GetKey() + { + return ref hlog.GetKey(currentPhysicalAddress); + } + + /// + /// Gets reference to current value + /// + /// + public ref Value GetValue() + { + return ref hlog.GetValue(currentPhysicalAddress); + } + + /// + /// Get next record in iterator + /// + /// + /// + public bool GetNext(out RecordInfo recordInfo) + { + recordInfo = default(RecordInfo); + + currentAddress = nextAddress; + while (true) + { + // Check for boundary conditions + if (currentAddress >= endAddress) + { + return false; + } + + if (currentAddress < hlog.BeginAddress) + { + throw new Exception("Iterator address is less than log BeginAddress " + hlog.BeginAddress); + } + + if (frameSize == 0 && currentAddress < hlog.HeadAddress) + { + throw new Exception("Iterator address is less than log HeadAddress in memory-scan mode"); + } + + var currentPage = 
currentAddress >> hlog.LogPageSizeBits; + var offset = currentAddress & hlog.PageSizeMask; + + if (currentAddress < hlog.HeadAddress) + BufferAndLoad(currentAddress, currentPage, currentPage % frameSize); + + var physicalAddress = default(long); + if (currentAddress >= hlog.HeadAddress) + physicalAddress = hlog.GetPhysicalAddress(currentAddress); + else + physicalAddress = frame.GetPhysicalAddress(currentPage % frameSize, offset); + + // Check if record fits on page, if not skip to next page + var recordSize = hlog.GetRecordSize(physicalAddress); + if ((currentAddress & hlog.PageSizeMask) + recordSize > hlog.PageSize) + { + currentAddress = (1 + (currentAddress >> hlog.LogPageSizeBits)) << hlog.LogPageSizeBits; + continue; + } + + ref var info = ref hlog.GetInfo(physicalAddress); + if (info.Invalid || info.IsNull()) + { + currentAddress += recordSize; + continue; + } + + currentPhysicalAddress = physicalAddress; + recordInfo = info; + nextAddress = currentAddress + recordSize; + return true; + } + } + + /// + /// Get next record in iterator + /// + /// + /// + /// + /// + public bool GetNext(out RecordInfo recordInfo, out Key key, out Value value) + { + throw new NotSupportedException("Use GetNext(out RecordInfo) to retrieve references to key/value"); + } + + /// + /// Dispose the iterator + /// + public void Dispose() + { + frame?.Dispose(); + } + + private unsafe void BufferAndLoad(long currentAddress, long currentPage, long currentFrame) + { + if (first || (currentAddress & hlog.PageSizeMask) == 0) + { + // Prefetch pages based on buffering mode + if (frameSize == 1) + { + if (!first) + { + hlog.AsyncReadPagesFromDeviceToFrame(currentAddress >> hlog.LogPageSizeBits, 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[currentFrame]); + } + } + else + { + var endPage = endAddress >> hlog.LogPageSizeBits; + if ((endPage > currentPage) && + ((endPage > currentPage + 1) || ((endAddress & hlog.PageSizeMask) != 0))) + { + hlog.AsyncReadPagesFromDeviceToFrame(1 + (currentAddress >> hlog.LogPageSizeBits), 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[(currentPage + 1) % frameSize]); + } + } + first = false; + } + loaded[currentFrame].Wait(); + } + + private unsafe void AsyncReadPagesCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + var result = (PageAsyncReadResult)Overlapped.Unpack(overlap).AsyncResult; + + if (result.freeBuffer1 != null) + { + hlog.PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, result.page); + result.freeBuffer1.Return(); + result.freeBuffer1 = null; + } + + if (result.handle != null) + { + result.handle.Signal(); + } + + Interlocked.MemoryBarrier(); + Overlapped.Free(overlap); + } + } +} + + diff --git a/ZeroLevel/Services/FASTER/Device/Devices.cs b/ZeroLevel/Services/FASTER/Device/Devices.cs new file mode 100644 index 0000000..4bfa0a1 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Device/Devices.cs @@ -0,0 +1,52 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.IO; +using System.Linq.Expressions; +using System.Runtime.InteropServices; + +namespace FASTER.core +{ + /// + /// Factory to create FASTER objects + /// + public static class Devices + { + /// + /// This value is supplied for capacity when the device does not have a specified limit. 
+ /// + public const long CAPACITY_UNSPECIFIED = -1; + private const string EMULATED_STORAGE_STRING = "UseDevelopmentStorage=true;"; + private const string TEST_CONTAINER = "test"; + + /// + /// Create a storage device for the log + /// + /// Path to file that will store the log (empty for null device) + /// Whether we try to preallocate the file on creation + /// Delete files on close + /// The maximal number of bytes this storage device can accommondate, or CAPACITY_UNSPECIFIED if there is no such limit + /// Whether to recover device metadata from existing files + /// Device instance + public static IDevice CreateLogDevice(string logPath, bool preallocateFile = true, bool deleteOnClose = false, long capacity = CAPACITY_UNSPECIFIED, bool recoverDevice = false) + { + if (string.IsNullOrWhiteSpace(logPath)) + return new NullDevice(); + + IDevice logDevice; + +#if DOTNETCORE + if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + logDevice = new ManagedLocalStorageDevice(logPath, preallocateFile, deleteOnClose, capacity, recoverDevice); + } + else +#endif + { + logDevice = new LocalStorageDevice(logPath, preallocateFile, deleteOnClose, true, capacity, recoverDevice); + } + return logDevice; + } + } +} diff --git a/ZeroLevel/Services/FASTER/Device/IDevice.cs b/ZeroLevel/Services/FASTER/Device/IDevice.cs new file mode 100644 index 0000000..8833a71 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Device/IDevice.cs @@ -0,0 +1,161 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Interface for devices + /// + public interface IDevice + { + /// + /// Size of sector + /// + uint SectorSize { get; } + + /// + /// Name of device + /// + string FileName { get; } + + /// + /// Returns the maximum capacity of the storage device, in number of bytes. + /// If returned CAPACITY_UNSPECIFIED, the storage device has no specfied capacity limit. + /// + long Capacity { get; } + + /// + /// A device breaks up each logical log into multiple self-contained segments that are of the same size. + /// It is an atomic unit of data that cannot be partially present on a device (i.e. either the entire segment + /// is present or no data from the segment is present). Examples of this include files or named blobs. This + /// property returns the size of each segment. + /// + long SegmentSize { get; } + + /// + /// The index of the first segment present on this device + /// + int StartSegment { get; } + + /// + /// The index of the last segment present on this device + /// + int EndSegment { get; } + + /// + /// Initialize device. This function is used to pass optional information that may only be known after + /// FASTER initialization (whose constructor takes in IDevice upfront). Implementation are free to ignore + /// information if it does not need the supplied information. + /// + /// This is a bit of a hack. 
+ /// + /// + /// + /// The instance of the epoch protection framework to use, if needed + /// + void Initialize(long segmentSize, LightEpoch epoch = null); + + + /* Segmented addressing API */ + /// + /// Write + /// + /// + /// + /// + /// + /// + /// + void WriteAsync(IntPtr sourceAddress, int segmentId, ulong destinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult); + + /// + /// Read + /// + /// + /// + /// + /// + /// + /// + void ReadAsync(int segmentId, ulong sourceAddress, IntPtr destinationAddress, uint readLength, IOCompletionCallback callback, IAsyncResult asyncResult); + + /* Direct addressing API */ + + /// + /// Write + /// + /// + /// + /// + /// + /// + void WriteAsync(IntPtr alignedSourceAddress, ulong alignedDestinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult); + + /// + /// Read + /// + /// + /// + /// + /// + /// + void ReadAsync(ulong alignedSourceAddress, IntPtr alignedDestinationAddress, uint aligned_read_length, IOCompletionCallback callback, IAsyncResult asyncResult); + + /// + /// Truncates the log until the given address. The truncated portion should no longer be accessed as the device is no longer responsible for + /// its maintenance, but physical deletion may not happen immediately. + /// + /// upper bound of truncated address + /// callback to invoke when truncation is complete + /// result to be passed to the callback + void TruncateUntilAddressAsync(long toAddress, AsyncCallback callback, IAsyncResult result); + + /// + /// Truncates the log until the given address. The truncated portion should no longer be accessed as the device is no longer responsible for + /// its maintenance, but physical deletion may not happen immediately. This version of the function can block. + /// + /// upper bound of truncated address + void TruncateUntilAddress(long toAddress); + + /// + /// Truncates the log until the given segment. Physical deletion of the given segments are guaranteed to have happened when the callback is invoked. + /// + /// the largest (in index) segment to truncate + /// callback to invoke when truncation is complete + /// result to be passed to the callback + void TruncateUntilSegmentAsync(int toSegment, AsyncCallback callback, IAsyncResult result); + + /// + /// Truncates the log until the given segment. Physical deletion of the given segments are guaranteed to have happened when the function returns. + /// This version of the function can block. + /// + /// the largest (in index) segment to truncate + void TruncateUntilSegment(int toSegment); + + /// + /// Removes a single segment from the device. This function should not normally be called. + /// Instead, use + /// + /// index of the segment to remov + /// callback to invoke when removal is complete + /// result to be passed to the callback + void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result); + + /// + /// Removes a single segment from the device. This function should not normally be called. + /// Instead, use + /// + /// index of the segment to remov + void RemoveSegment(int segment); + + /* Close */ + + /// + /// Close + /// + void Close(); + } +} diff --git a/ZeroLevel/Services/FASTER/Device/LocalStorageDevice.cs b/ZeroLevel/Services/FASTER/Device/LocalStorageDevice.cs new file mode 100644 index 0000000..aa6d0aa --- /dev/null +++ b/ZeroLevel/Services/FASTER/Device/LocalStorageDevice.cs @@ -0,0 +1,303 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT license. + +using Microsoft.Win32.SafeHandles; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Runtime.InteropServices; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Local storage device + /// + public class LocalStorageDevice : StorageDeviceBase + { + private readonly bool preallocateFile; + private readonly bool deleteOnClose; + private readonly bool disableFileBuffering; + private readonly SafeConcurrentDictionary logHandles; + + /// + /// Constructor + /// + /// File name (or prefix) with path + /// + /// + /// + /// The maximum number of bytes this storage device can accommondate, or CAPACITY_UNSPECIFIED if there is no such limit + /// Whether to recover device metadata from existing files + public LocalStorageDevice(string filename, + bool preallocateFile = false, + bool deleteOnClose = false, + bool disableFileBuffering = true, + long capacity = Devices.CAPACITY_UNSPECIFIED, + bool recoverDevice = false) + : base(filename, GetSectorSize(filename), capacity) + + { + Native32.EnableProcessPrivileges(); + this.preallocateFile = preallocateFile; + this.deleteOnClose = deleteOnClose; + this.disableFileBuffering = disableFileBuffering; + logHandles = new SafeConcurrentDictionary(); + if (recoverDevice) + RecoverFiles(); + } + + private void RecoverFiles() + { + FileInfo fi = new FileInfo(FileName); // may not exist + DirectoryInfo di = fi.Directory; + if (!di.Exists) return; + + string bareName = fi.Name; + + List segids = new List(); + foreach (FileInfo item in di.GetFiles(bareName + "*")) + { + segids.Add(Int32.Parse(item.Name.Replace(bareName, "").Replace(".", ""))); + } + segids.Sort(); + + int prevSegmentId = -1; + foreach (int segmentId in segids) + { + if (segmentId != prevSegmentId + 1) + { + startSegment = segmentId; + } + else + { + endSegment = segmentId; + } + prevSegmentId = segmentId; + } + // No need to populate map because logHandles use Open or create on files. 
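+            // The loop above infers the live segment range from the files found on disk:
+            // startSegment is reset at every gap in the sorted segment ids, so it ends up at the
+            // start of the last contiguous run of segment files, and endSegment advances to the
+            // last id in that run.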
+ } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public override unsafe void ReadAsync(int segmentId, ulong sourceAddress, + IntPtr destinationAddress, + uint readLength, + IOCompletionCallback callback, + IAsyncResult asyncResult) + { + var logHandle = GetOrAddHandle(segmentId); + + Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult); + NativeOverlapped* ovNative = ov.UnsafePack(callback, IntPtr.Zero); + ovNative->OffsetLow = unchecked((int)((ulong)sourceAddress & 0xFFFFFFFF)); + ovNative->OffsetHigh = unchecked((int)(((ulong)sourceAddress >> 32) & 0xFFFFFFFF)); + + bool result = Native32.ReadFile(logHandle, + destinationAddress, + readLength, + out uint bytesRead, + ovNative); + + if (!result) + { + int error = Marshal.GetLastWin32Error(); + if (error != Native32.ERROR_IO_PENDING) + { + Overlapped.Unpack(ovNative); + Overlapped.Free(ovNative); + throw new Exception("Error reading from log file: " + error); + } + } + } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public override unsafe void WriteAsync(IntPtr sourceAddress, + int segmentId, + ulong destinationAddress, + uint numBytesToWrite, + IOCompletionCallback callback, + IAsyncResult asyncResult) + { + var logHandle = GetOrAddHandle(segmentId); + + Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult); + NativeOverlapped* ovNative = ov.UnsafePack(callback, IntPtr.Zero); + ovNative->OffsetLow = unchecked((int)(destinationAddress & 0xFFFFFFFF)); + ovNative->OffsetHigh = unchecked((int)((destinationAddress >> 32) & 0xFFFFFFFF)); + + bool result = Native32.WriteFile(logHandle, + sourceAddress, + numBytesToWrite, + out uint bytesWritten, + ovNative); + + if (!result) + { + int error = Marshal.GetLastWin32Error(); + if (error != Native32.ERROR_IO_PENDING) + { + Overlapped.Unpack(ovNative); + Overlapped.Free(ovNative); + throw new Exception("Error writing to log file: " + error); + } + } + } + + /// + /// + /// + /// + public override void RemoveSegment(int segment) + { + if (logHandles.TryRemove(segment, out SafeFileHandle logHandle)) + { + logHandle.Dispose(); + Native32.DeleteFileW(GetSegmentName(segment)); + } + } + + /// + /// + /// + /// + /// + /// + public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result) + { + RemoveSegment(segment); + callback(result); + } + + // It may be somewhat inefficient to use the default async calls from the base class when the underlying + // method is inherently synchronous. But just for delete (which is called infrequently and off the + // critical path) such inefficiency is probably negligible. + + /// + /// Close device + /// + public override void Close() + { + foreach (var logHandle in logHandles.Values) + logHandle.Dispose(); + } + + /// + /// + /// + /// + /// + protected string GetSegmentName(int segmentId) + { + return FileName + "." + segmentId; + } + + /// + /// + /// + /// + /// + // Can be used to pre-load handles, e.g., after a checkpoint + protected SafeFileHandle GetOrAddHandle(int _segmentId) + { + return logHandles.GetOrAdd(_segmentId, segmentId => CreateHandle(segmentId)); + } + + private static uint GetSectorSize(string filename) + { + if (!Native32.GetDiskFreeSpace(filename.Substring(0, 3), + out uint lpSectorsPerCluster, + out uint _sectorSize, + out uint lpNumberOfFreeClusters, + out uint lpTotalNumberOfClusters)) + { + Debug.WriteLine("Unable to retrieve information for disk " + filename.Substring(0, 3) + " - check if the disk is available and you have specified the full path with drive name. 
Assuming sector size of 512 bytes."); + _sectorSize = 512; + } + return _sectorSize; + } + + private SafeFileHandle CreateHandle(int segmentId) + { + uint fileAccess = Native32.GENERIC_READ | Native32.GENERIC_WRITE; + uint fileShare = unchecked(((uint)FileShare.ReadWrite & ~(uint)FileShare.Inheritable)); + uint fileCreation = unchecked((uint)FileMode.OpenOrCreate); + uint fileFlags = Native32.FILE_FLAG_OVERLAPPED; + + if (this.disableFileBuffering) + { + fileFlags = fileFlags | Native32.FILE_FLAG_NO_BUFFERING; + } + + if (deleteOnClose) + { + fileFlags = fileFlags | Native32.FILE_FLAG_DELETE_ON_CLOSE; + + // FILE_SHARE_DELETE allows multiple FASTER instances to share a single log directory and each can specify deleteOnClose. + // This will allow the files to persist until all handles across all instances have been closed. + fileShare = fileShare | Native32.FILE_SHARE_DELETE; + } + + var logHandle = Native32.CreateFileW( + GetSegmentName(segmentId), + fileAccess, fileShare, + IntPtr.Zero, fileCreation, + fileFlags, IntPtr.Zero); + + if (logHandle.IsInvalid) + { + var error = Marshal.GetLastWin32Error(); + throw new IOException($"Error creating log file for {GetSegmentName(segmentId)}, error: {error}", Native32.MakeHRFromErrorCode(error)); + } + + if (preallocateFile) + SetFileSize(FileName, logHandle, segmentSize); + + try + { + ThreadPool.BindHandle(logHandle); + } + catch (Exception e) + { + throw new Exception("Error binding log handle for " + GetSegmentName(segmentId) + ": " + e.ToString()); + } + return logHandle; + } + + /// Sets file size to the specified value. + /// Does not reset file seek pointer to original location. + private bool SetFileSize(string filename, SafeFileHandle logHandle, long size) + { + if (segmentSize <= 0) + return false; + + if (Native32.EnableVolumePrivileges(filename, logHandle)) + { + return Native32.SetFileSize(logHandle, size); + } + + int lodist = (int)size; + int hidist = (int)(size >> 32); + Native32.SetFilePointer(logHandle, lodist, ref hidist, Native32.EMoveMethod.Begin); + if (!Native32.SetEndOfFile(logHandle)) return false; + return true; + } + } +} diff --git a/ZeroLevel/Services/FASTER/Device/ManagedLocalStorageDevice.cs b/ZeroLevel/Services/FASTER/Device/ManagedLocalStorageDevice.cs new file mode 100644 index 0000000..034c304 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Device/ManagedLocalStorageDevice.cs @@ -0,0 +1,302 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
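+
+// A minimal construction sketch (hedged): callers normally do not instantiate this class directly.
+// Devices.CreateLogDevice, defined earlier in this patch, selects ManagedLocalStorageDevice on
+// non-Windows platforms under DOTNETCORE and LocalStorageDevice otherwise. The path below is
+// illustrative only:
+//
+//   IDevice log = Devices.CreateLogDevice("/tmp/hlog.log", preallocateFile: false, deleteOnClose: true);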
+ +using Microsoft.Win32.SafeHandles; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Runtime.InteropServices; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Managed device using .NET streams + /// + public class ManagedLocalStorageDevice : StorageDeviceBase + { + private readonly bool preallocateFile; + private readonly bool deleteOnClose; + private readonly ConcurrentDictionary logHandles; + private SectorAlignedBufferPool pool; + + /// + /// + /// + /// File name (or prefix) with path + /// + /// + /// The maximal number of bytes this storage device can accommondate, or CAPACITY_UNSPECIFIED if there is no such limit + /// Whether to recover device metadata from existing files + public ManagedLocalStorageDevice(string filename, bool preallocateFile = false, bool deleteOnClose = false, long capacity = Devices.CAPACITY_UNSPECIFIED, bool recoverDevice = false) + : base(filename, GetSectorSize(filename), capacity) + { + pool = new SectorAlignedBufferPool(1, 1); + + this.preallocateFile = preallocateFile; + this.deleteOnClose = deleteOnClose; + logHandles = new ConcurrentDictionary(); + if (recoverDevice) + RecoverFiles(); + } + + + private void RecoverFiles() + { + FileInfo fi = new FileInfo(FileName); // may not exist + DirectoryInfo di = fi.Directory; + if (!di.Exists) return; + + string bareName = fi.Name; + + List segids = new List(); + foreach (FileInfo item in di.GetFiles(bareName + "*")) + { + segids.Add(Int32.Parse(item.Name.Replace(bareName, "").Replace(".", ""))); + } + segids.Sort(); + + int prevSegmentId = -1; + foreach (int segmentId in segids) + { + if (segmentId != prevSegmentId + 1) + { + startSegment = segmentId; + } + else + { + endSegment = segmentId; + } + prevSegmentId = segmentId; + } + // No need to populate map because logHandles use Open or create on files. 
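+            // Segment-range inference here mirrors LocalStorageDevice.RecoverFiles: startSegment and
+            // endSegment are derived from the last contiguous run of segment files on disk.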
+ } + + + + + class ReadCallbackWrapper + { + readonly Stream logHandle; + readonly IOCompletionCallback callback; + readonly IAsyncResult asyncResult; + SectorAlignedMemory memory; + readonly IntPtr destinationAddress; + readonly uint readLength; + + public ReadCallbackWrapper(Stream logHandle, IOCompletionCallback callback, IAsyncResult asyncResult, SectorAlignedMemory memory, IntPtr destinationAddress, uint readLength) + { + this.logHandle = logHandle; + this.callback = callback; + this.asyncResult = asyncResult; + this.memory = memory; + this.destinationAddress = destinationAddress; + this.readLength = readLength; + } + + public unsafe void Callback(IAsyncResult result) + { + uint errorCode = 0; + try + { + logHandle.EndRead(result); + fixed (void* source = memory.buffer) + { + Buffer.MemoryCopy(source, (void*)destinationAddress, readLength, readLength); + } + } + catch + { + errorCode = 1; + } + + memory.Return(); + Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult); + callback(errorCode, 0, ov.UnsafePack(callback, IntPtr.Zero)); + } + } + + class WriteCallbackWrapper + { + readonly Stream logHandle; + readonly IOCompletionCallback callback; + readonly IAsyncResult asyncResult; + SectorAlignedMemory memory; + + public WriteCallbackWrapper(Stream logHandle, IOCompletionCallback callback, IAsyncResult asyncResult, SectorAlignedMemory memory) + { + this.callback = callback; + this.asyncResult = asyncResult; + this.memory = memory; + this.logHandle = logHandle; + } + + public unsafe void Callback(IAsyncResult result) + { + uint errorCode = 0; + try + { + logHandle.EndWrite(result); + } + catch + { + errorCode = 1; + } + + memory.Return(); + Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult); + callback(errorCode, 0, ov.UnsafePack(callback, IntPtr.Zero)); + } + } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public override unsafe void ReadAsync(int segmentId, ulong sourceAddress, + IntPtr destinationAddress, + uint readLength, + IOCompletionCallback callback, + IAsyncResult asyncResult) + { + var logHandle = GetOrAddHandle(segmentId); + var memory = pool.Get((int)readLength); + logHandle.Seek((long)sourceAddress, SeekOrigin.Begin); + logHandle.BeginRead(memory.buffer, 0, (int)readLength, + new ReadCallbackWrapper(logHandle, callback, asyncResult, memory, destinationAddress, readLength).Callback, null); + } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public override unsafe void WriteAsync(IntPtr sourceAddress, + int segmentId, + ulong destinationAddress, + uint numBytesToWrite, + IOCompletionCallback callback, + IAsyncResult asyncResult) + { + var logHandle = GetOrAddHandle(segmentId); + var memory = pool.Get((int)numBytesToWrite); + + fixed (void* destination = memory.buffer) + { + Buffer.MemoryCopy((void*)sourceAddress, destination, numBytesToWrite, numBytesToWrite); + } + logHandle.Seek((long)destinationAddress, SeekOrigin.Begin); + logHandle.BeginWrite(memory.buffer, 0, (int)numBytesToWrite, + new WriteCallbackWrapper(logHandle, callback, asyncResult, memory).Callback, null); + } + + /// + /// + /// + /// + public override void RemoveSegment(int segment) + { + if (logHandles.TryRemove(segment, out Stream logHandle)) + { + logHandle.Dispose(); + File.Delete(GetSegmentName(segment)); + } + } + + /// + /// + /// + /// + /// + /// + public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result) + { + RemoveSegment(segment); + callback(result); + } + + /// + /// + /// + public override void Close() + { + 
foreach (var logHandle in logHandles.Values) + logHandle.Dispose(); + pool.Free(); + } + + + private string GetSegmentName(int segmentId) + { + return FileName + "." + segmentId; + } + + private static uint GetSectorSize(string filename) + { +#if DOTNETCORE + if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + { + Debug.WriteLine("Assuming 512 byte sector alignment for disk with file " + filename); + return 512; + } +#endif + if (!Native32.GetDiskFreeSpace(filename.Substring(0, 3), + out uint lpSectorsPerCluster, + out uint _sectorSize, + out uint lpNumberOfFreeClusters, + out uint lpTotalNumberOfClusters)) + { + Debug.WriteLine("Unable to retrieve information for disk " + filename.Substring(0, 3) + " - check if the disk is available and you have specified the full path with drive name. Assuming sector size of 512 bytes."); + _sectorSize = 512; + } + return _sectorSize; + } + + private Stream CreateHandle(int segmentId) + { + FileOptions fo = FileOptions.WriteThrough; + fo |= FileOptions.Asynchronous; + if (deleteOnClose) + fo |= FileOptions.DeleteOnClose; + + var logHandle = new FileStream( + GetSegmentName(segmentId), FileMode.OpenOrCreate, + FileAccess.ReadWrite, FileShare.ReadWrite, 4096, fo); + + if (preallocateFile && segmentSize != -1) + SetFileSize(FileName, logHandle, segmentSize); + + return logHandle; + } + + private Stream GetOrAddHandle(int _segmentId) + { + return logHandles.GetOrAdd(_segmentId, segmentId => CreateHandle(segmentId)); + } + + /// + /// Sets file size to the specified value. + /// Does not reset file seek pointer to original location. + /// + /// + /// + /// + /// + private bool SetFileSize(string filename, Stream logHandle, long size) + { + logHandle.SetLength(size); + return true; + } + } +} diff --git a/ZeroLevel/Services/FASTER/Device/NullDevice.cs b/ZeroLevel/Services/FASTER/Device/NullDevice.cs new file mode 100644 index 0000000..c8cca2c --- /dev/null +++ b/ZeroLevel/Services/FASTER/Device/NullDevice.cs @@ -0,0 +1,88 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
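+
+// NullDevice completes every read and write callback immediately without touching storage, which
+// makes it suitable for purely in-memory runs. It is also what the factory returns for an empty
+// path; a hedged sketch of that path:
+//
+//   IDevice noop = Devices.CreateLogDevice("");   // blank log path yields a NullDevice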
+ +using System; +using System.Threading; + +namespace FASTER.core +{ + /// + /// + /// + public class NullDevice : StorageDeviceBase + { + /// + /// + /// + public NullDevice() : base("null", 512, Devices.CAPACITY_UNSPECIFIED) + { + } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public override unsafe void ReadAsync(int segmentId, ulong alignedSourceAddress, IntPtr alignedDestinationAddress, uint aligned_read_length, IOCompletionCallback callback, IAsyncResult asyncResult) + { + alignedSourceAddress = ((ulong)segmentId << 30) | alignedSourceAddress; + + Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult); + NativeOverlapped* ov_native = ov.UnsafePack(callback, IntPtr.Zero); + ov_native->OffsetLow = unchecked((int)(alignedSourceAddress & 0xFFFFFFFF)); + ov_native->OffsetHigh = unchecked((int)((alignedSourceAddress >> 32) & 0xFFFFFFFF)); + + callback(0, aligned_read_length, ov_native); + } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public override unsafe void WriteAsync(IntPtr alignedSourceAddress, int segmentId, ulong alignedDestinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult) + { + alignedDestinationAddress = ((ulong)segmentId << 30) | alignedDestinationAddress; + + Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult); + NativeOverlapped* ov_native = ov.UnsafePack(callback, IntPtr.Zero); + + ov_native->OffsetLow = unchecked((int)(alignedDestinationAddress & 0xFFFFFFFF)); + ov_native->OffsetHigh = unchecked((int)((alignedDestinationAddress >> 32) & 0xFFFFFFFF)); + + callback(0, numBytesToWrite, ov_native); + } + + /// + /// + /// + /// + public override void RemoveSegment(int segment) + { + // No-op + } + + /// + /// + /// + /// + /// + /// + public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result) => callback(result); + + /// + /// + /// + public override void Close() + { + } + } +} diff --git a/ZeroLevel/Services/FASTER/Device/ShardedStorageDevice.cs b/ZeroLevel/Services/FASTER/Device/ShardedStorageDevice.cs new file mode 100644 index 0000000..efa6e3d --- /dev/null +++ b/ZeroLevel/Services/FASTER/Device/ShardedStorageDevice.cs @@ -0,0 +1,312 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Text; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Interface that encapsulates a sharding strategy that is used by . This + /// allows users to customize their sharding behaviors. Some default implementations are supplied for common + /// partitioning schemes. + /// + interface IPartitionScheme + { + /// + /// A list of that represents the shards. Indexes into this list will be + /// used as unique identifiers for the shards. + /// + IList Devices { get; } + + /// + /// Maps a range in the unified logical address space into a contiguous physical chunk on a shard's address space. + /// Because the given range may be sharded across multiple devices, only the largest contiguous chunk starting from + /// start address but smaller than end address is returned in shard, shardStartAddress, and shardEndAddress. + /// + /// start address of the range to map in the logical address space + /// end address of the range to map in the logical address space + /// the shard (potentially part of) the given range resides in, given as index into + /// start address translated into physical start address on the returned shard + /// + /// physical address of the end of the part of the range on the returned shard. 
This is not necessarily a translation of the end address + /// given, as the tail of the range maybe on (a) different device(s). + /// + /// + /// the logical address translated from the returned shardEndAddress. If this is not equal to the given end address, the caller is + /// expected to repeatedly call this method using the returned value as the new startAddress until the entire original range is + /// covered. + /// + long MapRange(long startAddress, long endAddress, out int shard, out long shardStartAddress, out long shardEndAddress); + + /// + /// Maps the sector size of a composed device into sector sizes for each shard + /// + /// sector size of the composed device + /// the shard + /// sector size on shard + long MapSectorSize(long sectorSize, int shard); + } + + /// + /// Uniformly shards data across given devices. + /// + class UniformPartitionScheme : IPartitionScheme + { + public IList Devices { get; } + private readonly long chunkSize; + + /// + /// Constructs a UniformPartitionScheme to shard data uniformly across given devices. Suppose we have 3 devices and the following logical write: + /// [chunk 1][chunk 2][chunk 3][chunk 4]... + /// chunk 1 is written on device 0, 2 on device 1, 3 on device 2, 4 on device 0, etc. + /// + /// size of each chunk + /// the devices to compose from + public UniformPartitionScheme(long chunkSize, IList devices) + { + Debug.Assert(devices.Count != 0, "There cannot be zero shards"); + Debug.Assert(chunkSize > 0, "chunk size should not be negative"); + Debug.Assert((chunkSize & (chunkSize - 1)) == 0, "Chunk size must be a power of 2"); + this.Devices = devices; + this.chunkSize = chunkSize; + foreach (IDevice device in Devices) + { + Debug.Assert(chunkSize % device.SectorSize == 0, "A single device sector cannot be partitioned"); + } + } + + /// + /// vararg version of + /// + /// + /// + public UniformPartitionScheme(long chunkSize, params IDevice[] devices) : this(chunkSize, (IList)devices) + { + } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public long MapRange(long startAddress, long endAddress, out int shard, out long shardStartAddress, out long shardEndAddress) + { + long chunkId = startAddress / chunkSize; + shard = (int)(chunkId % Devices.Count); + shardStartAddress = chunkId / Devices.Count * chunkSize + startAddress % chunkSize; + long chunkEndAddress = (chunkId + 1) * chunkSize; + if (endAddress > chunkEndAddress) + { + shardEndAddress = shardStartAddress + chunkSize; + return chunkEndAddress; + } + else + { + shardEndAddress = endAddress - startAddress + shardStartAddress; + return endAddress; + } + } + + /// + /// + /// + /// + /// + /// + public long MapSectorSize(long sectorSize, int shard) + { + var numChunks = sectorSize / chunkSize; + // ceiling of (a div b) is (a + b - 1) / b where div is mathematical division and / is integer division + return (numChunks + Devices.Count - 1) / Devices.Count * chunkSize; + } + } + + /// + /// A logically composes multiple into a single storage device + /// by sharding writes into different devices according to a supplied . The goal is to be + /// able to issue large reads and writes in parallel into multiple devices and improve throughput. Beware that this + /// code does not contain error detection or correction mechanism to cope with increased failure from more devices. 
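+    /// A hedged construction sketch (device paths and chunk size below are illustrative):
+    ///   var sharded = new ShardedStorageDevice(
+    ///       new UniformPartitionScheme(1L &lt;&lt; 20,
+    ///           Devices.CreateLogDevice("/disk0/hlog.0"),
+    ///           Devices.CreateLogDevice("/disk1/hlog.1")));
+    /// Each 1 MB chunk of the logical address space is then written round-robin across the two devices.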
+ /// + class ShardedStorageDevice : StorageDeviceBase + { + private readonly IPartitionScheme partitions; + + /// + /// Constructs a new ShardedStorageDevice with the given partition scheme + /// + /// The parition scheme to use + public ShardedStorageDevice(IPartitionScheme partitions) : base("", 512, -1) + { + this.partitions = partitions; + } + + /// + /// + /// + public override void Close() + { + foreach (IDevice device in partitions.Devices) + { + device.Close(); + } + } + + /// + /// + /// + /// + /// + public override void Initialize(long segmentSize, LightEpoch epoch) + { + base.Initialize(segmentSize, epoch); + + for (int i = 0; i < partitions.Devices.Count; i++) + { + partitions.Devices[i].Initialize(partitions.MapSectorSize(segmentSize, 0), epoch); + } + } + + /// + /// + /// + /// + /// + /// + public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result) + { + var countdown = new CountdownEvent(partitions.Devices.Count); + foreach (IDevice shard in partitions.Devices) + { + shard.RemoveSegmentAsync(segment, ar => + { + if (countdown.Signal()) + { + callback(ar); + countdown.Dispose(); + } + }, result); + } + } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public unsafe override void WriteAsync(IntPtr sourceAddress, int segmentId, ulong destinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult) + { + // Starts off in one, in order to prevent some issued writes calling the callback before all parallel writes are issued. + var countdown = new CountdownEvent(1); + long currentWriteStart = (long)destinationAddress; + long writeEnd = currentWriteStart + (long)numBytesToWrite; + uint aggregateErrorCode = 0; + while (currentWriteStart < writeEnd) + { + long newStart = partitions.MapRange(currentWriteStart, writeEnd, out int shard, out long shardStartAddress, out long shardEndAddress); + ulong writeOffset = (ulong)currentWriteStart - destinationAddress; + // Indicate that there is one more task to wait for + countdown.AddCount(); + // Because more than one device can return with an error, it is important that we remember the most recent error code we saw. (It is okay to only + // report one error out of many. It will be as if we failed on that error and cancelled all other reads, even though we issue reads in parallel and + // wait until all of them are complete in the implementation) + // Can there be races on async result as we issue writes or reads in parallel? 
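+                // Note on the protocol below: the countdown started at 1, so no per-shard completion can
+                // fire the user callback before the issuing loop finishes and the extra count is signalled.
+                // Whichever decrement finally drains the countdown reports the aggregated error code through
+                // the caller's callback; every other completion frees its own NativeOverlapped instead.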
+ partitions.Devices[shard].WriteAsync(IntPtr.Add(sourceAddress, (int)writeOffset), + segmentId, + (ulong)shardStartAddress, + (uint)(shardEndAddress - shardStartAddress), + (e, n, o) => + { + // TODO: Check if it is incorrect to ignore o + if (e != 0) aggregateErrorCode = e; + if (countdown.Signal()) + { + callback(aggregateErrorCode, n, o); + countdown.Dispose(); + } + else + { + Overlapped.Free(o); + } + }, + asyncResult); + + currentWriteStart = newStart; + } + + // TODO: Check if overlapped wrapper is handled correctly + if (countdown.Signal()) + { + Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult); + NativeOverlapped* ovNative = ov.UnsafePack(callback, IntPtr.Zero); + callback(aggregateErrorCode, numBytesToWrite, ovNative); + countdown.Dispose(); + } + } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public unsafe override void ReadAsync(int segmentId, ulong sourceAddress, IntPtr destinationAddress, uint readLength, IOCompletionCallback callback, IAsyncResult asyncResult) + { + // Starts off in one, in order to prevent some issued writes calling the callback before all parallel writes are issued. + var countdown = new CountdownEvent(1); + long currentReadStart = (long)sourceAddress; + long readEnd = currentReadStart + readLength; + uint aggregateErrorCode = 0; + while (currentReadStart < readEnd) + { + long newStart = partitions.MapRange(currentReadStart, readEnd, out int shard, out long shardStartAddress, out long shardEndAddress); + ulong writeOffset = (ulong)currentReadStart - sourceAddress; + // Because more than one device can return with an error, it is important that we remember the most recent error code we saw. (It is okay to only + // report one error out of many. It will be as if we failed on that error and cancelled all other reads, even though we issue reads in parallel and + // wait until all of them are complete in the implementation) + countdown.AddCount(); + partitions.Devices[shard].ReadAsync(segmentId, + (ulong)shardStartAddress, + IntPtr.Add(destinationAddress, (int)writeOffset), + (uint)(shardEndAddress - shardStartAddress), + (e, n, o) => + { + // TODO: this is incorrect if returned "bytes" written is allowed to be less than requested like POSIX. + if (e != 0) aggregateErrorCode = e; + if (countdown.Signal()) + { + callback(aggregateErrorCode, n, o); + countdown.Dispose(); + } + else + { + Overlapped.Free(o); + } + }, + asyncResult); + + currentReadStart = newStart; + } + + // TODO: Check handling of overlapped wrapper + if (countdown.Signal()) + { + Overlapped ov = new Overlapped(0, 0, IntPtr.Zero, asyncResult); + NativeOverlapped* ovNative = ov.UnsafePack(callback, IntPtr.Zero); + callback(aggregateErrorCode, readLength, ovNative); + countdown.Dispose(); + } + } + } +} diff --git a/ZeroLevel/Services/FASTER/Device/StorageDeviceBase.cs b/ZeroLevel/Services/FASTER/Device/StorageDeviceBase.cs new file mode 100644 index 0000000..7ab14ab --- /dev/null +++ b/ZeroLevel/Services/FASTER/Device/StorageDeviceBase.cs @@ -0,0 +1,279 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
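The MapRange contract used by the sharded device above is easiest to see with concrete numbers. The following is a minimal sketch (not part of the patch) of the splitting loop ShardedStorageDevice runs for every read and write, assuming a UniformPartitionScheme with a 1 MB chunk over two devices; deviceA and deviceB are hypothetical IDevice instances, and the construction is only possible from inside the assembly because these types are internal.

    // chunkSize = 1 MB, two shards, logical write covering [0, 2.5 MB)
    IPartitionScheme scheme = new UniformPartitionScheme(1L << 20, deviceA, deviceB);
    long start = 0, end = (5L << 20) / 2;          // 2.5 MB
    while (start < end)
    {
        long next = scheme.MapRange(start, end, out int shard, out long shardStart, out long shardEnd);
        // iteration 1: shard 0, [0, 1 MB)       -> chunk 0 lands on deviceA
        // iteration 2: shard 1, [0, 1 MB)       -> chunk 1 lands on deviceB
        // iteration 3: shard 0, [1 MB, 1.5 MB)  -> first half of chunk 2, back on deviceA
        start = next;
    }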
+ +using Microsoft.Win32.SafeHandles; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Runtime.InteropServices; +using System.Threading; +using System.Threading.Tasks; + +namespace FASTER.core +{ + /// + /// + /// + public abstract class StorageDeviceBase : IDevice + { + + /// + /// + /// + public uint SectorSize { get; } + + /// + /// + /// + public string FileName { get; } + + /// + /// + /// + public long Capacity { get; } + + /// + /// + /// + public int StartSegment { get { return startSegment; } } + + /// + /// + /// + public int EndSegment { get { return endSegment; } } + + /// + /// + /// + public long SegmentSize { get { return segmentSize; } } + + /// + /// Segment size + /// + protected long segmentSize; + + private int segmentSizeBits; + private ulong segmentSizeMask; + + /// + /// Instance of the epoch protection framework in the current system. + /// A device may have internal in-memory data structure that requires epoch protection under concurrent access. + /// + protected LightEpoch epoch; + + /// + /// start and end segment corresponding to and . Subclasses are + /// allowed to modify these as needed. + /// + protected int startSegment = 0, endSegment = -1; + + /// + /// Initializes a new StorageDeviceBase + /// + /// Name of the file to use + /// The smallest unit of write of the underlying storage device (e.g. 512 bytes for a disk) + /// The maximal number of bytes this storage device can accommondate, or CAPAPCITY_UNSPECIFIED if there is no such limit + public StorageDeviceBase(string filename, uint sectorSize, long capacity) + { + FileName = filename; + SectorSize = sectorSize; + + segmentSize = -1; + segmentSizeBits = 64; + segmentSizeMask = ~0UL; + + Capacity = capacity; + } + + /// + /// Initialize device + /// + /// + /// + public virtual void Initialize(long segmentSize, LightEpoch epoch = null) + { + Debug.Assert(Capacity == -1 || Capacity % segmentSize == 0, "capacity must be a multiple of segment sizes"); + this.segmentSize = segmentSize; + this.epoch = epoch; + if (!Utility.IsPowerOfTwo(segmentSize)) + { + if (segmentSize != -1) + throw new Exception("Invalid segment size: " + segmentSize); + segmentSizeBits = 64; + segmentSizeMask = ~0UL; + } + else + { + segmentSizeBits = Utility.GetLogBase2((ulong)segmentSize); + segmentSizeMask = (ulong)segmentSize - 1; + } + } + + /// + /// + /// + /// + /// + /// + /// + /// + public void WriteAsync(IntPtr alignedSourceAddress, ulong alignedDestinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult) + { + int segment = (int)(segmentSizeBits < 64 ? alignedDestinationAddress >> segmentSizeBits : 0); + + // If the device has bounded space, and we are writing a new segment, need to check whether an existing segment needs to be evicted. + if (Capacity != Devices.CAPACITY_UNSPECIFIED && Utility.MonotonicUpdate(ref endSegment, segment, out int oldEnd)) + { + // Attempt to update the stored range until there are enough space on the tier to accomodate the current logTail + int newStartSegment = endSegment - (int)(Capacity >> segmentSizeBits); + // Assuming that we still have enough physical capacity to write another segment, even if delete does not immediately free up space. 
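+                // Keep only the most recent (Capacity >> segmentSizeBits) segments: everything below
+                // newStartSegment is scheduled for removal with a no-op callback, so the write below
+                // proceeds without waiting for the eviction to finish.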
+ TruncateUntilSegmentAsync(newStartSegment, r => { }, null); + } + WriteAsync( + alignedSourceAddress, + segment, + alignedDestinationAddress & segmentSizeMask, + numBytesToWrite, callback, asyncResult); + } + + /// + /// + /// + /// + /// + /// + /// + /// + public void ReadAsync(ulong alignedSourceAddress, IntPtr alignedDestinationAddress, uint aligned_read_length, IOCompletionCallback callback, IAsyncResult asyncResult) + { + var segment = segmentSizeBits < 64 ? alignedSourceAddress >> segmentSizeBits : 0; + + ReadAsync( + (int)segment, + alignedSourceAddress & segmentSizeMask, + alignedDestinationAddress, + aligned_read_length, callback, asyncResult); + } + + /// + /// + /// + /// + /// + /// + public abstract void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result); + + /// + /// + /// By default the implementation calls into + /// + /// + public virtual void RemoveSegment(int segment) + { + ManualResetEventSlim completionEvent = new ManualResetEventSlim(false); + RemoveSegmentAsync(segment, r => completionEvent.Set(), null); + completionEvent.Wait(); + } + + /// + /// + /// + /// + /// + /// + public void TruncateUntilSegmentAsync(int toSegment, AsyncCallback callback, IAsyncResult result) + { + // Reset begin range to at least toAddress + if (!Utility.MonotonicUpdate(ref startSegment, toSegment, out int oldStart)) + { + // If no-op, invoke callback and return immediately + callback(result); + return; + } + CountdownEvent countdown = new CountdownEvent(toSegment - oldStart); + // This action needs to be epoch-protected because readers may be issuing reads to the deleted segment, unaware of the delete. + // Because of earlier compare-and-swap, the caller has exclusive access to the range [oldStartSegment, newStartSegment), and there will + // be no double deletes. 
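+            // Deferring the actual removals to the next epoch boundary guarantees that readers which
+            // acquired protection before this truncation drain first; the countdown then forwards the
+            // caller's callback once every per-segment removal has completed.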
+ epoch.BumpCurrentEpoch(() => + { + for (int i = oldStart; i < toSegment; i++) + { + RemoveSegmentAsync(i, r => { + if (countdown.Signal()) + { + callback(r); + countdown.Dispose(); + } + }, result); + } + }); + } + + /// + /// + /// + /// + public void TruncateUntilSegment(int toSegment) + { + using (ManualResetEventSlim completionEvent = new ManualResetEventSlim(false)) + { + TruncateUntilSegmentAsync(toSegment, r => completionEvent.Set(), null); + completionEvent.Wait(); + } + } + + /// + /// + /// + /// + /// + /// + public virtual void TruncateUntilAddressAsync(long toAddress, AsyncCallback callback, IAsyncResult result) + { + // Truncate only up to segment boundary if address is not aligned + TruncateUntilSegmentAsync((int)(toAddress >> segmentSizeBits), callback, result); + } + + /// + /// + /// + /// + public virtual void TruncateUntilAddress(long toAddress) + { + using (ManualResetEventSlim completionEvent = new ManualResetEventSlim(false)) + { + TruncateUntilAddressAsync(toAddress, r => completionEvent.Set(), null); + completionEvent.Wait(); + } + } + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public abstract void WriteAsync(IntPtr sourceAddress, int segmentId, ulong destinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult); + + /// + /// + /// + /// + /// + /// + /// + /// + /// + public abstract void ReadAsync(int segmentId, ulong sourceAddress, IntPtr destinationAddress, uint readLength, IOCompletionCallback callback, IAsyncResult asyncResult); + + /// + /// + /// + public abstract void Close(); + } +} diff --git a/ZeroLevel/Services/FASTER/Device/TieredStorageDevice.cs b/ZeroLevel/Services/FASTER/Device/TieredStorageDevice.cs new file mode 100644 index 0000000..5640054 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Device/TieredStorageDevice.cs @@ -0,0 +1,176 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Diagnostics; +using System.Threading; +using System.ComponentModel; +using System.Collections.Concurrent; + +namespace FASTER.core +{ + /// + /// A logically composes multiple into a single storage device. It is assumed + /// that some are used as caches while there is one that is considered the commit point, i.e. when a write is completed + /// on the device, it is considered persistent. Reads are served from the closest device with available data. Writes are issued in parallel to + /// all devices + /// + class TieredStorageDevice : StorageDeviceBase + { + private readonly IList devices; + private readonly int commitPoint; + + /// + /// Constructs a new TieredStorageDevice composed of the given devices. + /// + /// + /// The index of an IDevice in . When a write has been completed on the device, + /// the write is considered persistent. It is guaranteed that the callback in + /// will not be called until the write is completed on the commit point device. + /// + /// + /// List of devices to be used. The list should be given in order of hot to cold. Read is served from the + /// device with smallest index in the list that has the requested data + /// + public TieredStorageDevice(int commitPoint, IList devices) : base(ComputeFileString(devices, commitPoint), 512, ComputeCapacity(devices)) + { + Debug.Assert(commitPoint >= 0 && commitPoint < devices.Count, "commit point is out of range"); + + this.devices = devices; + this.commitPoint = commitPoint; + } + + /// + /// Constructs a new TieredStorageDevice composed of the given devices. 
+ /// + /// + /// The index of an IDevice in devices. When a write has been completed on the device, + /// the write is considered persistent. It is guaranteed that the callback in + /// will not be called until the write is completed on commit point device and all previous tiers. + /// + /// + /// List of devices to be used. The list should be given in order of hot to cold. Read is served from the + /// device with smallest index in the list that has the requested data + /// + public TieredStorageDevice(int commitPoint, params IDevice[] devices) : this(commitPoint, (IList)devices) + { + } + + public override void Initialize(long segmentSize, LightEpoch epoch) + { + base.Initialize(segmentSize, epoch); + + foreach (IDevice devices in devices) + { + devices.Initialize(segmentSize, epoch); + } + } + + public override void Close() + { + foreach (IDevice device in devices) + { + device.Close(); + } + } + + public override void ReadAsync(int segmentId, ulong sourceAddress, IntPtr destinationAddress, uint readLength, IOCompletionCallback callback, IAsyncResult asyncResult) + { + // This device is epoch-protected and cannot be stale while the operation is in flight + IDevice closestDevice = devices[FindClosestDeviceContaining(segmentId)]; + // We can directly forward the address, because assuming an inclusive policy, all devices agree on the same address space. The only difference is that some segments may not + // be present for certain devices. + closestDevice.ReadAsync(segmentId, sourceAddress, destinationAddress, readLength, callback, asyncResult); + } + + public override unsafe void WriteAsync(IntPtr sourceAddress, int segmentId, ulong destinationAddress, uint numBytesToWrite, IOCompletionCallback callback, IAsyncResult asyncResult) + { + + int startTier = FindClosestDeviceContaining(segmentId); + Debug.Assert(startTier <= commitPoint, "Write should not elide the commit point"); + + var countdown = new CountdownEvent(commitPoint + 1); // number of devices to wait on + // Issue writes to all tiers in parallel + for (int i = startTier; i < devices.Count; i++) + { + if (i <= commitPoint) + { + + // All tiers before the commit point (incluisive) need to be persistent before the callback is invoked. 
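+                    // Only tiers at or below the commit point signal the shared countdown, so the caller's
+                    // callback is withheld until the commit-point tier has completed the write; colder tiers
+                    // (the else branch below) are written fire-and-forget.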
+ devices[i].WriteAsync(sourceAddress, segmentId, destinationAddress, numBytesToWrite, (e, n, o) => + { + // The last tier to finish invokes the callback + if (countdown.Signal()) + { + callback(e, n, o); + countdown.Dispose(); + } + + }, asyncResult); + } + else + { + // Otherwise, simply issue the write without caring about callbacks + devices[i].WriteAsync(sourceAddress, segmentId, destinationAddress, numBytesToWrite, (e, n, o) => { }, null); + } + } + } + + public override void RemoveSegmentAsync(int segment, AsyncCallback callback, IAsyncResult result) + { + int startTier = FindClosestDeviceContaining(segment); + var countdown = new CountdownEvent(devices.Count); + for(int i = startTier; i < devices.Count; i++) + { + devices[i].RemoveSegmentAsync(segment, r => + { + if (countdown.Signal()) + { + callback(r); + countdown.Dispose(); + } + }, result); + } + } + + private static long ComputeCapacity(IList devices) + { + long result = 0; + // The capacity of a tiered storage device is the sum of the capacity of its tiers + foreach (IDevice device in devices) + { + // Unless the last tier device has unspecified storage capacity, in which case the tiered storage also has unspecified capacity + if (device.Capacity == Devices.CAPACITY_UNSPECIFIED) + { + Debug.Assert(device == devices[devices.Count - 1], "Only the last tier storage of a tiered storage device can have unspecified capacity"); + return Devices.CAPACITY_UNSPECIFIED; + } + result = Math.Max(result, device.Capacity); + } + return result; + } + + private static string ComputeFileString(IList devices, int commitPoint) + { + StringBuilder result = new StringBuilder(); + foreach (IDevice device in devices) + { + string formatString = "{0}, file name {1}, capacity {2} bytes;"; + string capacity = device.Capacity == Devices.CAPACITY_UNSPECIFIED ? "unspecified" : device.Capacity.ToString(); + result.AppendFormat(formatString, device.GetType().Name, device.FileName, capacity); + } + result.AppendFormat("commit point: {0} at tier {1}", devices[commitPoint].GetType().Name, commitPoint); + return result.ToString(); + } + + private int FindClosestDeviceContaining(int segment) + { + // Can use binary search, but 1) it might not be faster than linear on a array assumed small, and 2) C# built in does not guarantee first element is returned on duplicates. + // Therefore we are sticking to the simpler approach at first. + for (int i = 0; i < devices.Count; i++) + { + if (devices[i].StartSegment <= segment) return i; + } + throw new ArgumentException("No such address exists"); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Epochs/FastThreadLocal.cs b/ZeroLevel/Services/FASTER/Epochs/FastThreadLocal.cs new file mode 100644 index 0000000..7bc2e2e --- /dev/null +++ b/ZeroLevel/Services/FASTER/Epochs/FastThreadLocal.cs @@ -0,0 +1,81 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
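As a rough usage sketch of the two composite devices above: ShardedStorageDevice, TieredStorageDevice, and UniformPartitionScheme are internal, so this composition is only possible from inside the assembly, and the file paths and the Devices.CreateLogDevice helper with its capacity argument are assumptions made for illustration.

    // Two files striped in 1 MB chunks behind a single IDevice.
    IDevice shard0 = Devices.CreateLogDevice("shard0.log");
    IDevice shard1 = Devices.CreateLogDevice("shard1.log");
    IDevice striped = new ShardedStorageDevice(new UniformPartitionScheme(1L << 20, shard0, shard1));

    // A capacity-bounded hot tier in front of an unbounded cold tier; tier 1 is the commit point,
    // so a write is acknowledged only once the cold tier has completed it.
    IDevice hot = Devices.CreateLogDevice("hot.log", capacity: 1L << 30);
    IDevice cold = Devices.CreateLogDevice("cold.log");
    IDevice tiered = new TieredStorageDevice(commitPoint: 1, hot, cold);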
+ +using System; +using System.Net; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Fast implementation of instance-thread-local variables + /// + /// + internal class FastThreadLocal + { + // Max instances supported + private const int kMaxInstances = 128; + + [ThreadStatic] + private static T[] tl_values; + [ThreadStatic] + private static int[] tl_iid; + + private readonly int offset; + private readonly int iid; + + private static readonly int[] instances = new int[kMaxInstances]; + private static int instanceId = 0; + + public FastThreadLocal() + { + iid = Interlocked.Increment(ref instanceId); + + for (int i = 0; i < kMaxInstances; i++) + { + if (0 == Interlocked.CompareExchange(ref instances[i], iid, 0)) + { + offset = i; + return; + } + } + throw new Exception("Unsupported number of simultaneous instances"); + } + + public void InitializeThread() + { + if (tl_values == null) + { + tl_values = new T[kMaxInstances]; + tl_iid = new int[kMaxInstances]; + } + if (tl_iid[offset] != iid) + { + tl_iid[offset] = iid; + tl_values[offset] = default(T); + } + } + + public void DisposeThread() + { + tl_values[offset] = default(T); + tl_iid[offset] = 0; + } + + /// + /// Dispose instance for all threads + /// + public void Dispose() + { + instances[offset] = 0; + } + + public T Value + { + get => tl_values[offset]; + set => tl_values[offset] = value; + } + + public bool IsInitializedForThread => (tl_values != null) && (iid == tl_iid[offset]); + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Epochs/LightEpoch.cs b/ZeroLevel/Services/FASTER/Epochs/LightEpoch.cs new file mode 100644 index 0000000..5d7b789 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Epochs/LightEpoch.cs @@ -0,0 +1,450 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Threading; +using System.Runtime.InteropServices; +using System.Runtime.CompilerServices; +using System.Diagnostics; + +namespace FASTER.core +{ + /// + /// Epoch protection + /// + public unsafe class LightEpoch + { + /// + /// Default invalid index entry. + /// + private const int kInvalidIndex = 0; + + /// + /// Default number of entries in the entries table + /// + public const int kTableSize = 128; + + /// + /// Default drainlist size + /// + private const int kDrainListSize = 16; + + /// + /// Thread protection status entries. + /// + private Entry[] tableRaw; + private GCHandle tableHandle; + private Entry* tableAligned; + + private static Entry[] threadIndex; + private static GCHandle threadIndexHandle; + private static Entry* threadIndexAligned; + + /// + /// List of action, epoch pairs containing actions to performed + /// when an epoch becomes safe to reclaim. + /// + private int drainCount = 0; + private readonly EpochActionPair[] drainList = new EpochActionPair[kDrainListSize]; + + /// + /// A thread's entry in the epoch table. 
+ /// + [ThreadStatic] + private static int threadEntryIndex; + + /// + /// Number of instances using this entry + /// + [ThreadStatic] + private static int threadEntryIndexCount; + + [ThreadStatic] + static int threadId; + + /// + /// Global current epoch value + /// + public int CurrentEpoch; + + /// + /// Cached value of latest epoch that is safe to reclaim + /// + public int SafeToReclaimEpoch; + + /// + /// Static constructor to setup shared cache-aligned space + /// to store per-entry count of instances using that entry + /// + static LightEpoch() + { + // Over-allocate to do cache-line alignment + threadIndex = new Entry[kTableSize + 2]; + threadIndexHandle = GCHandle.Alloc(threadIndex, GCHandleType.Pinned); + long p = (long)threadIndexHandle.AddrOfPinnedObject(); + + // Force the pointer to align to 64-byte boundaries + long p2 = (p + (Constants.kCacheLineBytes - 1)) & ~(Constants.kCacheLineBytes - 1); + threadIndexAligned = (Entry*)p2; + } + + /// + /// Instantiate the epoch table + /// + public LightEpoch() + { + // Over-allocate to do cache-line alignment + tableRaw = new Entry[kTableSize + 2]; + tableHandle = GCHandle.Alloc(tableRaw, GCHandleType.Pinned); + long p = (long)tableHandle.AddrOfPinnedObject(); + + // Force the pointer to align to 64-byte boundaries + long p2 = (p + (Constants.kCacheLineBytes - 1)) & ~(Constants.kCacheLineBytes - 1); + tableAligned = (Entry*)p2; + + CurrentEpoch = 1; + SafeToReclaimEpoch = 0; + + for (int i = 0; i < kDrainListSize; i++) + drainList[i].epoch = int.MaxValue; + drainCount = 0; + } + + /// + /// Clean up epoch table + /// + public void Dispose() + { + tableHandle.Free(); + tableAligned = null; + tableRaw = null; + CurrentEpoch = 1; + SafeToReclaimEpoch = 0; + } + + /// + /// Check whether current thread is protected + /// + /// Result of the check + public bool IsProtected() + { + return kInvalidIndex != threadEntryIndex; + } + + /// + /// Enter the thread into the protected code region + /// + /// Current epoch + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int ProtectAndDrain() + { + int entry = threadEntryIndex; + + (*(tableAligned + entry)).threadId = threadEntryIndex; + (*(tableAligned + entry)).localCurrentEpoch = CurrentEpoch; + + if (drainCount > 0) + { + Drain((*(tableAligned + entry)).localCurrentEpoch); + } + + return (*(tableAligned + entry)).localCurrentEpoch; + } + + /// + /// Check and invoke trigger actions that are ready + /// + /// Next epoch + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void Drain(int nextEpoch) + { + ComputeNewSafeToReclaimEpoch(nextEpoch); + + for (int i = 0; i < kDrainListSize; i++) + { + var trigger_epoch = drainList[i].epoch; + + if (trigger_epoch <= SafeToReclaimEpoch) + { + if (Interlocked.CompareExchange(ref drainList[i].epoch, int.MaxValue - 1, trigger_epoch) == trigger_epoch) + { + var trigger_action = drainList[i].action; + drainList[i].action = null; + drainList[i].epoch = int.MaxValue; + trigger_action(); + if (Interlocked.Decrement(ref drainCount) == 0) break; + } + } + } + } + + /// + /// Thread acquires its epoch entry + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Acquire() + { + if (threadEntryIndex == kInvalidIndex) + threadEntryIndex = ReserveEntryForThread(); + threadEntryIndexCount++; + } + + + /// + /// Thread releases its epoch entry + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Release() + { + int entry = threadEntryIndex; + (*(tableAligned + entry)).localCurrentEpoch = 0; + (*(tableAligned + 
entry)).threadId = 0; + + threadEntryIndexCount--; + if (threadEntryIndexCount == 0) + { + (threadIndexAligned + threadEntryIndex)->threadId = 0; + threadEntryIndex = kInvalidIndex; + } + } + + /// + /// Thread suspends its epoch entry + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Suspend() + { + Release(); + } + + /// + /// Thread resumes its epoch entry + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Resume() + { + Acquire(); + ProtectAndDrain(); + } + + /// + /// Increment global current epoch + /// + /// + public int BumpCurrentEpoch() + { + int nextEpoch = Interlocked.Add(ref CurrentEpoch, 1); + + if (drainCount > 0) + Drain(nextEpoch); + + return nextEpoch; + } + + /// + /// Increment current epoch and associate trigger action + /// with the prior epoch + /// + /// Trigger action + /// + public int BumpCurrentEpoch(Action onDrain) + { + int PriorEpoch = BumpCurrentEpoch() - 1; + + int i = 0, j = 0; + while (true) + { + if (drainList[i].epoch == int.MaxValue) + { + if (Interlocked.CompareExchange(ref drainList[i].epoch, int.MaxValue - 1, int.MaxValue) == int.MaxValue) + { + drainList[i].action = onDrain; + drainList[i].epoch = PriorEpoch; + Interlocked.Increment(ref drainCount); + break; + } + } + else + { + var triggerEpoch = drainList[i].epoch; + + if (triggerEpoch <= SafeToReclaimEpoch) + { + if (Interlocked.CompareExchange(ref drainList[i].epoch, int.MaxValue - 1, triggerEpoch) == triggerEpoch) + { + var triggerAction = drainList[i].action; + drainList[i].action = onDrain; + drainList[i].epoch = PriorEpoch; + triggerAction(); + break; + } + } + } + + if (++i == kDrainListSize) + { + ProtectAndDrain(); + i = 0; + if (++j == 500) + { + j = 0; + Debug.WriteLine("Delay finding a free entry in the drain list"); + } + } + } + + ProtectAndDrain(); + + return PriorEpoch + 1; + } + + /// + /// Looks at all threads and return the latest safe epoch + /// + /// Current epoch + /// Safe epoch + private int ComputeNewSafeToReclaimEpoch(int currentEpoch) + { + int oldestOngoingCall = currentEpoch; + + for (int index = 1; index <= kTableSize; ++index) + { + int entry_epoch = (*(tableAligned + index)).localCurrentEpoch; + if (0 != entry_epoch) + { + if (entry_epoch < oldestOngoingCall) + { + oldestOngoingCall = entry_epoch; + } + } + } + + // The latest safe epoch is the one just before + // the earliest unsafe epoch. + SafeToReclaimEpoch = oldestOngoingCall - 1; + return SafeToReclaimEpoch; + } + + /// + /// Reserve entry for thread. This method relies on the fact that no + /// thread will ever have ID 0. + /// + /// Start index + /// Thread id + /// Reserved entry + private static int ReserveEntry(int startIndex, int threadId) + { + int current_iteration = 0; + for (; ; ) + { + // Reserve an entry in the table. + for (int i = 0; i < kTableSize; ++i) + { + int index_to_test = 1 + ((startIndex + i) & (kTableSize - 1)); + if (0 == (threadIndexAligned + index_to_test)->threadId) + { + bool success = + (0 == Interlocked.CompareExchange( + ref (threadIndexAligned+index_to_test)->threadId, + threadId, 0)); + + if (success) + { + return (int)index_to_test; + } + } + ++current_iteration; + } + + if (current_iteration > (kTableSize * 10)) + { + throw new Exception("Unable to reserve an epoch entry, try increasing the epoch table size (kTableSize)"); + } + } + } + + /// + /// Allocate a new entry in epoch table. This is called + /// once for a thread. 
+ /// + /// Reserved entry + private static int ReserveEntryForThread() + { + if (threadId == 0) // run once per thread for performance + { + // For portability(run on non-windows platform) + threadId = Environment.OSVersion.Platform == PlatformID.Win32NT ? (int)Native32.GetCurrentThreadId() : Thread.CurrentThread.ManagedThreadId; + } + int startIndex = Utility.Murmur3(threadId); + return ReserveEntry(startIndex, threadId); + } + + /// + /// Epoch table entry (cache line size). + /// + [StructLayout(LayoutKind.Explicit, Size = Constants.kCacheLineBytes)] + private struct Entry + { + + /// + /// Thread-local value of epoch + /// + [FieldOffset(0)] + public int localCurrentEpoch; + + /// + /// ID of thread associated with this entry. + /// + [FieldOffset(4)] + public int threadId; + + [FieldOffset(8)] + public int reentrant; + + [FieldOffset(12)] + public fixed int markers[13]; + }; + + private struct EpochActionPair + { + public long epoch; + public Action action; + } + + /// + /// Mechanism for threads to mark some activity as completed until + /// some version by this thread, and check if all active threads + /// have completed the same activity until that version. + /// + /// ID of activity + /// Version + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool MarkAndCheckIsComplete(int markerIdx, int version) + { + int entry = threadEntryIndex; + if (kInvalidIndex == entry) + { + Debug.WriteLine("New Thread entered during CPR"); + Debug.Assert(false); + } + + (*(tableAligned + entry)).markers[markerIdx] = version; + + // check if all threads have reported complete + for (int index = 1; index <= kTableSize; ++index) + { + int entry_epoch = (*(tableAligned + index)).localCurrentEpoch; + int fc_version = (*(tableAligned + index)).markers[markerIdx]; + if (0 != entry_epoch) + { + if (fc_version != version && entry_epoch < int.MaxValue) + { + return false; + } + } + } + return true; + } + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Common/AddressInfo.cs b/ZeroLevel/Services/FASTER/Index/Common/AddressInfo.cs new file mode 100644 index 0000000..a9a78a9 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Common/AddressInfo.cs @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
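The LightEpoch surface above reads most naturally as a usage pattern. A minimal sketch of how the allocators and devices in this patch drive it (the protected resource and the reclaim action are hypothetical):

    var epoch = new LightEpoch();

    epoch.Acquire();                  // register this thread's entry in the epoch table
    epoch.ProtectAndDrain();          // publish the current epoch before touching shared state
    // ... read or mutate epoch-protected structures ...

    epoch.BumpCurrentEpoch(() =>
    {
        // Runs only after every protected thread has advanced past the old epoch,
        // so it is safe to free, truncate, or close the shared resource here.
    });

    epoch.Suspend();                  // leave the protected region, e.g. before blocking I/O
    epoch.Resume();                   // re-enter: Acquire + ProtectAndDrain
    epoch.Release();                  // final exit for this thread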
+ +#pragma warning disable 1591 + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace FASTER.core +{ + /// + /// AddressInfo struct + /// + [StructLayout(LayoutKind.Explicit, Size = 8)] + public unsafe struct AddressInfo + { + private const int kMultiplierBits = 1; + private static readonly int kTotalBits = sizeof(IntPtr) * 8; + private static readonly int kAddressBits = 42*kTotalBits/64; + private static readonly int kSizeBits = kTotalBits - kAddressBits - kMultiplierBits; + private static readonly long kSizeMaskInWord = ((1L << kSizeBits) - 1) << kAddressBits; + private static readonly long kSizeMaskInInteger = (1L << kSizeBits) - 1; + private static readonly long kMultiplierMaskInWord = ((1L << kMultiplierBits) - 1) << (kAddressBits + kSizeBits); + private const long kMultiplierMaskInInteger = (1L << kMultiplierBits) - 1; + private static readonly long kAddressMask = (1L << kAddressBits) - 1; + + + [FieldOffset(0)] + private IntPtr word; + + public static void WriteInfo(AddressInfo* info, long address, long size) + { + info->word = default(IntPtr); + info->Address = address; + info->Size = size; + } + + public static string ToString(AddressInfo* info) + { + return "RecordHeader Word = " + info->word; + } + + public long Size + { + get + { + int multiplier = (int)((((long)word & kMultiplierMaskInWord) >> (kAddressBits + kSizeBits)) & kMultiplierMaskInInteger); + return (multiplier == 0 ? 512 : 1<<20)*((((long)word & kSizeMaskInWord) >> kAddressBits) & kSizeMaskInInteger); + } + set + { + int multiplier = 0; + int val = (int)(value >> 9); + if ((value & ((1<<9)-1)) != 0) val++; + + if (val >= (1 << kSizeBits)) + { + val = (int)(value >> 20); + if ((value & ((1<<20) - 1)) != 0) val++; + multiplier = 1; + if (val >= (1 << kSizeBits)) + { + throw new Exception("Unsupported object size: " + value); + } + } + var _word = (long)word; + _word &= ~kSizeMaskInWord; + _word &= ~kMultiplierMaskInWord; + _word |= (val & kSizeMaskInInteger) << kAddressBits; + _word |= (multiplier & kMultiplierMaskInInteger) << (kAddressBits + kSizeBits); + word = (IntPtr)_word; + } + } + + public long Address + { + get + { + return (long)word & kAddressMask; + } + set + { + var _word = (long)word; + _word &= ~kAddressMask; + _word |= (value & kAddressMask); + word = (IntPtr)_word; + if (value != Address) + { + throw new Exception("Overflow in AddressInfo" + ((kAddressBits < 64) ? " - consider running the program in x64 mode for larger address space support" : "")); + } + } + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/Common/CheckpointSettings.cs b/ZeroLevel/Services/FASTER/Index/Common/CheckpointSettings.cs new file mode 100644 index 0000000..6e20f6d --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Common/CheckpointSettings.cs @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
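A worked example of the AddressInfo packing above, assuming the x64 layout (42 address bits, 21 size bits, one multiplier bit). Sizes are rounded up to 512-byte units, and only when the rounded count no longer fits in 21 bits (roughly 1 GiB) does the multiplier bit switch the unit to 1 MiB:

    unsafe
    {
        AddressInfo info = default(AddressInfo);
        AddressInfo.WriteInfo(&info, address: 123456, size: 5000);
        long addr = info.Address;   // 123456
        long size = info.Size;      // 5120 = 10 * 512 (5000 rounded up to the next 512-byte unit)
    }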
+ + +using System; + +namespace FASTER.core +{ + /// + /// Checkpoint type + /// + public enum CheckpointType + { + /// + /// Take separate snapshot of in-memory portion of log (default) + /// + Snapshot, + + /// + /// Flush current log (move read-only to tail) + /// (enables incremental checkpointing, but log grows faster) + /// + FoldOver + } + + /// + /// Checkpoint-related settings + /// + public class CheckpointSettings + { + /// + /// Checkpoint manager + /// + public ICheckpointManager CheckpointManager = null; + + /// + /// Type of checkpoint + /// + public CheckpointType CheckPointType = CheckpointType.Snapshot; + + /// + /// Use specified directory for storing and retrieving checkpoints + /// This is a shortcut to providing the following: + /// CheckpointSettings.CheckpointManager = new LocalCheckpointManager(CheckpointDir) + /// + public string CheckpointDir = null; + } +} diff --git a/ZeroLevel/Services/FASTER/Index/Common/Contexts.cs b/ZeroLevel/Services/FASTER/Index/Common/Contexts.cs new file mode 100644 index 0000000..c304e09 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Common/Contexts.cs @@ -0,0 +1,479 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.core +{ + internal enum OperationType + { + READ, + RMW, + UPSERT, + INSERT, + DELETE + } + + internal enum OperationStatus + { + SUCCESS, + NOTFOUND, + RETRY_NOW, + RETRY_LATER, + RECORD_ON_DISK, + SUCCESS_UNMARK, + CPR_SHIFT_DETECTED, + CPR_PENDING_DETECTED + } + + internal class SerializedFasterExecutionContext + { + public int version; + public long serialNum; + public Guid guid; + + public void Write(StreamWriter writer) + { + writer.WriteLine(version); + writer.WriteLine(guid); + writer.WriteLine(serialNum); + } + + public void Load(StreamReader reader) + { + string value = reader.ReadLine(); + version = int.Parse(value); + + value = reader.ReadLine(); + guid = Guid.Parse(value); + + value = reader.ReadLine(); + serialNum = long.Parse(value); + } + } + + public unsafe partial class FasterKV : FasterBase, IFasterKV + where Key : new() + where Value : new() + where Functions : IFunctions + { + + internal struct PendingContext + { + // User provided information + + public OperationType type; + + public IHeapContainer key; + public IHeapContainer value; + public Input input; + public Output output; + public Context userContext; + + // Some additional information about the previous attempt + + public long id; + + public int version; + + public long logicalAddress; + + public long serialNum; + + public HashBucketEntry entry; + + public void Dispose() + { + key?.Dispose(); + value?.Dispose(); + } + } + + internal class FasterExecutionContext : SerializedFasterExecutionContext + { + public Phase phase; + public bool[] markers; + public long totalPending; + public Queue retryRequests; + public Dictionary ioPendingRequests; + public BlockingCollection> readyResponses; + } + } + + + /// + /// Recovery info for hybrid log + /// + public struct HybridLogRecoveryInfo + { + /// + /// Guid + /// + public Guid guid; + /// + /// Use snapshot file + /// + public int useSnapshotFile; + /// + /// Version + /// + public int version; + /// + /// Number of threads + /// + public int numThreads; + /// + /// Flushed logical address + /// + public long 
flushedLogicalAddress; + /// + /// Start logical address + /// + public long startLogicalAddress; + /// + /// Final logical address + /// + public long finalLogicalAddress; + /// + /// Head address + /// + public long headAddress; + /// + /// Begin address + /// + public long beginAddress; + /// + /// Guid array + /// + public Guid[] guids; + + /// + /// Tokens per guid restored during Continue + /// + public ConcurrentDictionary continueTokens; + + /// + /// Tokens per guid created during Checkpoint + /// + public ConcurrentDictionary checkpointTokens; + + /// + /// Object log segment offsets + /// + public long[] objectLogSegmentOffsets; + + /// + /// Initialize + /// + /// + /// + public void Initialize(Guid token, int _version) + { + guid = token; + useSnapshotFile = 0; + version = _version; + numThreads = 0; + flushedLogicalAddress = 0; + startLogicalAddress = 0; + finalLogicalAddress = 0; + headAddress = 0; + guids = new Guid[LightEpoch.kTableSize + 1]; + continueTokens = new ConcurrentDictionary(); + checkpointTokens = new ConcurrentDictionary(); + objectLogSegmentOffsets = null; + } + + /// + /// Initialize from stream + /// + /// + public void Initialize(StreamReader reader) + { + guids = new Guid[LightEpoch.kTableSize + 1]; + continueTokens = new ConcurrentDictionary(); + + string value = reader.ReadLine(); + guid = Guid.Parse(value); + + value = reader.ReadLine(); + useSnapshotFile = int.Parse(value); + + value = reader.ReadLine(); + version = int.Parse(value); + + value = reader.ReadLine(); + flushedLogicalAddress = long.Parse(value); + + value = reader.ReadLine(); + startLogicalAddress = long.Parse(value); + + value = reader.ReadLine(); + finalLogicalAddress = long.Parse(value); + + value = reader.ReadLine(); + headAddress = long.Parse(value); + + value = reader.ReadLine(); + beginAddress = long.Parse(value); + + value = reader.ReadLine(); + numThreads = int.Parse(value); + + for (int i = 0; i < numThreads; i++) + { + value = reader.ReadLine(); + guids[i] = Guid.Parse(value); + value = reader.ReadLine(); + var serialno = long.Parse(value); + continueTokens.TryAdd(guids[i], serialno); + } + + // Read object log segment offsets + value = reader.ReadLine(); + var numSegments = int.Parse(value); + if (numSegments > 0) + { + objectLogSegmentOffsets = new long[numSegments]; + for (int i = 0; i < numSegments; i++) + { + value = reader.ReadLine(); + objectLogSegmentOffsets[i] = long.Parse(value); + } + } + } + + /// + /// Recover info from token + /// + /// + /// + /// + internal void Recover(Guid token, ICheckpointManager checkpointManager) + { + var metadata = checkpointManager.GetLogCommitMetadata(token); + if (metadata == null) + throw new Exception("Invalid log commit metadata for ID " + token.ToString()); + + Initialize(new StreamReader(new MemoryStream(metadata))); + } + + /// + /// Reset + /// + public void Reset() + { + Initialize(default(Guid), -1); + } + + /// + /// Write info to byte array + /// + public byte[] ToByteArray() + { + using (var ms = new MemoryStream()) + { + using (StreamWriter writer = new StreamWriter(ms)) + { + writer.WriteLine(guid); + writer.WriteLine(useSnapshotFile); + writer.WriteLine(version); + writer.WriteLine(flushedLogicalAddress); + writer.WriteLine(startLogicalAddress); + writer.WriteLine(finalLogicalAddress); + writer.WriteLine(headAddress); + writer.WriteLine(beginAddress); + writer.WriteLine(numThreads); + for (int i = 0; i < numThreads; i++) + { + writer.WriteLine(guids[i]); + writer.WriteLine(checkpointTokens[guids[i]]); + } + + // Write 
object log segment offsets + writer.WriteLine(objectLogSegmentOffsets == null ? 0 : objectLogSegmentOffsets.Length); + if (objectLogSegmentOffsets != null) + { + for (int i = 0; i < objectLogSegmentOffsets.Length; i++) + { + writer.WriteLine(objectLogSegmentOffsets[i]); + } + } + } + return ms.ToArray(); + } + } + + /// + /// Print checkpoint info for debugging purposes + /// + public void DebugPrint() + { + Debug.WriteLine("******** HybridLog Checkpoint Info for {0} ********", guid); + Debug.WriteLine("Version: {0}", version); + Debug.WriteLine("Is Snapshot?: {0}", useSnapshotFile == 1); + Debug.WriteLine("Flushed LogicalAddress: {0}", flushedLogicalAddress); + Debug.WriteLine("Start Logical Address: {0}", startLogicalAddress); + Debug.WriteLine("Final Logical Address: {0}", finalLogicalAddress); + Debug.WriteLine("Head Address: {0}", headAddress); + Debug.WriteLine("Begin Address: {0}", beginAddress); + Debug.WriteLine("Num sessions recovered: {0}", numThreads); + Debug.WriteLine("Recovered sessions: "); + foreach (var sessionInfo in continueTokens) + { + Debug.WriteLine("{0}: {1}", sessionInfo.Key, sessionInfo.Value); + } + } + } + + internal struct HybridLogCheckpointInfo + { + public HybridLogRecoveryInfo info; + public IDevice snapshotFileDevice; + public IDevice snapshotFileObjectLogDevice; + public CountdownEvent flushed; + public long started; + + public void Initialize(Guid token, int _version, ICheckpointManager checkpointManager) + { + info.Initialize(token, _version); + started = 0; + checkpointManager.InitializeLogCheckpoint(token); + } + + public void Recover(Guid token, ICheckpointManager checkpointManager) + { + info.Recover(token, checkpointManager); + started = 0; + } + + public void Reset() + { + started = 0; + flushed = null; + info.Reset(); + if (snapshotFileDevice != null) snapshotFileDevice.Close(); + if (snapshotFileObjectLogDevice != null) snapshotFileObjectLogDevice.Close(); + } + } + + internal struct IndexRecoveryInfo + { + public Guid token; + public long table_size; + public ulong num_ht_bytes; + public ulong num_ofb_bytes; + public int num_buckets; + public long startLogicalAddress; + public long finalLogicalAddress; + + public void Initialize(Guid token, long _size) + { + this.token = token; + table_size = _size; + num_ht_bytes = 0; + num_ofb_bytes = 0; + startLogicalAddress = 0; + finalLogicalAddress = 0; + num_buckets = 0; + } + public void Initialize(StreamReader reader) + { + string value = reader.ReadLine(); + token = Guid.Parse(value); + + value = reader.ReadLine(); + table_size = long.Parse(value); + + value = reader.ReadLine(); + num_ht_bytes = ulong.Parse(value); + + value = reader.ReadLine(); + num_ofb_bytes = ulong.Parse(value); + + value = reader.ReadLine(); + num_buckets = int.Parse(value); + + value = reader.ReadLine(); + startLogicalAddress = long.Parse(value); + + value = reader.ReadLine(); + finalLogicalAddress = long.Parse(value); + } + + public void Recover(Guid guid, ICheckpointManager checkpointManager) + { + var metadata = checkpointManager.GetIndexCommitMetadata(guid); + if (metadata == null) + throw new Exception("Invalid index commit metadata for ID " + guid.ToString()); + Initialize(new StreamReader(new MemoryStream(metadata))); + } + + public byte[] ToByteArray() + { + using (var ms = new MemoryStream()) + { + using (var writer = new StreamWriter(ms)) + { + + writer.WriteLine(token); + writer.WriteLine(table_size); + writer.WriteLine(num_ht_bytes); + writer.WriteLine(num_ofb_bytes); + writer.WriteLine(num_buckets); + 
writer.WriteLine(startLogicalAddress); + writer.WriteLine(finalLogicalAddress); + } + return ms.ToArray(); + } + } + + public void DebugPrint() + { + Debug.WriteLine("******** Index Checkpoint Info for {0} ********", token); + Debug.WriteLine("Table Size: {0}", table_size); + Debug.WriteLine("Main Table Size (in GB): {0}", ((double)num_ht_bytes) / 1000.0 / 1000.0 / 1000.0); + Debug.WriteLine("Overflow Table Size (in GB): {0}", ((double)num_ofb_bytes) / 1000.0 / 1000.0 / 1000.0); + Debug.WriteLine("Num Buckets: {0}", num_buckets); + Debug.WriteLine("Start Logical Address: {0}", startLogicalAddress); + Debug.WriteLine("Final Logical Address: {0}", finalLogicalAddress); + } + public void Reset() + { + token = default(Guid); + table_size = 0; + num_ht_bytes = 0; + num_ofb_bytes = 0; + num_buckets = 0; + startLogicalAddress = 0; + finalLogicalAddress = 0; + } + } + + internal struct IndexCheckpointInfo + { + public IndexRecoveryInfo info; + public IDevice main_ht_device; + + public void Initialize(Guid token, long _size, ICheckpointManager checkpointManager) + { + info.Initialize(token, _size); + checkpointManager.InitializeIndexCheckpoint(token); + main_ht_device = checkpointManager.GetIndexDevice(token); + } + public void Recover(Guid token, ICheckpointManager checkpointManager) + { + info.Recover(token, checkpointManager); + } + public void Reset() + { + info.Reset(); + main_ht_device.Close(); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/Common/HeapContainer.cs b/ZeroLevel/Services/FASTER/Index/Common/HeapContainer.cs new file mode 100644 index 0000000..34340dd --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Common/HeapContainer.cs @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + + +using System; +using System.Runtime.CompilerServices; + +namespace FASTER.core +{ + /// + /// Heap container to store keys and values when they go pending + /// + /// + public interface IHeapContainer + { + /// + /// Get object + /// + /// + ref T Get(); + + /// + /// Dispose container + /// + void Dispose(); + } + + /// + /// Heap container for standard C# objects (non-variable-length) + /// + /// + internal class StandardHeapContainer : IHeapContainer + { + private T obj; + + public StandardHeapContainer(ref T obj) + { + this.obj = obj; + } + + public ref T Get() => ref obj; + + public void Dispose() { } + } + + /// + /// Heap container for variable length structs + /// + /// + internal class VarLenHeapContainer : IHeapContainer + { + private SectorAlignedMemory mem; + + + public unsafe VarLenHeapContainer(ref T obj, IVariableLengthStruct varLenStruct, SectorAlignedBufferPool pool) + { + var len = varLenStruct.GetLength(ref obj); + mem = pool.Get(len); + Buffer.MemoryCopy(Unsafe.AsPointer(ref obj), mem.GetValidPointer(), len, len); + } + + public unsafe ref T Get() + { + return ref Unsafe.AsRef(mem.GetValidPointer()); + } + + public void Dispose() + { + mem.Return(); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/Common/LogSettings.cs b/ZeroLevel/Services/FASTER/Index/Common/LogSettings.cs new file mode 100644 index 0000000..f6b59e3 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Common/LogSettings.cs @@ -0,0 +1,185 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
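For reference, ToByteArray and Initialize(StreamReader) in HybridLogRecoveryInfo above agree on a plain line-per-field text layout for the commit metadata, in this order:

    guid
    useSnapshotFile            (0 or 1)
    version
    flushedLogicalAddress
    startLogicalAddress
    finalLogicalAddress
    headAddress
    beginAddress
    numThreads
    session guid / serial num  (one pair per session; checkpointTokens on write, continueTokens on read)
    segment offset count       (0 when objectLogSegmentOffsets is null)
    segment offsets            (one per line, count times)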
+ + +using System; + +namespace FASTER.core +{ + /// + /// Configuration settings for serializing objects + /// + /// + /// + public class SerializerSettings + { + /// + /// Key serializer + /// + public Func> keySerializer; + + /// + /// Value serializer + /// + public Func> valueSerializer; + } + + /// + /// Interface for variable length in-place objects + /// modeled as structs, in FASTER + /// + /// + public interface IVariableLengthStruct + { + /// + /// Actual length of object + /// + /// + /// + int GetLength(ref T t); + + /// + /// Average length of objects, make sure this includes the object + /// header needed to compute the actual object length + /// + /// + int GetAverageLength(); + + /// + /// Initial length, when populating for RMW from given input + /// + /// + /// + /// + int GetInitialLength(ref Input input); + } + + + /// + /// Length specification for fixed size (normal) structs + /// + /// + public struct FixedLengthStruct : IVariableLengthStruct + { + private static readonly int size = Utility.GetSize(default(T)); + + /// + /// Get average length + /// + /// + public int GetAverageLength() + { + return size; + } + + /// + /// Get initial length + /// + /// + /// + /// + public int GetInitialLength(ref Input input) + { + return size; + } + + /// + /// Get length + /// + /// + /// + public int GetLength(ref T t) + { + return size; + } + } + + /// + /// Settings for variable length keys and values + /// + /// + /// + public class VariableLengthStructSettings + { + /// + /// Key length + /// + public IVariableLengthStruct keyLength; + + /// + /// Value length + /// + public IVariableLengthStruct valueLength; + } + + + /// + /// Configuration settings for hybrid log + /// + public class LogSettings + { + /// + /// Device used for main hybrid log + /// + public IDevice LogDevice = new NullDevice(); + + /// + /// Device used for serialized heap objects in hybrid log + /// + public IDevice ObjectLogDevice = new NullDevice(); + + /// + /// Size of a segment (group of pages), in bits + /// + public int PageSizeBits = 25; + + /// + /// Size of a segment (group of pages), in bits + /// + public int SegmentSizeBits = 30; + + /// + /// Total size of in-memory part of log, in bits + /// + public int MemorySizeBits = 34; + + /// + /// Fraction of log marked as mutable (in-place updates) + /// + public double MutableFraction = 0.9; + + /// + /// Copy reads to tail of log + /// + public bool CopyReadsToTail = false; + + /// + /// Settings for optional read cache + /// Overrides the "copy reads to tail" setting + /// + public ReadCacheSettings ReadCacheSettings = null; + } + + /// + /// Configuration settings for hybrid log + /// + public class ReadCacheSettings + { + /// + /// Size of a segment (group of pages), in bits + /// + public int PageSizeBits = 25; + + /// + /// Total size of in-memory part of log, in bits + /// + public int MemorySizeBits = 34; + + /// + /// Fraction of log head (in memory) used for second chance + /// copy to tail. This is (1 - MutableFraction) for the + /// underlying log + /// + public double SecondChanceFraction = 0.1; + } +} diff --git a/ZeroLevel/Services/FASTER/Index/Common/RecordInfo.cs b/ZeroLevel/Services/FASTER/Index/Common/RecordInfo.cs new file mode 100644 index 0000000..da475dc --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Common/RecordInfo.cs @@ -0,0 +1,243 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
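Because every size knob above is a power-of-two exponent, a short configuration sketch may help; the device factory call, the path, and the concrete values are illustrative rather than anything this patch prescribes:

    var logSettings = new LogSettings
    {
        LogDevice = Devices.CreateLogDevice("hlog.log"),  // main hybrid log device (path assumed)
        ObjectLogDevice = new NullDevice(),               // blittable key/value types need no object log
        PageSizeBits = 25,        // 2^25 = 32 MB pages
        SegmentSizeBits = 30,     // 2^30 = 1 GB per on-disk segment
        MemorySizeBits = 34,      // 2^34 = 16 GB of the log kept in memory
        MutableFraction = 0.9,    // tail 90% of the in-memory log accepts in-place updates
        ReadCacheSettings = new ReadCacheSettings
        {
            PageSizeBits = 25,
            MemorySizeBits = 30,            // 1 GB read cache
            SecondChanceFraction = 0.1
        }
    };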
+ +#pragma warning disable 1591 + +//#define RECORD_INFO_WITH_PIN_COUNT + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace FASTER.core +{ +#if RECORD_INFO_WITH_PIN_COUNT + [StructLayout(LayoutKind.Explicit, Size = 12)] +#else + [StructLayout(LayoutKind.Explicit, Size = 8)] +#endif + public unsafe struct RecordInfo + { + public const int kFinalBitOffset = 48; + + public const int kTombstoneBitOffset = 49; + + public const int kInvalidBitOffset = 50; + + public const int kVersionBits = 13; + + public const int kVersionShiftInWord = 51; + + public const long kVersionMaskInWord = ((1L << kVersionBits) - 1) << kVersionShiftInWord; + + public const long kVersionMaskInInteger = (1L << kVersionBits) - 1; + + public const long kPreviousAddressMask = (1L << 48) - 1; + + public const long kFinalBitMask = (1L << kFinalBitOffset); + + public const long kTombstoneMask = (1L << kTombstoneBitOffset); + + public const long kInvalidBitMask = (1L << kInvalidBitOffset); + +#if RECORD_INFO_WITH_PIN_COUNT + public const int kTotalSizeInBytes = sizeof(long) + sizeof(int); + + public const int kTotalBits = kTotalSizeInBytes * 8; + + [FieldOffset(0)] + private long word; + + [FieldOffset(sizeof(long))] + private int access_data; + + public static void WriteInfo(RecordInfo* info, int checkpointVersion, bool final, bool tombstone, bool invalidBit, long previousAddress) + { + info->word = default(long); + info->Final = final; + info->Tombstone = tombstone; + info->Invalid = invalidBit; + info->PreviousAddress = previousAddress; + info->Version = checkpointVersion; + info->access_data = 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryPin() + { + return Interlocked.Increment(ref access_data) > 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryMarkReadOnly() + { + return Interlocked.CompareExchange(ref access_data, int.MinValue, 0) == 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void MarkReadOnly() + { + var found_value = Interlocked.CompareExchange(ref access_data, int.MinValue, 0); + if (found_value != 0) + { + int num_iterations = 1000; + Thread.SpinWait(num_iterations); + while (Interlocked.CompareExchange(ref access_data, int.MinValue, 0) != 0) + { + Thread.SpinWait(num_iterations); + num_iterations <<= 1; + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Unpin() + { + Interlocked.Decrement(ref access_data); + } + +#else + public const int kTotalSizeInBytes = sizeof(long); + + public const int kTotalBits = kTotalSizeInBytes * 8; + + [FieldOffset(0)] + private long word; + + public static void WriteInfo(ref RecordInfo info, int checkpointVersion, bool final, bool tombstone, bool invalidBit, long previousAddress) + { + info.word = default(long); + info.Final = final; + info.Tombstone = tombstone; + info.Invalid = invalidBit; + info.PreviousAddress = previousAddress; + info.Version = checkpointVersion; + } + + + public static string ToString(RecordInfo* info) + { + return "RecordHeader Word = " + info->word; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryPin() + { + throw new InvalidOperationException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryMarkReadOnly() + { + throw new InvalidOperationException(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void MarkReadOnly() + { + throw new InvalidOperationException(); + } + + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Unpin() + { + throw new InvalidOperationException(); + } +#endif + public bool IsNull() + { + return word == 0; + } + + public bool Tombstone + { + get + { + return (word & kTombstoneMask) > 0; + } + + set + { + if (value) + { + word |= kTombstoneMask; + } + else + { + word &= ~kTombstoneMask; + } + } + } + + public bool Final + { + get + { + return (word & kFinalBitMask) > 0; + } + set + { + if (value) + { + word |= kFinalBitMask; + } + else + { + word &= ~kFinalBitMask; + } + } + } + + public bool Invalid + { + get + { + return !((word & kInvalidBitMask) > 0); + } + set + { + if (value) + { + word &= ~kInvalidBitMask; + } + else + { + word |= kInvalidBitMask; + } + } + } + + public int Version + { + get + { + return (int)(((word & kVersionMaskInWord) >> kVersionShiftInWord) & kVersionMaskInInteger); + } + set + { + word &= ~kVersionMaskInWord; + word |= ((value & kVersionMaskInInteger) << kVersionShiftInWord); + } + } + + public long PreviousAddress + { + get + { + return (word & kPreviousAddressMask); + } + set + { + word &= ~kPreviousAddressMask; + word |= (value & kPreviousAddressMask); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int GetLength() + { + return kTotalSizeInBytes; + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FASTER/Extensions.cs b/ZeroLevel/Services/FASTER/Index/FASTER/Extensions.cs new file mode 100644 index 0000000..d11f04b --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FASTER/Extensions.cs @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma warning disable 0162 + + +using System; + +namespace FASTER.core +{ + /// + /// Log subscription extensions + /// + public static class Extensions + { + /// + /// Create observable of log records + /// + /// + /// + /// + /// + public static IObservable> ToRecordObservable(this IObservable> source) + { + return new RecordObservable(source); + } + + internal class RecordObservable : IObservable> + { + IObservable> o; + + public RecordObservable(IObservable> o) + { + this.o = o; + } + + public IDisposable Subscribe(IObserver> observer) + { + return o.Subscribe(new RecordObserver(observer)); + } + } + + internal class RecordObserver : IObserver> + { + private IObserver> observer; + + public RecordObserver(IObserver> observer) + { + this.observer = observer; + } + + public void OnCompleted() + { + observer.OnCompleted(); + } + + public void OnError(Exception error) + { + observer.OnError(error); + } + + public void OnNext(IFasterScanIterator v) + { + while (v.GetNext(out RecordInfo info, out Key key, out Value value)) + { + observer.OnNext(new Record { info = info, key = key, value = value }); + } + v.Dispose(); + } + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FASTER/FASTER.cs b/ZeroLevel/Services/FASTER/Index/FASTER/FASTER.cs new file mode 100644 index 0000000..e222a35 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FASTER/FASTER.cs @@ -0,0 +1,488 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
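The constants above fully determine RecordInfo's single-word header; a short sketch of the default (non-pin-count) build:

    // 64-bit record header layout:
    //   bits  0..47  PreviousAddress  - 48-bit logical address of the previous record in the hash chain
    //   bit      48  Final
    //   bit      49  Tombstone        - record is a logical delete
    //   bit      50  Invalid          - stored inverted: the bit being set means the record is valid
    //   bits 51..63  Version          - 13-bit checkpoint version
    RecordInfo info = default(RecordInfo);
    RecordInfo.WriteInfo(ref info, checkpointVersion: 3, final: false, tombstone: false,
                         invalidBit: false, previousAddress: 0x1234);
    long prev = info.PreviousAddress;   // 0x1234
    int version = info.Version;         // 3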
+ +#pragma warning disable 0162 + +using System; +using System.Collections.Concurrent; +using System.Runtime.CompilerServices; + +namespace FASTER.core +{ + public unsafe partial class FasterKV : FasterBase, IFasterKV + where Key : new() + where Value : new() + where Functions : IFunctions + { + private readonly Functions functions; + private readonly AllocatorBase hlog; + private readonly AllocatorBase readcache; + private readonly IFasterEqualityComparer comparer; + + private readonly bool UseReadCache = false; + private readonly bool CopyReadsToTail = false; + private readonly bool FoldOverSnapshot = false; + private readonly int sectorSize; + + private readonly bool WriteDefaultOnDelete = false; + + /// + /// Number of used entries in hash index + /// + public long EntryCount => GetEntryCount(); + + /// + /// Size of index in #cache lines (64 bytes each) + /// + public long IndexSize => state[resizeInfo.version].size; + + /// + /// Comparer used by FASTER + /// + public IFasterEqualityComparer Comparer => comparer; + + /// + /// Hybrid log used by this FASTER instance + /// + public LogAccessor Log { get; } + + /// + /// Read cache used by this FASTER instance + /// + public LogAccessor ReadCache { get; } + + private enum CheckpointType + { + INDEX_ONLY, + HYBRID_LOG_ONLY, + FULL, + NONE + } + + private CheckpointType _checkpointType; + private Guid _indexCheckpointToken; + private Guid _hybridLogCheckpointToken; + private SystemState _systemState; + + private HybridLogCheckpointInfo _hybridLogCheckpoint; + + + private ConcurrentDictionary _recoveredSessions; + + private FastThreadLocal prevThreadCtx; + private FastThreadLocal threadCtx; + + + /// + /// Create FASTER instance + /// + /// Size of core index (#cache lines) + /// FASTER equality comparer for key + /// + /// Callback functions + /// Log settings + /// Checkpoint settings + /// Serializer settings + public FasterKV(long size, Functions functions, LogSettings logSettings, CheckpointSettings checkpointSettings = null, SerializerSettings serializerSettings = null, IFasterEqualityComparer comparer = null, VariableLengthStructSettings variableLengthStructSettings = null) + { + threadCtx = new FastThreadLocal(); + prevThreadCtx = new FastThreadLocal(); + + if (comparer != null) + this.comparer = comparer; + else + { + if (typeof(IFasterEqualityComparer).IsAssignableFrom(typeof(Key))) + { + this.comparer = new Key() as IFasterEqualityComparer; + } + else + { + Console.WriteLine("***WARNING*** Creating default FASTER key equality comparer based on potentially slow EqualityComparer.Default. To avoid this, provide a comparer (IFasterEqualityComparer) as an argument to FASTER's constructor, or make Key implement the interface IFasterEqualityComparer"); + this.comparer = FasterEqualityComparer.Default; + } + } + + if (checkpointSettings == null) + checkpointSettings = new CheckpointSettings(); + + if (checkpointSettings.CheckpointDir != null && checkpointSettings.CheckpointManager != null) + throw new Exception("Specify either CheckpointManager or CheckpointDir for CheckpointSettings, not both"); + + checkpointManager = checkpointSettings.CheckpointManager ?? new LocalCheckpointManager(checkpointSettings.CheckpointDir ?? 
""); + + FoldOverSnapshot = checkpointSettings.CheckPointType == core.CheckpointType.FoldOver; + CopyReadsToTail = logSettings.CopyReadsToTail; + this.functions = functions; + + if (logSettings.ReadCacheSettings != null) + { + CopyReadsToTail = false; + UseReadCache = true; + } + + if (Utility.IsBlittable() && Utility.IsBlittable()) + { + if (variableLengthStructSettings != null) + { + hlog = new VariableLengthBlittableAllocator(logSettings, variableLengthStructSettings, this.comparer, null, epoch); + Log = new LogAccessor(this, hlog); + if (UseReadCache) + { + readcache = new VariableLengthBlittableAllocator( + new LogSettings + { + PageSizeBits = logSettings.ReadCacheSettings.PageSizeBits, + MemorySizeBits = logSettings.ReadCacheSettings.MemorySizeBits, + SegmentSizeBits = logSettings.ReadCacheSettings.MemorySizeBits, + MutableFraction = 1 - logSettings.ReadCacheSettings.SecondChanceFraction + }, variableLengthStructSettings, this.comparer, ReadCacheEvict, epoch); + readcache.Initialize(); + ReadCache = new LogAccessor(this, readcache); + } + } + else + { + hlog = new BlittableAllocator(logSettings, this.comparer, null, epoch); + Log = new LogAccessor(this, hlog); + if (UseReadCache) + { + readcache = new BlittableAllocator( + new LogSettings + { + PageSizeBits = logSettings.ReadCacheSettings.PageSizeBits, + MemorySizeBits = logSettings.ReadCacheSettings.MemorySizeBits, + SegmentSizeBits = logSettings.ReadCacheSettings.MemorySizeBits, + MutableFraction = 1 - logSettings.ReadCacheSettings.SecondChanceFraction + }, this.comparer, ReadCacheEvict, epoch); + readcache.Initialize(); + ReadCache = new LogAccessor(this, readcache); + } + } + } + else + { + WriteDefaultOnDelete = true; + + hlog = new GenericAllocator(logSettings, serializerSettings, this.comparer, null, epoch); + Log = new LogAccessor(this, hlog); + if (UseReadCache) + { + readcache = new GenericAllocator( + new LogSettings + { + PageSizeBits = logSettings.ReadCacheSettings.PageSizeBits, + MemorySizeBits = logSettings.ReadCacheSettings.MemorySizeBits, + SegmentSizeBits = logSettings.ReadCacheSettings.MemorySizeBits, + MutableFraction = 1 - logSettings.ReadCacheSettings.SecondChanceFraction + }, serializerSettings, this.comparer, ReadCacheEvict, epoch); + readcache.Initialize(); + ReadCache = new LogAccessor(this, readcache); + } + } + + hlog.Initialize(); + + sectorSize = (int)logSettings.LogDevice.SectorSize; + Initialize(size, sectorSize); + + _systemState = default(SystemState); + _systemState.phase = Phase.REST; + _systemState.version = 1; + _checkpointType = CheckpointType.HYBRID_LOG_ONLY; + } + + /// + /// Take full checkpoint + /// + /// + /// + public bool TakeFullCheckpoint(out Guid token) + { + var success = InternalTakeCheckpoint(CheckpointType.FULL); + if (success) + { + token = _indexCheckpointToken; + } + else + { + token = default(Guid); + } + return success; + } + + /// + /// Take index checkpoint + /// + /// + /// + public bool TakeIndexCheckpoint(out Guid token) + { + var success = InternalTakeCheckpoint(CheckpointType.INDEX_ONLY); + if (success) + { + token = _indexCheckpointToken; + } + else + { + token = default(Guid); + } + return success; + } + + /// + /// Take hybrid log checkpoint + /// + /// + /// + public bool TakeHybridLogCheckpoint(out Guid token) + { + var success = InternalTakeCheckpoint(CheckpointType.HYBRID_LOG_ONLY); + if (success) + { + token = _hybridLogCheckpointToken; + } + else + { + token = default(Guid); + } + return success; + } + + /// + /// Recover from the latest checkpoints + /// + 
public void Recover() + { + InternalRecoverFromLatestCheckpoints(); + } + + /// + /// Recover + /// + /// + public void Recover(Guid fullCheckpointToken) + { + InternalRecover(fullCheckpointToken, fullCheckpointToken); + } + + /// + /// Recover + /// + /// + /// + public void Recover(Guid indexCheckpointToken, Guid hybridLogCheckpointToken) + { + InternalRecover(indexCheckpointToken, hybridLogCheckpointToken); + } + + /// + /// Start session with FASTER - call once per thread before using FASTER + /// + /// + public Guid StartSession() + { + return InternalAcquire(); + } + + + /// + /// Continue session with FASTER + /// + /// + /// + public long ContinueSession(Guid guid) + { + return InternalContinue(guid); + } + + /// + /// Stop session with FASTER + /// + public void StopSession() + { + InternalRelease(); + } + + /// + /// Refresh epoch (release memory pins) + /// + public void Refresh() + { + InternalRefresh(); + } + + + /// + /// Complete outstanding pending operations + /// + /// + /// + public bool CompletePending(bool wait = false) + { + return InternalCompletePending(wait); + } + + /// + /// Complete the ongoing checkpoint (if any) + /// + /// + /// + public bool CompleteCheckpoint(bool wait = false) + { + if (threadCtx == null) + { + // the thread does not have an active session + // we can wait until system state becomes REST + do + { + if (_systemState.phase == Phase.REST) + { + return true; + } + } while (wait); + } + else + { + // the thread does has an active session and + // so we need to constantly complete pending + // and refresh (done inside CompletePending) + // for the checkpoint to be proceed + do + { + CompletePending(); + if (_systemState.phase == Phase.REST) + { + CompletePending(); + return true; + } + } while (wait); + } + return false; + } + + /// + /// Read + /// + /// + /// + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Status Read(ref Key key, ref Input input, ref Output output, Context userContext, long monotonicSerialNum) + { + var context = default(PendingContext); + var internalStatus = InternalRead(ref key, ref input, ref output, ref userContext, ref context); + var status = default(Status); + if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND) + { + status = (Status)internalStatus; + } + else + { + status = HandleOperationStatus(threadCtx.Value, context, internalStatus); + } + threadCtx.Value.serialNum = monotonicSerialNum; + return status; + } + + /// + /// Upsert + /// + /// + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Status Upsert(ref Key key, ref Value desiredValue, Context userContext, long monotonicSerialNum) + { + var context = default(PendingContext); + var internalStatus = InternalUpsert(ref key, ref desiredValue, ref userContext, ref context); + var status = default(Status); + + if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND) + { + status = (Status)internalStatus; + } + else + { + status = HandleOperationStatus(threadCtx.Value, context, internalStatus); + } + threadCtx.Value.serialNum = monotonicSerialNum; + return status; + } + + /// + /// Read-modify-write + /// + /// + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Status RMW(ref Key key, ref Input input, Context userContext, long monotonicSerialNum) + { + var context = default(PendingContext); + var internalStatus = InternalRMW(ref key, ref input, ref userContext, ref context); + var 
status = default(Status); + if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND) + { + status = (Status)internalStatus; + } + else + { + status = HandleOperationStatus(threadCtx.Value, context, internalStatus); + } + threadCtx.Value.serialNum = monotonicSerialNum; + return status; + } + + /// + /// Delete entry (use tombstone if necessary) + /// Hash entry is removed as a best effort (if key is in memory and at + /// the head of hash chain. + /// Value is set to null (using ConcurrentWrite) if it is in mutable region + /// + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Status Delete(ref Key key, Context userContext, long monotonicSerialNum) + { + var context = default(PendingContext); + var internalStatus = InternalDelete(ref key, ref userContext, ref context); + var status = default(Status); + if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND) + { + status = (Status)internalStatus; + } + threadCtx.Value.serialNum = monotonicSerialNum; + return status; + } + + /// + /// Grow the hash index + /// + /// + public bool GrowIndex() + { + return InternalGrowIndex(); + } + + /// + /// Dispose FASTER instance + /// + public void Dispose() + { + base.Free(); + threadCtx.Dispose(); + prevThreadCtx.Dispose(); + hlog.Dispose(); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FASTER/FASTERBase.cs b/ZeroLevel/Services/FASTER/Index/FASTER/FASTERBase.cs new file mode 100644 index 0000000..61f30f4 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FASTER/FASTERBase.cs @@ -0,0 +1,794 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; + +namespace FASTER.core +{ + internal static class Constants + { + /// Size of cache line in bytes + public const int kCacheLineBytes = 64; + + public const bool kFineGrainedHandoverRecord = false; + + public const bool kFineGrainedHandoverBucket = true; + + /// Number of entries per bucket (assuming 8-byte entries to fill a cacheline) + /// Number of bits per bucket (assuming 8-byte entries to fill a cacheline) + public const int kBitsPerBucket = 3; + + public const int kEntriesPerBucket = 1 << kBitsPerBucket; + + // Position of fields in hash-table entry + public const int kTentativeBitShift = 63; + + public const long kTentativeBitMask = (1L << kTentativeBitShift); + + public const int kPendingBitShift = 62; + + public const long kPendingBitMask = (1L << kPendingBitShift); + + public const int kReadCacheBitShift = 47; + public const long kReadCacheBitMask = (1L << kReadCacheBitShift); + + public const int kTagSize = 14; + + public const int kTagShift = 62 - kTagSize; + + public const long kTagMask = (1L << kTagSize) - 1; + + public const long kTagPositionMask = (kTagMask << kTagShift); + + public const long kAddressMask = (1L << 48) - 1; + + // Position of tag in hash value (offset is always in the least significant bits) + public const int kHashTagShift = 64 - kTagSize; + + + /// Invalid entry value + public const int kInvalidEntrySlot = kEntriesPerBucket; + + /// Location of the special bucket entry + public const long kOverflowBucketIndex = kEntriesPerBucket - 1; + + /// Invalid value in the hash table + public const long kInvalidEntry = 0; + + /// Number of times to retry a compare-and-swap 
before failure + public const long kRetryThreshold = 1000000; + + /// Number of merge/split chunks. + public const int kNumMergeChunkBits = 8; + public const int kNumMergeChunks = 1 << kNumMergeChunkBits; + + // Size of chunks for garbage collection + public const int kSizeofChunkBits = 14; + public const int kSizeofChunk = 1 << 14; + + public const long kInvalidAddress = 0; + public const long kTempInvalidAddress = 1; + public const int kFirstValidAddress = 64; + } + + [StructLayout(LayoutKind.Explicit, Size = Constants.kEntriesPerBucket * 8)] + internal unsafe struct HashBucket + { + + public const long kPinConstant = (1L << 48); + + public const long kExclusiveLatchBitMask = (1L << 63); + + [FieldOffset(0)] + public fixed long bucket_entries[Constants.kEntriesPerBucket]; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryAcquireSharedLatch(HashBucket* bucket) + { + return Interlocked.Add(ref bucket->bucket_entries[Constants.kOverflowBucketIndex], + kPinConstant) > 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void ReleaseSharedLatch(HashBucket* bucket) + { + Interlocked.Add(ref bucket->bucket_entries[Constants.kOverflowBucketIndex], + -kPinConstant); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool TryAcquireExclusiveLatch(HashBucket* bucket) + { + long expected_word = bucket->bucket_entries[Constants.kOverflowBucketIndex]; + if ((expected_word & ~Constants.kAddressMask) == 0) + { + long desired_word = expected_word | kExclusiveLatchBitMask; + var found_word = Interlocked.CompareExchange( + ref bucket->bucket_entries[Constants.kOverflowBucketIndex], + desired_word, + expected_word); + return found_word == expected_word; + } + return false; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static void ReleaseExclusiveLatch(HashBucket* bucket) + { + long expected_word = bucket->bucket_entries[Constants.kOverflowBucketIndex]; + long desired_word = expected_word & Constants.kAddressMask; + var found_word = Interlocked.Exchange( + ref bucket->bucket_entries[Constants.kOverflowBucketIndex], + desired_word); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool NoSharedLatches(HashBucket* bucket) + { + long word = bucket->bucket_entries[Constants.kOverflowBucketIndex]; + return (word & ~Constants.kAddressMask) == 0; + } + } + + // Long value layout: [1-bit tentative][15-bit TAG][48-bit address] + // Physical little endian memory layout: [48-bit address][15-bit TAG][1-bit tentative] + [StructLayout(LayoutKind.Explicit, Size = 8)] + internal struct HashBucketEntry + { + [FieldOffset(0)] + public long word; + + public long Address + { + get + { + return word & Constants.kAddressMask; + } + + set + { + word &= ~Constants.kAddressMask; + word |= (value & Constants.kAddressMask); + } + } + + + public ushort Tag + { + get + { + return (ushort)((word & Constants.kTagPositionMask) >> Constants.kTagShift); + } + + set + { + word &= ~Constants.kTagPositionMask; + word |= ((long)value << Constants.kTagShift); + } + } + + public bool Pending + { + get + { + return (word & Constants.kPendingBitMask) != 0; + } + + set + { + if (value) + { + word |= Constants.kPendingBitMask; + } + else + { + word &= ~Constants.kPendingBitMask; + } + } + } + + public bool Tentative + { + get + { + return (word & Constants.kTentativeBitMask) != 0; + } + + set + { + if (value) + { + word |= Constants.kTentativeBitMask; + } + else + { + word &= ~Constants.kTentativeBitMask; + } + } + } + + public 
bool ReadCache + { + get + { + return (word & Constants.kReadCacheBitMask) != 0; + } + + set + { + if (value) + { + word |= Constants.kReadCacheBitMask; + } + else + { + word &= ~Constants.kReadCacheBitMask; + } + } + } + + } + + internal unsafe struct InternalHashTable + { + public long size; + public long size_mask; + public int size_bits; + public HashBucket[] tableRaw; + public GCHandle tableHandle; + public HashBucket* tableAligned; + } + + public unsafe partial class FasterBase + { + // Initial size of the table + internal long minTableSize = 16; + + // Allocator for the hash buckets + internal readonly MallocFixedPageSize overflowBucketsAllocator; + + // An array of size two, that contains the old and new versions of the hash-table + internal InternalHashTable[] state = new InternalHashTable[2]; + + // Array used to denote if a specific chunk is merged or not + internal long[] splitStatus; + + // Used as an atomic counter to check if resizing is complete + internal long numPendingChunksToBeSplit; + + // Epoch set for resizing + internal int resizeEpoch; + + internal LightEpoch epoch; + + internal ResizeInfo resizeInfo; + + /// + /// Constructor + /// + public FasterBase() + { + epoch = new LightEpoch(); + overflowBucketsAllocator = new MallocFixedPageSize(false, epoch); + } + + internal Status Free() + { + Free(0); + Free(1); + epoch.Dispose(); + overflowBucketsAllocator.Dispose(); + return Status.OK; + } + + private Status Free(int version) + { + if (state[version].tableHandle.IsAllocated) + state[version].tableHandle.Free(); + + state[version].tableRaw = null; + state[version].tableAligned = null; + return Status.OK; + } + + /// + /// Initialize + /// + /// + /// + public void Initialize(long size, int sector_size) + { + if (!Utility.IsPowerOfTwo(size)) + { + throw new ArgumentException("Size {0} is not a power of 2"); + } + if (!Utility.Is32Bit(size)) + { + throw new ArgumentException("Size {0} is not 32-bit"); + } + + minTableSize = size; + resizeInfo = default(ResizeInfo); + resizeInfo.status = ResizeOperationStatus.DONE; + resizeInfo.version = 0; + Initialize(resizeInfo.version, size, sector_size); + } + + /// + /// Initialize + /// + /// + /// + /// + protected void Initialize(int version, long size, int sector_size) + { + long size_bytes = size * sizeof(HashBucket); + long aligned_size_bytes = sector_size + + ((size_bytes + (sector_size - 1)) & ~(sector_size - 1)); + + //Over-allocate and align the table to the cacheline + state[version].size = size; + state[version].size_mask = size - 1; + state[version].size_bits = Utility.GetLogBase2((int)size); + + state[version].tableRaw = new HashBucket[aligned_size_bytes / Constants.kCacheLineBytes]; + state[version].tableHandle = GCHandle.Alloc(state[version].tableRaw, GCHandleType.Pinned); + long sectorAlignedPointer = ((long)state[version].tableHandle.AddrOfPinnedObject() + (sector_size - 1)) & ~(sector_size - 1); + state[version].tableAligned = (HashBucket*)sectorAlignedPointer; + } + + /// + /// A helper function that is used to find the slot corresponding to a + /// key in the specified version of the hash table + /// + /// + /// + /// + /// + /// + /// true if such a slot exists, false otherwise + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool FindTag(long hash, ushort tag, ref HashBucket* bucket, ref int slot, ref HashBucketEntry entry) + { + var target_entry_word = default(long); + var entry_slot_bucket = default(HashBucket*); + var version = resizeInfo.version; + var masked_entry_word = hash & 
state[version].size_mask; + bucket = state[version].tableAligned + masked_entry_word; + slot = Constants.kInvalidEntrySlot; + + do + { + // Search through the bucket looking for our key. Last entry is reserved + // for the overflow pointer. + for (int index = 0; index < Constants.kOverflowBucketIndex; ++index) + { + target_entry_word = *(((long*)bucket) + index); + if (0 == target_entry_word) + { + continue; + } + + entry.word = target_entry_word; + if (tag == entry.Tag) + { + slot = index; + if (!entry.Tentative) + return true; + } + } + + target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex) & Constants.kAddressMask; + // Go to next bucket in the chain + + + if (target_entry_word == 0) + { + entry = default(HashBucketEntry); + return false; + } + bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word); + } while (true); + } + + + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void FindOrCreateTag(long hash, ushort tag, ref HashBucket* bucket, ref int slot, ref HashBucketEntry entry, long BeginAddress) + { + var version = resizeInfo.version; + var masked_entry_word = hash & state[version].size_mask; + + while (true) + { + bucket = state[version].tableAligned + masked_entry_word; + slot = Constants.kInvalidEntrySlot; + + if (FindTagOrFreeInternal(hash, tag, ref bucket, ref slot, ref entry, BeginAddress)) + return; + + + // Install tentative tag in free slot + entry = default(HashBucketEntry); + entry.Tag = tag; + entry.Address = Constants.kTempInvalidAddress; + entry.Pending = false; + entry.Tentative = true; + + if (0 == Interlocked.CompareExchange(ref bucket->bucket_entries[slot], entry.word, 0)) + { + var orig_bucket = state[version].tableAligned + masked_entry_word; + var orig_slot = Constants.kInvalidEntrySlot; + + if (FindOtherTagMaybeTentativeInternal(hash, tag, ref orig_bucket, ref orig_slot, bucket, slot)) + { + bucket->bucket_entries[slot] = 0; + } + else + { + entry.Tentative = false; + *((long*)bucket + slot) = entry.word; + break; + } + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool FindTagInternal(long hash, ushort tag, ref HashBucket* bucket, ref int slot) + { + var target_entry_word = default(long); + var entry_slot_bucket = default(HashBucket*); + + do + { + // Search through the bucket looking for our key. Last entry is reserved + // for the overflow pointer. + for (int index = 0; index < Constants.kOverflowBucketIndex; ++index) + { + target_entry_word = *(((long*)bucket) + index); + if (0 == target_entry_word) + { + continue; + } + + HashBucketEntry entry = default(HashBucketEntry); + entry.word = target_entry_word; + if (tag == entry.Tag) + { + slot = index; + if (!entry.Tentative) + return true; + } + } + + target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex) & Constants.kAddressMask; + // Go to next bucket in the chain + + + if (target_entry_word == 0) + { + return false; + } + bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word); + } while (true); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool FindTagMaybeTentativeInternal(long hash, ushort tag, ref HashBucket* bucket, ref int slot) + { + var target_entry_word = default(long); + var entry_slot_bucket = default(HashBucket*); + + do + { + // Search through the bucket looking for our key. Last entry is reserved + // for the overflow pointer. 
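+ // Each HashBucket is one 64-byte cache line holding Constants.kEntriesPerBucket (8)
+ // packed 8-byte entries; the last slot (kOverflowBucketIndex) stores the address of an
+ // overflow bucket, so the enclosing do-loop walks that chain until the tag is found or
+ // the chain ends.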
+ for (int index = 0; index < Constants.kOverflowBucketIndex; ++index) + { + target_entry_word = *(((long*)bucket) + index); + if (0 == target_entry_word) + { + continue; + } + + HashBucketEntry entry = default(HashBucketEntry); + entry.word = target_entry_word; + if (tag == entry.Tag) + { + slot = index; + return true; + } + } + + target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex) & Constants.kAddressMask; + // Go to next bucket in the chain + + + if (target_entry_word == 0) + { + return false; + } + bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word); + } while (true); + } + + /// + /// Find existing entry (non-tenative) + /// If not found, return pointer to some empty slot + /// + /// + /// + /// + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool FindTagOrFreeInternal(long hash, ushort tag, ref HashBucket* bucket, ref int slot, ref HashBucketEntry entry, long BeginAddress = 0) + { + var target_entry_word = default(long); + var recordExists = false; + var entry_slot_bucket = default(HashBucket*); + + do + { + // Search through the bucket looking for our key. Last entry is reserved + // for the overflow pointer. + for (int index = 0; index < Constants.kOverflowBucketIndex; ++index) + { + target_entry_word = *(((long*)bucket) + index); + if (0 == target_entry_word) + { + if (slot == Constants.kInvalidEntrySlot) + { + slot = index; + entry_slot_bucket = bucket; + } + continue; + } + + entry.word = target_entry_word; + if (entry.Address < BeginAddress && entry.Address != Constants.kTempInvalidAddress) + { + if (entry.word == Interlocked.CompareExchange(ref bucket->bucket_entries[index], Constants.kInvalidAddress, target_entry_word)) + { + if (slot == Constants.kInvalidEntrySlot) + { + slot = index; + entry_slot_bucket = bucket; + } + continue; + } + } + if (tag == entry.Tag && !entry.Tentative) + { + slot = index; + recordExists = true; + return recordExists; + } + } + + target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex); + // Go to next bucket in the chain + + + if ((target_entry_word & Constants.kAddressMask) == 0) + { + if (slot == Constants.kInvalidEntrySlot) + { + // Allocate new bucket + var logicalBucketAddress = overflowBucketsAllocator.Allocate(); + var physicalBucketAddress = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(logicalBucketAddress); + long compare_word = target_entry_word; + target_entry_word = logicalBucketAddress; + target_entry_word |= (compare_word & ~Constants.kAddressMask); + + long result_word = Interlocked.CompareExchange( + ref bucket->bucket_entries[Constants.kOverflowBucketIndex], + target_entry_word, + compare_word); + + if (compare_word != result_word) + { + // Install failed, undo allocation; use the winner's entry + overflowBucketsAllocator.FreeAtEpoch(logicalBucketAddress, 0); + target_entry_word = result_word; + } + else + { + // Install succeeded + bucket = physicalBucketAddress; + slot = 0; + entry = default(HashBucketEntry); + return recordExists; + } + } + else + { + if (!recordExists) + { + bucket = entry_slot_bucket; + } + entry = default(HashBucketEntry); + break; + } + } + + bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word & Constants.kAddressMask); + } while (true); + + return recordExists; + } + + + /// + /// Find existing entry (tenative or otherwise) other than the specified "exception" slot + /// If not found, return false. Does not return a free slot. 
+ /// + /// + /// + /// + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool FindOtherTagMaybeTentativeInternal(long hash, ushort tag, ref HashBucket* bucket, ref int slot, HashBucket* except_bucket, int except_entry_slot) + { + var target_entry_word = default(long); + var entry_slot_bucket = default(HashBucket*); + + do + { + // Search through the bucket looking for our key. Last entry is reserved + // for the overflow pointer. + for (int index = 0; index < Constants.kOverflowBucketIndex; ++index) + { + target_entry_word = *(((long*)bucket) + index); + if (0 == target_entry_word) + { + continue; + } + + HashBucketEntry entry = default(HashBucketEntry); + entry.word = target_entry_word; + if (tag == entry.Tag) + { + if ((except_entry_slot == index) && (except_bucket == bucket)) + continue; + + slot = index; + return true; + } + } + + target_entry_word = *(((long*)bucket) + Constants.kOverflowBucketIndex) & Constants.kAddressMask; + // Go to next bucket in the chain + + + if (target_entry_word == 0) + { + return false; + } + bucket = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(target_entry_word); + } while (true); + } + + + /// + /// Helper function used to update the slot atomically with the + /// new offset value using the CAS operation + /// + /// + /// + /// + /// + /// + /// If atomic update was successful + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal bool UpdateSlot(HashBucket* bucket, int entrySlot, long expected, long desired, out long found) + { + found = Interlocked.CompareExchange( + ref bucket->bucket_entries[entrySlot], + desired, + expected); + + return (found == expected); + } + + /// + /// + /// + /// + protected virtual long GetEntryCount() + { + var version = resizeInfo.version; + var table_size_ = state[version].size; + var ptable_ = state[version].tableAligned; + long total_entry_count = 0; + + for (long bucket = 0; bucket < table_size_; ++bucket) + { + HashBucket b = *(ptable_ + bucket); + while (true) + { + for (int bucket_entry = 0; bucket_entry < Constants.kOverflowBucketIndex; ++bucket_entry) + if (0 != b.bucket_entries[bucket_entry]) + ++total_entry_count; + if (b.bucket_entries[Constants.kOverflowBucketIndex] == 0) break; + b = *((HashBucket*)overflowBucketsAllocator.GetPhysicalAddress((b.bucket_entries[Constants.kOverflowBucketIndex]))); + } + } + return total_entry_count; + } + + /// + /// + /// + /// + protected virtual string _DumpDistribution(int version) + { + var table_size_ = state[version].size; + var ptable_ = state[version].tableAligned; + long total_record_count = 0; + Dictionary histogram = new Dictionary(); + + for (long bucket = 0; bucket < table_size_; ++bucket) + { + List tags = new List(); + int cnt = 0; + HashBucket b = *(ptable_ + bucket); + while (true) + { + for (int bucket_entry = 0; bucket_entry < Constants.kOverflowBucketIndex; ++bucket_entry) + { + if (0 != b.bucket_entries[bucket_entry]) + { + var x = default(HashBucketEntry); + x.word = b.bucket_entries[bucket_entry]; + if (tags.Contains(x.Tag) && !x.Tentative) + throw new Exception("Duplicate tag found in index"); + tags.Add(x.Tag); + ++cnt; + ++total_record_count; + } + } + if (b.bucket_entries[Constants.kOverflowBucketIndex] == 0) break; + b = *((HashBucket*)overflowBucketsAllocator.GetPhysicalAddress((b.bucket_entries[Constants.kOverflowBucketIndex]))); + } + + if (!histogram.ContainsKey(cnt)) histogram[cnt] = 0; + histogram[cnt]++; + } + + var distribution = + $"Number of hash buckets: {{{table_size_}}}\n" + + 
$"Total distinct hash-table entry count: {{{total_record_count}}}\n" + + $"Average #entries per hash bucket: {{{total_record_count / (double)table_size_:0.00}}}\n" + + $"Histogram of #entries per bucket:\n"; + foreach (var kvp in histogram.OrderBy(e => e.Key)) + { + distribution += $" {kvp.Key} : {kvp.Value}\n"; + } + + return distribution; + } + + /// + /// Dumps the distribution of each non-empty bucket in the hash table. + /// + public string DumpDistribution() + { + return _DumpDistribution(resizeInfo.version); + } + + } + +} diff --git a/ZeroLevel/Services/FASTER/Index/FASTER/FASTERImpl.cs b/ZeroLevel/Services/FASTER/Index/FASTER/FASTERImpl.cs new file mode 100644 index 0000000..52a0253 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FASTER/FASTERImpl.cs @@ -0,0 +1,2332 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma warning disable 0162 +#define CPR + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.core +{ + public unsafe partial class FasterKV : FasterBase, IFasterKV + where Key : new() + where Value : new() + where Functions : IFunctions + { + enum LatchOperation : byte + { + None, + ReleaseShared, + ReleaseExclusive + } + + #region Read Operation + + /// + /// Read operation. Computes the 'output' from 'input' and current value corresponding to 'key'. + /// When the read operation goes pending, once the record is retrieved from disk, InternalContinuePendingRead + /// function is used to complete the operation. + /// + /// Key of the record. + /// Input required to compute output from value. + /// Location to store output computed from input and value. + /// User context for the operation, in case it goes pending. + /// Pending context used internally to store the context of the operation. + /// + /// + /// + /// Value + /// Description + /// + /// + /// SUCCESS + /// The output has been computed using current value of 'key' and 'input'; and stored in 'output'. + /// + /// + /// RECORD_ON_DISK + /// The record corresponding to 'key' is on disk and the operation. + /// + /// + /// CPR_SHIFT_DETECTED + /// A shift in version has been detected. Synchronize immediately to avoid violating CPR consistency. 
+ /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal OperationStatus InternalRead( + ref Key key, + ref Input input, + ref Output output, + ref Context userContext, + ref PendingContext pendingContext) + { + var status = default(OperationStatus); + var bucket = default(HashBucket*); + var slot = default(int); + var logicalAddress = Constants.kInvalidAddress; + var physicalAddress = default(long); + var latestRecordVersion = -1; + + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + if (threadCtx.Value.phase != Phase.REST) + HeavyEnter(hash); + + #region Trace back for record in in-memory HybridLog + HashBucketEntry entry = default(HashBucketEntry); + var tagExists = FindTag(hash, tag, ref bucket, ref slot, ref entry); + if (tagExists) + { + logicalAddress = entry.Address; + + if (UseReadCache && ReadFromCache(ref key, ref logicalAddress, ref physicalAddress, ref latestRecordVersion)) + { + if (threadCtx.Value.phase == Phase.PREPARE && latestRecordVersion != -1 && latestRecordVersion > threadCtx.Value.version) + { + status = OperationStatus.CPR_SHIFT_DETECTED; + goto CreatePendingContext; // Pivot thread + } + functions.SingleReader(ref key, ref input, ref readcache.GetValue(physicalAddress), ref output); + return OperationStatus.SUCCESS; + } + + if (logicalAddress >= hlog.HeadAddress) + { + physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + if (latestRecordVersion == -1) + latestRecordVersion = hlog.GetInfo(physicalAddress).Version; + + if (!comparer.Equals(ref key, ref hlog.GetKey(physicalAddress))) + { + logicalAddress = hlog.GetInfo(physicalAddress).PreviousAddress; + TraceBackForKeyMatch(ref key, + logicalAddress, + hlog.HeadAddress, + out logicalAddress, + out physicalAddress); + } + } + } + else + { + // no tag found + return OperationStatus.NOTFOUND; + } + #endregion + + if (threadCtx.Value.phase != Phase.REST) + { + switch (threadCtx.Value.phase) + { + case Phase.PREPARE: + { + if (latestRecordVersion != -1 && latestRecordVersion > threadCtx.Value.version) + { + status = OperationStatus.CPR_SHIFT_DETECTED; + goto CreatePendingContext; // Pivot thread + } + break; // Normal processing + } + case Phase.GC: + { + GarbageCollectBuckets(hash); + break; + } + default: + { + break; + } + } + } + + #region Normal processing + + // Mutable region (even fuzzy region is included here) + if (logicalAddress >= hlog.SafeReadOnlyAddress) + { + if (hlog.GetInfo(physicalAddress).Tombstone) + return OperationStatus.NOTFOUND; + + functions.ConcurrentReader(ref key, ref input, ref hlog.GetValue(physicalAddress), ref output); + return OperationStatus.SUCCESS; + } + + // Immutable region + else if (logicalAddress >= hlog.HeadAddress) + { + if (hlog.GetInfo(physicalAddress).Tombstone) + return OperationStatus.NOTFOUND; + + functions.SingleReader(ref key, ref input, ref hlog.GetValue(physicalAddress), ref output); + return OperationStatus.SUCCESS; + } + + // On-Disk Region + else if (logicalAddress >= hlog.BeginAddress) + { + status = OperationStatus.RECORD_ON_DISK; + + if (threadCtx.Value.phase == Phase.PREPARE) + { + if (!HashBucket.TryAcquireSharedLatch(bucket)) + { + status = OperationStatus.CPR_SHIFT_DETECTED; + } + } + + goto CreatePendingContext; + } + + // No record found + else + { + return OperationStatus.NOTFOUND; + } + + #endregion + + #region Create pending context + CreatePendingContext: + { + + pendingContext.type = OperationType.READ; + pendingContext.key = hlog.GetKeyContainer(ref key); + 
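+ // The pending context snapshots everything needed to replay this read later: key, input,
+ // output, the hash entry observed, the logical address, and the serial number.
+ // InternalContinuePendingRead (below) completes the operation once the record has been
+ // retrieved from disk.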
pendingContext.input = input; + pendingContext.output = output; + pendingContext.userContext = userContext; + pendingContext.entry.word = entry.word; + pendingContext.logicalAddress = logicalAddress; + pendingContext.version = threadCtx.Value.version; + pendingContext.serialNum = threadCtx.Value.serialNum + 1; + } + #endregion + + return status; + } + + /// + /// Continue a pending read operation. Computes 'output' from 'input' and value corresponding to 'key' + /// obtained from disk. Optionally, it copies the value to tail to serve future read/write requests quickly. + /// + /// The thread (or session) context to execute operation in. + /// Async response from disk. + /// Pending context corresponding to operation. + /// + /// + /// + /// Value + /// Description + /// + /// + /// SUCCESS + /// The output has been computed and stored in 'output'. + /// + /// + /// + internal OperationStatus InternalContinuePendingRead( + FasterExecutionContext ctx, + AsyncIOContext request, + ref PendingContext pendingContext) + { + Debug.Assert(pendingContext.version == ctx.version); + + if (request.logicalAddress >= hlog.BeginAddress) + { + Debug.Assert(hlog.GetInfoFromBytePointer(request.record.GetValidPointer()).Version <= ctx.version); + + if (hlog.GetInfoFromBytePointer(request.record.GetValidPointer()).Tombstone) + return OperationStatus.NOTFOUND; + + functions.SingleReader(ref pendingContext.key.Get(), ref pendingContext.input, + ref hlog.GetContextRecordValue(ref request), ref pendingContext.output); + + if (CopyReadsToTail || UseReadCache) + { + InternalContinuePendingReadCopyToTail(ctx, request, ref pendingContext); + } + } + else + return OperationStatus.NOTFOUND; + + return OperationStatus.SUCCESS; + } + + /// + /// Copies the record read from disk to tail of the HybridLog. + /// + /// The thread(or session) context to execute operation in. + /// Async response from disk. + /// Pending context corresponding to operation. 
+ internal void InternalContinuePendingReadCopyToTail( + FasterExecutionContext ctx, + AsyncIOContext request, + ref PendingContext pendingContext) + { + Debug.Assert(pendingContext.version == ctx.version); + + var recordSize = default(int); + var bucket = default(HashBucket*); + var slot = default(int); + var logicalAddress = Constants.kInvalidAddress; + var physicalAddress = default(long); + var latestRecordVersion = default(int); + + var hash = comparer.GetHashCode64(ref pendingContext.key.Get()); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + #region Trace back record in in-memory HybridLog + var entry = default(HashBucketEntry); + FindOrCreateTag(hash, tag, ref bucket, ref slot, ref entry, hlog.BeginAddress); + logicalAddress = entry.word & Constants.kAddressMask; + + if (UseReadCache) + SkipReadCache(ref logicalAddress, ref latestRecordVersion); + var latestLogicalAddress = logicalAddress; + + if (logicalAddress >= hlog.HeadAddress) + { + physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + if (!comparer.Equals(ref pendingContext.key.Get(), ref hlog.GetKey(physicalAddress))) + { + logicalAddress = hlog.GetInfo(physicalAddress).PreviousAddress; + TraceBackForKeyMatch(ref pendingContext.key.Get(), + logicalAddress, + hlog.HeadAddress, + out logicalAddress, + out physicalAddress); + } + } + #endregion + + if (logicalAddress > pendingContext.entry.Address) + { + // Give up early + return; + } + + #region Create new copy in mutable region + physicalAddress = (long)request.record.GetValidPointer(); + recordSize = hlog.GetRecordSize(physicalAddress); + + long newLogicalAddress, newPhysicalAddress; + if (UseReadCache) + { + BlockAllocateReadCache(recordSize, out newLogicalAddress); + newPhysicalAddress = readcache.GetPhysicalAddress(newLogicalAddress); + RecordInfo.WriteInfo(ref readcache.GetInfo(newPhysicalAddress), ctx.version, + true, false, false, + entry.Address); + readcache.ShallowCopy(ref pendingContext.key.Get(), ref readcache.GetKey(newPhysicalAddress)); + functions.SingleWriter(ref pendingContext.key.Get(), + ref hlog.GetContextRecordValue(ref request), + ref readcache.GetValue(newPhysicalAddress)); + } + else + { + BlockAllocate(recordSize, out newLogicalAddress); + newPhysicalAddress = hlog.GetPhysicalAddress(newLogicalAddress); + RecordInfo.WriteInfo(ref hlog.GetInfo(newPhysicalAddress), ctx.version, + true, false, false, + latestLogicalAddress); + hlog.ShallowCopy(ref pendingContext.key.Get(), ref hlog.GetKey(newPhysicalAddress)); + functions.SingleWriter(ref pendingContext.key.Get(), + ref hlog.GetContextRecordValue(ref request), + ref hlog.GetValue(newPhysicalAddress)); + } + + + var updatedEntry = default(HashBucketEntry); + updatedEntry.Tag = tag; + updatedEntry.Address = newLogicalAddress & Constants.kAddressMask; + updatedEntry.Pending = entry.Pending; + updatedEntry.Tentative = false; + updatedEntry.ReadCache = UseReadCache; + + var foundEntry = default(HashBucketEntry); + foundEntry.word = Interlocked.CompareExchange( + ref bucket->bucket_entries[slot], + updatedEntry.word, + entry.word); + if (foundEntry.word != entry.word) + { + if (!UseReadCache) hlog.GetInfo(newPhysicalAddress).Invalid = true; + // We don't retry, just give up + } + #endregion + } + + #endregion + + #region Upsert Operation + + /// + /// Upsert operation. Replaces the value corresponding to 'key' with provided 'value', if one exists + /// else inserts a new record with 'key' and 'value'. + /// + /// key of the record. 
+ /// value to be updated to (or inserted if key does not exist). + /// User context for the operation, in case it goes pending. + /// Pending context used internally to store the context of the operation. + /// + /// + /// + /// Value + /// Description + /// + /// + /// SUCCESS + /// The value has been successfully replaced(or inserted) + /// + /// + /// RETRY_LATER + /// Cannot be processed immediately due to system state. Add to pending list and retry later + /// + /// + /// CPR_SHIFT_DETECTED + /// A shift in version has been detected. Synchronize immediately to avoid violating CPR consistency. + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal OperationStatus InternalUpsert( + ref Key key, ref Value value, + ref Context userContext, + ref PendingContext pendingContext) + { + var status = default(OperationStatus); + var bucket = default(HashBucket*); + var slot = default(int); + var logicalAddress = Constants.kInvalidAddress; + var physicalAddress = default(long); + var latchOperation = default(LatchOperation); + var version = default(int); + var latestRecordVersion = -1; + + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + if (threadCtx.Value.phase != Phase.REST) + HeavyEnter(hash); + + #region Trace back for record in in-memory HybridLog + var entry = default(HashBucketEntry); + FindOrCreateTag(hash, tag, ref bucket, ref slot, ref entry, hlog.BeginAddress); + logicalAddress = entry.Address; + + if (UseReadCache) + SkipAndInvalidateReadCache(ref logicalAddress, ref latestRecordVersion, ref key); + var latestLogicalAddress = logicalAddress; + + if (logicalAddress >= hlog.ReadOnlyAddress) + { + physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + if (latestRecordVersion == -1) + latestRecordVersion = hlog.GetInfo(physicalAddress).Version; + if (!comparer.Equals(ref key, ref hlog.GetKey(physicalAddress))) + { + logicalAddress = hlog.GetInfo(physicalAddress).PreviousAddress; + TraceBackForKeyMatch(ref key, + logicalAddress, + hlog.ReadOnlyAddress, + out logicalAddress, + out physicalAddress); + } + } + #endregion + + // Optimization for most common case + if (threadCtx.Value.phase == Phase.REST && logicalAddress >= hlog.ReadOnlyAddress && !hlog.GetInfo(physicalAddress).Tombstone) + { + if (functions.ConcurrentWriter(ref key, ref value, ref hlog.GetValue(physicalAddress))) + { + return OperationStatus.SUCCESS; + } + } + + #region Entry latch operation + if (threadCtx.Value.phase != Phase.REST) + { + switch (threadCtx.Value.phase) + { + case Phase.PREPARE: + { + version = threadCtx.Value.version; + if (HashBucket.TryAcquireSharedLatch(bucket)) + { + // Set to release shared latch (default) + latchOperation = LatchOperation.ReleaseShared; + if (latestRecordVersion != -1 && latestRecordVersion > version) + { + status = OperationStatus.CPR_SHIFT_DETECTED; + goto CreatePendingContext; // Pivot Thread + } + break; // Normal Processing + } + else + { + status = OperationStatus.CPR_SHIFT_DETECTED; + goto CreatePendingContext; // Pivot Thread + } + } + case Phase.IN_PROGRESS: + { + version = (threadCtx.Value.version - 1); + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + if (HashBucket.TryAcquireExclusiveLatch(bucket)) + { + // Set to release exclusive latch (default) + latchOperation = LatchOperation.ReleaseExclusive; + goto CreateNewRecord; // Create a (v+1) record + } + else + { + status = OperationStatus.RETRY_LATER; + goto CreatePendingContext; // Go Pending + } + } + 
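+ // In IN_PROGRESS this session already operates at version v+1; if the record found is
+ // still at version v, the branch above takes an exclusive bucket latch and jumps to
+ // CreateNewRecord so the update lands in a fresh (v+1) record instead of mutating the
+ // v record in place (preserving CPR consistency). Otherwise fall through to normal
+ // processing.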
break; // Normal Processing + } + case Phase.WAIT_PENDING: + { + version = (threadCtx.Value.version - 1); + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + if (HashBucket.NoSharedLatches(bucket)) + { + goto CreateNewRecord; // Create a (v+1) record + } + else + { + status = OperationStatus.RETRY_LATER; + goto CreatePendingContext; // Go Pending + } + } + break; // Normal Processing + } + case Phase.WAIT_FLUSH: + { + version = (threadCtx.Value.version - 1); + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + goto CreateNewRecord; // Create a (v+1) record + } + break; // Normal Processing + } + default: + break; + } + } + #endregion + + Debug.Assert(latestRecordVersion <= threadCtx.Value.version); + + #region Normal processing + + // Mutable Region: Update the record in-place + if (logicalAddress >= hlog.ReadOnlyAddress && !hlog.GetInfo(physicalAddress).Tombstone) + { + if (functions.ConcurrentWriter(ref key, ref value, ref hlog.GetValue(physicalAddress))) + { + status = OperationStatus.SUCCESS; + goto LatchRelease; // Release shared latch (if acquired) + } + } + + // All other regions: Create a record in the mutable region + #endregion + + #region Create new record in the mutable region + CreateNewRecord: + { + // Immutable region or new record + var recordSize = hlog.GetRecordSize(ref key, ref value); + BlockAllocate(recordSize, out long newLogicalAddress); + var newPhysicalAddress = hlog.GetPhysicalAddress(newLogicalAddress); + RecordInfo.WriteInfo(ref hlog.GetInfo(newPhysicalAddress), + threadCtx.Value.version, + true, false, false, + latestLogicalAddress); + hlog.ShallowCopy(ref key, ref hlog.GetKey(newPhysicalAddress)); + functions.SingleWriter(ref key, ref value, + ref hlog.GetValue(newPhysicalAddress)); + + var updatedEntry = default(HashBucketEntry); + updatedEntry.Tag = tag; + updatedEntry.Address = newLogicalAddress & Constants.kAddressMask; + updatedEntry.Pending = entry.Pending; + updatedEntry.Tentative = false; + + var foundEntry = default(HashBucketEntry); + foundEntry.word = Interlocked.CompareExchange( + ref bucket->bucket_entries[slot], + updatedEntry.word, entry.word); + + if (foundEntry.word == entry.word) + { + status = OperationStatus.SUCCESS; + goto LatchRelease; + } + else + { + hlog.GetInfo(newPhysicalAddress).Invalid = true; + status = OperationStatus.RETRY_NOW; + goto LatchRelease; + } + } + #endregion + + #region Create pending context + CreatePendingContext: + { + pendingContext.type = OperationType.UPSERT; + pendingContext.key = hlog.GetKeyContainer(ref key); + pendingContext.value = hlog.GetValueContainer(ref value); + pendingContext.userContext = userContext; + pendingContext.entry.word = entry.word; + pendingContext.logicalAddress = logicalAddress; + pendingContext.version = threadCtx.Value.version; + pendingContext.serialNum = threadCtx.Value.serialNum + 1; + } + #endregion + + #region Latch release + LatchRelease: + { + switch (latchOperation) + { + case LatchOperation.ReleaseShared: + HashBucket.ReleaseSharedLatch(bucket); + break; + case LatchOperation.ReleaseExclusive: + HashBucket.ReleaseExclusiveLatch(bucket); + break; + default: + break; + } + } + #endregion + + if (status == OperationStatus.RETRY_NOW) + { + return InternalUpsert(ref key, ref value, ref userContext, ref pendingContext); + } + else + { + return status; + } + } + + #endregion + + #region RMW Operation + + /// + /// Read-Modify-Write Operation. Updates value of 'key' using 'input' and current value. 
+ /// Pending operations are processed either using InternalRetryPendingRMW or + /// InternalContinuePendingRMW. + /// + /// key of the record. + /// input used to update the value. + /// user context corresponding to operation used during completion callback. + /// pending context created when the operation goes pending. + /// + /// + /// + /// Value + /// Description + /// + /// + /// SUCCESS + /// The value has been successfully updated(or inserted). + /// + /// + /// RECORD_ON_DISK + /// The record corresponding to 'key' is on disk. Issue async IO to retrieve record and retry later. + /// + /// + /// RETRY_LATER + /// Cannot be processed immediately due to system state. Add to pending list and retry later. + /// + /// + /// CPR_SHIFT_DETECTED + /// A shift in version has been detected. Synchronize immediately to avoid violating CPR consistency. + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal OperationStatus InternalRMW( + ref Key key, ref Input input, + ref Context userContext, + ref PendingContext pendingContext) + { + var recordSize = default(int); + var bucket = default(HashBucket*); + var slot = default(int); + var logicalAddress = Constants.kInvalidAddress; + var physicalAddress = default(long); + var version = default(int); + var latestRecordVersion = -1; + var status = default(OperationStatus); + var latchOperation = LatchOperation.None; + + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + if (threadCtx.Value.phase != Phase.REST) + HeavyEnter(hash); + + #region Trace back for record in in-memory HybridLog + var entry = default(HashBucketEntry); + FindOrCreateTag(hash, tag, ref bucket, ref slot, ref entry, hlog.BeginAddress); + logicalAddress = entry.Address; + + // For simplicity, we don't let RMW operations use read cache + if (UseReadCache) + SkipReadCache(ref logicalAddress, ref latestRecordVersion); + var latestLogicalAddress = logicalAddress; + + if (logicalAddress >= hlog.HeadAddress) + { + physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + latestRecordVersion = hlog.GetInfo(physicalAddress).Version; + + if (!comparer.Equals(ref key, ref hlog.GetKey(physicalAddress))) + { + logicalAddress = hlog.GetInfo(physicalAddress).PreviousAddress; + TraceBackForKeyMatch(ref key, logicalAddress, + hlog.HeadAddress, + out logicalAddress, + out physicalAddress); + } + } + #endregion + + // Optimization for the most common case + if (threadCtx.Value.phase == Phase.REST && logicalAddress >= hlog.ReadOnlyAddress && !hlog.GetInfo(physicalAddress).Tombstone) + { + if (functions.InPlaceUpdater(ref key, ref input, ref hlog.GetValue(physicalAddress))) + { + return OperationStatus.SUCCESS; + } + } + + #region Entry latch operation + if (threadCtx.Value.phase != Phase.REST) + { + switch (threadCtx.Value.phase) + { + case Phase.PREPARE: + { + version = threadCtx.Value.version; + if (HashBucket.TryAcquireSharedLatch(bucket)) + { + // Set to release shared latch (default) + latchOperation = LatchOperation.ReleaseShared; + if (latestRecordVersion != -1 && latestRecordVersion > version) + { + status = OperationStatus.CPR_SHIFT_DETECTED; + goto CreateFailureContext; // Pivot Thread + } + break; // Normal Processing + } + else + { + status = OperationStatus.CPR_SHIFT_DETECTED; + goto CreateFailureContext; // Pivot Thread + } + } + case Phase.IN_PROGRESS: + { + version = (threadCtx.Value.version - 1); + if (latestRecordVersion <= version) + { + if (HashBucket.TryAcquireExclusiveLatch(bucket)) + { + // 
Set to release exclusive latch (default) + latchOperation = LatchOperation.ReleaseExclusive; + goto CreateNewRecord; // Create a (v+1) record + } + else + { + status = OperationStatus.RETRY_LATER; + goto CreateFailureContext; // Go Pending + } + } + break; // Normal Processing + } + case Phase.WAIT_PENDING: + { + version = (threadCtx.Value.version - 1); + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + if (HashBucket.NoSharedLatches(bucket)) + { + goto CreateNewRecord; // Create a (v+1) record + } + else + { + status = OperationStatus.RETRY_LATER; + goto CreateFailureContext; // Go Pending + } + } + break; // Normal Processing + } + case Phase.WAIT_FLUSH: + { + version = (threadCtx.Value.version - 1); + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + goto CreateNewRecord; // Create a (v+1) record + } + break; // Normal Processing + } + default: + break; + } + } + #endregion + + Debug.Assert(latestRecordVersion <= threadCtx.Value.version); + + #region Normal processing + + // Mutable Region: Update the record in-place + if (logicalAddress >= hlog.ReadOnlyAddress && !hlog.GetInfo(physicalAddress).Tombstone) + { + if (FoldOverSnapshot) + { + Debug.Assert(hlog.GetInfo(physicalAddress).Version == threadCtx.Value.version); + } + + if (functions.InPlaceUpdater(ref key, ref input, ref hlog.GetValue(physicalAddress))) + { + status = OperationStatus.SUCCESS; + goto LatchRelease; // Release shared latch (if acquired) + } + } + + // Fuzzy Region: Must go pending due to lost-update anomaly + else if (logicalAddress >= hlog.SafeReadOnlyAddress && !hlog.GetInfo(physicalAddress).Tombstone) + { + status = OperationStatus.RETRY_LATER; + // Retain the shared latch (if acquired) + if (latchOperation == LatchOperation.ReleaseShared) + { + latchOperation = LatchOperation.None; + } + goto CreateFailureContext; // Go pending + } + + // Safe Read-Only Region: Create a record in the mutable region + else if (logicalAddress >= hlog.HeadAddress) + { + goto CreateNewRecord; + } + + // Disk Region: Need to issue async io requests + else if (logicalAddress >= hlog.BeginAddress) + { + status = OperationStatus.RECORD_ON_DISK; + // Retain the shared latch (if acquired) + if (latchOperation == LatchOperation.ReleaseShared) + { + latchOperation = LatchOperation.None; + } + goto CreateFailureContext; // Go pending + } + + // No record exists - create new + else + { + goto CreateNewRecord; + } + + #endregion + + #region Create new record + CreateNewRecord: + { + recordSize = (logicalAddress < hlog.BeginAddress) ? 
+ hlog.GetInitialRecordSize(ref key, ref input) : + hlog.GetRecordSize(physicalAddress); + BlockAllocate(recordSize, out long newLogicalAddress); + var newPhysicalAddress = hlog.GetPhysicalAddress(newLogicalAddress); + RecordInfo.WriteInfo(ref hlog.GetInfo(newPhysicalAddress), threadCtx.Value.version, + true, false, false, + latestLogicalAddress); + hlog.ShallowCopy(ref key, ref hlog.GetKey(newPhysicalAddress)); + if (logicalAddress < hlog.BeginAddress) + { + functions.InitialUpdater(ref key, ref input, ref hlog.GetValue(newPhysicalAddress)); + status = OperationStatus.NOTFOUND; + } + else if (logicalAddress >= hlog.HeadAddress) + { + if (hlog.GetInfo(physicalAddress).Tombstone) + { + functions.InitialUpdater(ref key, ref input, ref hlog.GetValue(newPhysicalAddress)); + status = OperationStatus.NOTFOUND; + } + else + { + functions.CopyUpdater(ref key, ref input, + ref hlog.GetValue(physicalAddress), + ref hlog.GetValue(newPhysicalAddress)); + status = OperationStatus.SUCCESS; + } + } + else + { + // ah, old record slipped onto disk + hlog.GetInfo(newPhysicalAddress).Invalid = true; + status = OperationStatus.RETRY_NOW; + goto LatchRelease; + } + + var updatedEntry = default(HashBucketEntry); + updatedEntry.Tag = tag; + updatedEntry.Address = newLogicalAddress & Constants.kAddressMask; + updatedEntry.Pending = entry.Pending; + updatedEntry.Tentative = false; + + var foundEntry = default(HashBucketEntry); + foundEntry.word = Interlocked.CompareExchange( + ref bucket->bucket_entries[slot], + updatedEntry.word, entry.word); + + if (foundEntry.word == entry.word) + { + goto LatchRelease; + } + else + { + // ah, CAS failed + hlog.GetInfo(newPhysicalAddress).Invalid = true; + status = OperationStatus.RETRY_NOW; + goto LatchRelease; + } + } + #endregion + + #region Create failure context + CreateFailureContext: + { + pendingContext.type = OperationType.RMW; + pendingContext.key = hlog.GetKeyContainer(ref key); + pendingContext.input = input; + pendingContext.userContext = userContext; + pendingContext.entry.word = entry.word; + pendingContext.logicalAddress = logicalAddress; + pendingContext.version = threadCtx.Value.version; + pendingContext.serialNum = threadCtx.Value.serialNum + 1; + } + #endregion + + #region Latch release + LatchRelease: + { + switch (latchOperation) + { + case LatchOperation.ReleaseShared: + HashBucket.ReleaseSharedLatch(bucket); + break; + case LatchOperation.ReleaseExclusive: + HashBucket.ReleaseExclusiveLatch(bucket); + break; + default: + break; + } + } + #endregion + + if (status == OperationStatus.RETRY_NOW) + { + return InternalRMW(ref key, ref input, ref userContext, ref pendingContext); + } + else + { + return status; + } + } + + /// + /// Retries a pending RMW operation. + /// + /// Thread (or session) context under which operation must be executed. + /// Internal context of the RMW operation. + /// + /// + /// + /// Value + /// Description + /// + /// + /// SUCCESS + /// The value has been successfully updated(or inserted). + /// + /// + /// RECORD_ON_DISK + /// The record corresponding to 'key' is on disk. Issue async IO to retrieve record and retry later. + /// + /// + /// RETRY_LATER + /// Cannot be processed immediately due to system state. Add to pending list and retry later. 
+ /// + /// + /// + internal OperationStatus InternalRetryPendingRMW( + FasterExecutionContext ctx, + ref PendingContext pendingContext) + { + var recordSize = default(int); + var bucket = default(HashBucket*); + var slot = default(int); + var logicalAddress = Constants.kInvalidAddress; + var physicalAddress = default(long); + var version = default(int); + var latestRecordVersion = -1; + var status = default(OperationStatus); + var latchOperation = LatchOperation.None; + ref Key key = ref pendingContext.key.Get(); + + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + if (threadCtx.Value.phase != Phase.REST) + HeavyEnter(hash); + + #region Trace back for record in in-memory HybridLog + var entry = default(HashBucketEntry); + FindOrCreateTag(hash, tag, ref bucket, ref slot, ref entry, hlog.BeginAddress); + logicalAddress = entry.Address; + + // For simplicity, we don't let RMW operations use read cache + if (UseReadCache) + SkipReadCache(ref logicalAddress, ref latestRecordVersion); + var latestLogicalAddress = logicalAddress; + + if (logicalAddress >= hlog.HeadAddress) + { + physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + if (latestRecordVersion == -1) + latestRecordVersion = hlog.GetInfo(physicalAddress).Version; + if (!comparer.Equals(ref key, ref hlog.GetKey(physicalAddress))) + { + logicalAddress = hlog.GetInfo(physicalAddress).PreviousAddress; + TraceBackForKeyMatch(ref key, logicalAddress, + hlog.HeadAddress, + out logicalAddress, + out physicalAddress); + } + } + #endregion + + #region Entry latch operation + if (threadCtx.Value.phase != Phase.REST) + { + if (!((ctx.version < threadCtx.Value.version) + || + (threadCtx.Value.phase == Phase.PREPARE))) + { + // Processing a pending (v+1) request + version = (threadCtx.Value.version - 1); + switch (threadCtx.Value.phase) + { + case Phase.IN_PROGRESS: + { + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + if (HashBucket.TryAcquireExclusiveLatch(bucket)) + { + // Set to release exclusive latch (default) + latchOperation = LatchOperation.ReleaseExclusive; + goto CreateNewRecord; // Create a (v+1) record + } + else + { + status = OperationStatus.RETRY_LATER; + goto UpdateFailureContext; // Go Pending + } + } + break; // Normal Processing + } + case Phase.WAIT_PENDING: + { + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + if (HashBucket.NoSharedLatches(bucket)) + { + goto CreateNewRecord; // Create a (v+1) record + } + else + { + status = OperationStatus.RETRY_LATER; + goto UpdateFailureContext; // Go Pending + } + } + break; // Normal Processing + } + case Phase.WAIT_FLUSH: + { + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + goto CreateNewRecord; // Create a (v+1) record + } + break; // Normal Processing + } + default: + break; + } + } + } + #endregion + + #region Normal processing + + // Mutable Region: Update the record in-place + if (logicalAddress >= hlog.ReadOnlyAddress) + { + if (FoldOverSnapshot) + { + Debug.Assert(hlog.GetInfo(physicalAddress).Version == threadCtx.Value.version); + } + + if (functions.InPlaceUpdater(ref key, ref pendingContext.input, ref hlog.GetValue(physicalAddress))) + { + status = OperationStatus.SUCCESS; + goto LatchRelease; + } + } + + // Fuzzy Region: Must go pending due to lost-update anomaly + else if (logicalAddress >= hlog.SafeReadOnlyAddress) + { + status = OperationStatus.RETRY_LATER; + goto UpdateFailureContext; // Go pending + } + + // Safe Read-Only Region: 
Create a record in the mutable region + else if (logicalAddress >= hlog.HeadAddress) + { + goto CreateNewRecord; + } + + // Disk Region: Need to issue async io requests + else if (logicalAddress >= hlog.BeginAddress) + { + status = OperationStatus.RECORD_ON_DISK; + goto UpdateFailureContext; // Go pending + } + + // No record exists - create new + else + { + goto CreateNewRecord; + } + + #endregion + + #region Create new record in mutable region + CreateNewRecord: + { + recordSize = (logicalAddress < hlog.BeginAddress) ? + hlog.GetInitialRecordSize(ref key, ref pendingContext.input) : + hlog.GetRecordSize(physicalAddress); + BlockAllocate(recordSize, out long newLogicalAddress); + var newPhysicalAddress = hlog.GetPhysicalAddress(newLogicalAddress); + RecordInfo.WriteInfo(ref hlog.GetInfo(newPhysicalAddress), pendingContext.version, + true, false, false, + latestLogicalAddress); + hlog.ShallowCopy(ref key, ref hlog.GetKey(newPhysicalAddress)); + if (logicalAddress < hlog.BeginAddress) + { + functions.InitialUpdater(ref key, + ref pendingContext.input, + ref hlog.GetValue(newPhysicalAddress)); + status = OperationStatus.NOTFOUND; + } + else if (logicalAddress >= hlog.HeadAddress) + { + functions.CopyUpdater(ref key, + ref pendingContext.input, + ref hlog.GetValue(physicalAddress), + ref hlog.GetValue(newPhysicalAddress)); + status = OperationStatus.SUCCESS; + } + else + { + // record slipped onto disk + hlog.GetInfo(newPhysicalAddress).Invalid = true; + status = OperationStatus.RETRY_NOW; + goto LatchRelease; + } + + var updatedEntry = default(HashBucketEntry); + updatedEntry.Tag = tag; + updatedEntry.Address = newLogicalAddress & Constants.kAddressMask; + updatedEntry.Pending = entry.Pending; + updatedEntry.Tentative = false; + + var foundEntry = default(HashBucketEntry); + foundEntry.word = Interlocked.CompareExchange( + ref bucket->bucket_entries[slot], + updatedEntry.word, entry.word); + + if (foundEntry.word == entry.word) + { + goto LatchRelease; + } + else + { + // ah, CAS failed + hlog.GetInfo(newPhysicalAddress).Invalid = true; + status = OperationStatus.RETRY_NOW; + goto LatchRelease; + } + } + #endregion + + #region Update failure context + UpdateFailureContext: + { + pendingContext.entry.word = entry.word; + pendingContext.logicalAddress = logicalAddress; + } + #endregion + + #region Latch release + LatchRelease: + { + switch (latchOperation) + { + case LatchOperation.ReleaseExclusive: + HashBucket.ReleaseExclusiveLatch(bucket); + break; + case LatchOperation.ReleaseShared: + throw new Exception("Should not release shared latch here!"); + default: + break; + } + } + #endregion + + if (status == OperationStatus.RETRY_NOW) + { + return InternalRetryPendingRMW(ctx, ref pendingContext); + } + else + { + return status; + } + } + + /// + /// Continue a pending RMW operation with the record retrieved from disk. + /// + /// thread (or session) context under which operation must be executed. + /// record read from the disk. + /// internal context for the pending RMW operation + /// + /// + /// + /// Value + /// Description + /// + /// + /// SUCCESS + /// The value has been successfully updated(or inserted). + /// + /// + /// RECORD_ON_DISK + /// The record corresponding to 'key' is on disk. Issue async IO to retrieve record and retry later. + /// + /// + /// RETRY_LATER + /// Cannot be processed immediately due to system state. Add to pending list and retry later. 
+ /// + /// + /// + internal OperationStatus InternalContinuePendingRMW( + FasterExecutionContext ctx, + AsyncIOContext request, + ref PendingContext pendingContext) + { + var recordSize = default(int); + var bucket = default(HashBucket*); + var slot = default(int); + var logicalAddress = Constants.kInvalidAddress; + var physicalAddress = default(long); + var status = default(OperationStatus); + var latestRecordVersion = default(int); + ref Key key = ref pendingContext.key.Get(); + + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + #region Trace Back for Record on In-Memory HybridLog + var entry = default(HashBucketEntry); + FindOrCreateTag(hash, tag, ref bucket, ref slot, ref entry, hlog.BeginAddress); + logicalAddress = entry.Address; + + // For simplicity, we don't let RMW operations use read cache + if (UseReadCache) + SkipReadCache(ref logicalAddress, ref latestRecordVersion); + var latestLogicalAddress = logicalAddress; + + if (logicalAddress >= hlog.HeadAddress) + { + physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + if (!comparer.Equals(ref key, ref hlog.GetKey(physicalAddress))) + { + logicalAddress = hlog.GetInfo(physicalAddress).PreviousAddress; + TraceBackForKeyMatch(ref key, + logicalAddress, + hlog.HeadAddress, + out logicalAddress, + out physicalAddress); + } + } + #endregion + + var previousFirstRecordAddress = pendingContext.entry.Address; + if (logicalAddress > previousFirstRecordAddress) + { + goto Retry; + } + + #region Create record in mutable region + if ((request.logicalAddress < hlog.BeginAddress) || (hlog.GetInfoFromBytePointer(request.record.GetValidPointer()).Tombstone)) + { + recordSize = hlog.GetInitialRecordSize(ref key, ref pendingContext.input); + } + else + { + physicalAddress = (long)request.record.GetValidPointer(); + recordSize = hlog.GetRecordSize(physicalAddress); + } + BlockAllocate(recordSize, out long newLogicalAddress); + var newPhysicalAddress = hlog.GetPhysicalAddress(newLogicalAddress); + RecordInfo.WriteInfo(ref hlog.GetInfo(newPhysicalAddress), ctx.version, + true, false, false, + latestLogicalAddress); + hlog.ShallowCopy(ref key, ref hlog.GetKey(newPhysicalAddress)); + if ((request.logicalAddress < hlog.BeginAddress) || (hlog.GetInfoFromBytePointer(request.record.GetValidPointer()).Tombstone)) + { + functions.InitialUpdater(ref key, + ref pendingContext.input, + ref hlog.GetValue(newPhysicalAddress)); + status = OperationStatus.NOTFOUND; + } + else + { + functions.CopyUpdater(ref key, + ref pendingContext.input, + ref hlog.GetContextRecordValue(ref request), + ref hlog.GetValue(newPhysicalAddress)); + status = OperationStatus.SUCCESS; + } + + var updatedEntry = default(HashBucketEntry); + updatedEntry.Tag = tag; + updatedEntry.Address = newLogicalAddress & Constants.kAddressMask; + updatedEntry.Pending = entry.Pending; + updatedEntry.Tentative = false; + + var foundEntry = default(HashBucketEntry); + foundEntry.word = Interlocked.CompareExchange( + ref bucket->bucket_entries[slot], + updatedEntry.word, entry.word); + + if (foundEntry.word == entry.word) + { + return status; + } + else + { + hlog.GetInfo(newPhysicalAddress).Invalid = true; + goto Retry; + } + #endregion + + Retry: + return InternalRetryPendingRMW(ctx, ref pendingContext); + } + + #endregion + + #region Delete Operation + + /// + /// Delete operation. Replaces the value corresponding to 'key' with tombstone. + /// If at head, tries to remove item from hash chain + /// + /// Key of the record to be deleted. 
+ /// User context for the operation, in case it goes pending. + /// Pending context used internally to store the context of the operation. + /// + /// + /// + /// Value + /// Description + /// + /// + /// SUCCESS + /// The value has been successfully deleted + /// + /// + /// RETRY_LATER + /// Cannot be processed immediately due to system state. Add to pending list and retry later + /// + /// + /// CPR_SHIFT_DETECTED + /// A shift in version has been detected. Synchronize immediately to avoid violating CPR consistency. + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal OperationStatus InternalDelete( + ref Key key, + ref Context userContext, + ref PendingContext pendingContext) + { + var status = default(OperationStatus); + var bucket = default(HashBucket*); + var slot = default(int); + var logicalAddress = Constants.kInvalidAddress; + var physicalAddress = default(long); + var latchOperation = default(LatchOperation); + var version = default(int); + var latestRecordVersion = -1; + + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + if (threadCtx.Value.phase != Phase.REST) + HeavyEnter(hash); + + #region Trace back for record in in-memory HybridLog + var entry = default(HashBucketEntry); + var tagExists = FindTag(hash, tag, ref bucket, ref slot, ref entry); + if (!tagExists) + return OperationStatus.NOTFOUND; + + logicalAddress = entry.Address; + + if (UseReadCache) + SkipAndInvalidateReadCache(ref logicalAddress, ref latestRecordVersion, ref key); + var latestLogicalAddress = logicalAddress; + + if (logicalAddress >= hlog.ReadOnlyAddress) + { + physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + if (latestRecordVersion == -1) + latestRecordVersion = hlog.GetInfo(physicalAddress).Version; + if (!comparer.Equals(ref key, ref hlog.GetKey(physicalAddress))) + { + logicalAddress = hlog.GetInfo(physicalAddress).PreviousAddress; + TraceBackForKeyMatch(ref key, + logicalAddress, + hlog.ReadOnlyAddress, + out logicalAddress, + out physicalAddress); + } + } + #endregion + + // NO optimization for most common case + //if (threadCtx.Value.phase == Phase.REST && logicalAddress >= hlog.ReadOnlyAddress) + //{ + // hlog.GetInfo(physicalAddress).Tombstone = true; + // return OperationStatus.SUCCESS; + //} + + #region Entry latch operation + if (threadCtx.Value.phase != Phase.REST) + { + switch (threadCtx.Value.phase) + { + case Phase.PREPARE: + { + version = threadCtx.Value.version; + if (HashBucket.TryAcquireSharedLatch(bucket)) + { + // Set to release shared latch (default) + latchOperation = LatchOperation.ReleaseShared; + if (latestRecordVersion != -1 && latestRecordVersion > version) + { + status = OperationStatus.CPR_SHIFT_DETECTED; + goto CreatePendingContext; // Pivot Thread + } + break; // Normal Processing + } + else + { + status = OperationStatus.CPR_SHIFT_DETECTED; + goto CreatePendingContext; // Pivot Thread + } + } + case Phase.IN_PROGRESS: + { + version = (threadCtx.Value.version - 1); + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + if (HashBucket.TryAcquireExclusiveLatch(bucket)) + { + // Set to release exclusive latch (default) + latchOperation = LatchOperation.ReleaseExclusive; + goto CreateNewRecord; // Create a (v+1) record + } + else + { + status = OperationStatus.RETRY_LATER; + goto CreatePendingContext; // Go Pending + } + } + break; // Normal Processing + } + case Phase.WAIT_PENDING: + { + version = (threadCtx.Value.version - 1); + if (latestRecordVersion != 
-1 && latestRecordVersion <= version) + { + if (HashBucket.NoSharedLatches(bucket)) + { + goto CreateNewRecord; // Create a (v+1) record + } + else + { + status = OperationStatus.RETRY_LATER; + goto CreatePendingContext; // Go Pending + } + } + break; // Normal Processing + } + case Phase.WAIT_FLUSH: + { + version = (threadCtx.Value.version - 1); + if (latestRecordVersion != -1 && latestRecordVersion <= version) + { + goto CreateNewRecord; // Create a (v+1) record + } + break; // Normal Processing + } + default: + break; + } + } + #endregion + + Debug.Assert(latestRecordVersion <= threadCtx.Value.version); + + #region Normal processing + + // Record is in memory, try to update hash chain and completely elide record + // only if previous address points to invalid address + if (logicalAddress >= hlog.ReadOnlyAddress) + { + if (entry.Address == logicalAddress && hlog.GetInfo(physicalAddress).PreviousAddress < hlog.BeginAddress) + { + var updatedEntry = default(HashBucketEntry); + updatedEntry.Tag = 0; + if (hlog.GetInfo(physicalAddress).PreviousAddress == Constants.kTempInvalidAddress) + updatedEntry.Address = Constants.kInvalidAddress; + else + updatedEntry.Address = hlog.GetInfo(physicalAddress).PreviousAddress; + updatedEntry.Pending = entry.Pending; + updatedEntry.Tentative = false; + + if (entry.word == Interlocked.CompareExchange(ref bucket->bucket_entries[slot], updatedEntry.word, entry.word)) + { + // Apply tombstone bit to the record + hlog.GetInfo(physicalAddress).Tombstone = true; + + if (WriteDefaultOnDelete) + { + // Write default value + // Ignore return value, the record is already marked + Value v = default(Value); + functions.ConcurrentWriter(ref hlog.GetKey(physicalAddress), ref v, ref hlog.GetValue(physicalAddress)); + } + + status = OperationStatus.SUCCESS; + goto LatchRelease; // Release shared latch (if acquired) + } + } + } + + // Mutable Region: Update the record in-place + if (logicalAddress >= hlog.ReadOnlyAddress) + { + hlog.GetInfo(physicalAddress).Tombstone = true; + + if (WriteDefaultOnDelete) + { + // Write default value + // Ignore return value, the record is already marked + Value v = default(Value); + functions.ConcurrentWriter(ref hlog.GetKey(physicalAddress), ref v, ref hlog.GetValue(physicalAddress)); + } + + status = OperationStatus.SUCCESS; + goto LatchRelease; // Release shared latch (if acquired) + } + + // All other regions: Create a record in the mutable region + #endregion + + #region Create new record in the mutable region + CreateNewRecord: + { + var value = default(Value); + // Immutable region or new record + // Allocate default record size for tombstone + var recordSize = hlog.GetRecordSize(ref key, ref value); + BlockAllocate(recordSize, out long newLogicalAddress); + var newPhysicalAddress = hlog.GetPhysicalAddress(newLogicalAddress); + RecordInfo.WriteInfo(ref hlog.GetInfo(newPhysicalAddress), + threadCtx.Value.version, + true, true, false, + latestLogicalAddress); + hlog.ShallowCopy(ref key, ref hlog.GetKey(newPhysicalAddress)); + + var updatedEntry = default(HashBucketEntry); + updatedEntry.Tag = tag; + updatedEntry.Address = newLogicalAddress & Constants.kAddressMask; + updatedEntry.Pending = entry.Pending; + updatedEntry.Tentative = false; + + var foundEntry = default(HashBucketEntry); + foundEntry.word = Interlocked.CompareExchange( + ref bucket->bucket_entries[slot], + updatedEntry.word, entry.word); + + if (foundEntry.word == entry.word) + { + status = OperationStatus.SUCCESS; + goto LatchRelease; + } + else + { + 
hlog.GetInfo(newPhysicalAddress).Invalid = true; + status = OperationStatus.RETRY_NOW; + goto LatchRelease; + } + } + #endregion + + #region Create pending context + CreatePendingContext: + { + pendingContext.type = OperationType.DELETE; + pendingContext.key = hlog.GetKeyContainer(ref key); + pendingContext.userContext = userContext; + pendingContext.entry.word = entry.word; + pendingContext.logicalAddress = logicalAddress; + pendingContext.version = threadCtx.Value.version; + pendingContext.serialNum = threadCtx.Value.serialNum + 1; + } + #endregion + + #region Latch release + LatchRelease: + { + switch (latchOperation) + { + case LatchOperation.ReleaseShared: + HashBucket.ReleaseSharedLatch(bucket); + break; + case LatchOperation.ReleaseExclusive: + HashBucket.ReleaseExclusiveLatch(bucket); + break; + default: + break; + } + } + #endregion + + if (status == OperationStatus.RETRY_NOW) + { + return InternalDelete(ref key, ref userContext, ref pendingContext); + } + else + { + return status; + } + } + + #endregion + + #region ContainsKeyInMemory + /// + /// Experimental feature + /// Checks whether specified record is present in memory + /// (between HeadAddress and tail, or between fromAddress + /// and tail) + /// + /// Key of the record. + /// Look until this address + /// Status + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public Status ContainsKeyInMemory(ref Key key, long fromAddress = -1) + { + if (fromAddress == -1) + fromAddress = hlog.HeadAddress; + else + Debug.Assert(fromAddress >= hlog.HeadAddress); + + var bucket = default(HashBucket*); + var slot = default(int); + var logicalAddress = Constants.kInvalidAddress; + var physicalAddress = default(long); + var latestRecordVersion = -1; + + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + if (threadCtx.Value.phase != Phase.REST) + HeavyEnter(hash); + + HashBucketEntry entry = default(HashBucketEntry); + var tagExists = FindTag(hash, tag, ref bucket, ref slot, ref entry); + + if (tagExists) + { + logicalAddress = entry.Address; + + if (UseReadCache) + SkipReadCache(ref logicalAddress, ref latestRecordVersion); + + if (logicalAddress >= fromAddress) + { + physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + if (latestRecordVersion == -1) + latestRecordVersion = hlog.GetInfo(physicalAddress).Version; + + if (!comparer.Equals(ref key, ref hlog.GetKey(physicalAddress))) + { + logicalAddress = hlog.GetInfo(physicalAddress).PreviousAddress; + TraceBackForKeyMatch(ref key, + logicalAddress, + fromAddress, + out logicalAddress, + out physicalAddress); + } + + if (logicalAddress < fromAddress) + return Status.NOTFOUND; + else + return Status.OK; + } + else + return Status.NOTFOUND; + } + else + { + // no tag found + return Status.NOTFOUND; + } + } + #endregion + + #region Helper Functions + + /// + /// Performs appropriate handling based on the internal failure status of the trial. + /// + /// Thread (or session) context under which operation was tried to execute. + /// Internal context of the operation. + /// Internal status of the trial. + /// + /// + /// + /// Value + /// Description + /// + /// + /// OK + /// The operation has been completed successfully. + /// + /// + /// PENDING + /// The operation is still pending and will callback when done. 
+ /// + /// + /// + internal Status HandleOperationStatus( + FasterExecutionContext ctx, + PendingContext pendingContext, + OperationStatus status) + { + if (status == OperationStatus.CPR_SHIFT_DETECTED) + { + #region Epoch Synchronization + var version = ctx.version; + Debug.Assert(threadCtx.Value.version == version); + Debug.Assert(threadCtx.Value.phase == Phase.PREPARE); + Refresh(); + Debug.Assert(threadCtx.Value.version == version + 1); + Debug.Assert(threadCtx.Value.phase == Phase.IN_PROGRESS); + + pendingContext.version = threadCtx.Value.version; + #endregion + + #region Retry as (v+1) Operation + var internalStatus = default(OperationStatus); + switch (pendingContext.type) + { + case OperationType.READ: + internalStatus = InternalRead(ref pendingContext.key.Get(), + ref pendingContext.input, + ref pendingContext.output, + ref pendingContext.userContext, + ref pendingContext); + break; + case OperationType.UPSERT: + internalStatus = InternalUpsert(ref pendingContext.key.Get(), + ref pendingContext.value.Get(), + ref pendingContext.userContext, + ref pendingContext); + break; + case OperationType.DELETE: + internalStatus = InternalDelete(ref pendingContext.key.Get(), + ref pendingContext.userContext, + ref pendingContext); + break; + case OperationType.RMW: + internalStatus = InternalRetryPendingRMW(threadCtx.Value, ref pendingContext); + break; + } + + Debug.Assert(internalStatus != OperationStatus.CPR_SHIFT_DETECTED); + status = internalStatus; + #endregion + } + + if (status == OperationStatus.SUCCESS || status == OperationStatus.NOTFOUND) + { + return (Status)status; + } + else if (status == OperationStatus.RECORD_ON_DISK) + { + //Add context to dictionary + pendingContext.id = ctx.totalPending++; + ctx.ioPendingRequests.Add(pendingContext.id, pendingContext); + + // Issue asynchronous I/O request + AsyncIOContext request = default(AsyncIOContext); + request.id = pendingContext.id; + request.request_key = pendingContext.key; + request.logicalAddress = pendingContext.logicalAddress; + request.callbackQueue = ctx.readyResponses; + request.record = default(SectorAlignedMemory); + hlog.AsyncGetFromDisk(pendingContext.logicalAddress, + hlog.GetAverageRecordSize(), + request); + + return Status.PENDING; + } + else if (status == OperationStatus.RETRY_LATER) + { + ctx.retryRequests.Enqueue(pendingContext); + return Status.PENDING; + } + else + { + return Status.ERROR; + } + } + + private void AcquireSharedLatch(Key key) + { + var bucket = default(HashBucket*); + var slot = default(int); + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + var entry = default(HashBucketEntry); + FindOrCreateTag(hash, tag, ref bucket, ref slot, ref entry, hlog.BeginAddress); + HashBucket.TryAcquireSharedLatch(bucket); + } + + private void ReleaseSharedLatch(Key key) + { + var bucket = default(HashBucket*); + var slot = default(int); + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + var entry = default(HashBucketEntry); + FindOrCreateTag(hash, tag, ref bucket, ref slot, ref entry, hlog.BeginAddress); + HashBucket.ReleaseSharedLatch(bucket); + } + + private void HeavyEnter(long hash) + { + if (threadCtx.Value.phase == Phase.GC) + GarbageCollectBuckets(hash); + if (threadCtx.Value.phase == Phase.PREPARE_GROW) + { + // We spin-wait as a simplification + // Could instead do a "heavy operation" here + while (_systemState.phase != Phase.IN_PROGRESS_GROW) + Thread.SpinWait(100); + Refresh(); + } + 
if (threadCtx.Value.phase == Phase.IN_PROGRESS_GROW) + { + SplitBuckets(hash); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void BlockAllocate(int recordSize, out long logicalAddress) + { + while ((logicalAddress = hlog.TryAllocate(recordSize)) == 0) + { + InternalRefresh(); + Thread.Yield(); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void BlockAllocateReadCache(int recordSize, out long logicalAddress) + { + while ((logicalAddress = readcache.TryAllocate(recordSize)) == 0) + { + InternalRefresh(); + Thread.Yield(); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool TraceBackForKeyMatch( + ref Key key, + long fromLogicalAddress, + long minOffset, + out long foundLogicalAddress, + out long foundPhysicalAddress) + { + foundLogicalAddress = fromLogicalAddress; + while (foundLogicalAddress >= minOffset) + { + foundPhysicalAddress = hlog.GetPhysicalAddress(foundLogicalAddress); + if (comparer.Equals(ref key, ref hlog.GetKey(foundPhysicalAddress))) + { + return true; + } + else + { + foundLogicalAddress = hlog.GetInfo(foundPhysicalAddress).PreviousAddress; + Debug.WriteLine("Tracing back"); + continue; + } + } + foundPhysicalAddress = Constants.kInvalidAddress; + return false; + } + #endregion + + #region Garbage Collection + private long[] gcStatus; + private long numPendingChunksToBeGCed; + + private void GarbageCollectBuckets(long hash, bool force = false) + { + if (numPendingChunksToBeGCed == 0) + { + InternalRefresh(); + return; + } + + long masked_bucket_index = hash & state[resizeInfo.version].size_mask; + int offset = (int)(masked_bucket_index >> Constants.kSizeofChunkBits); + + int numChunks = (int)(state[resizeInfo.version].size / Constants.kSizeofChunk); + if (numChunks == 0) numChunks = 1; // at least one chunk + + if (!Utility.IsPowerOfTwo(numChunks)) + { + throw new Exception("Invalid number of chunks: " + numChunks); + } + + for (int i = offset; i < offset + numChunks; i++) + { + if (0 == Interlocked.CompareExchange(ref gcStatus[i & (numChunks - 1)], 1, 0)) + { + int version = resizeInfo.version; + long chunkSize = state[version].size / numChunks; + long ptr = chunkSize * (i & (numChunks - 1)); + + HashBucket* src_start = state[version].tableAligned + ptr; + // CleanBucket(src_start, chunkSize); + + // GC for chunk is done + gcStatus[i & (numChunks - 1)] = 2; + + if (Interlocked.Decrement(ref numPendingChunksToBeGCed) == 0) + { + long context = 0; + GlobalMoveToNextState(_systemState, SystemState.Make(Phase.REST, _systemState.version), ref context); + return; + } + if (!force) + break; + + InternalRefresh(); + } + } + } + + private void CleanBucket(HashBucket* _src_start, long chunkSize) + { + HashBucketEntry entry = default(HashBucketEntry); + + for (int i = 0; i < chunkSize; i++) + { + var src_start = _src_start + i; + + do + { + for (int index = 0; index < Constants.kOverflowBucketIndex; ++index) + { + entry.word = *(((long*)src_start) + index); + if (entry.Address != Constants.kInvalidAddress && entry.Address != Constants.kTempInvalidAddress && entry.Address < hlog.BeginAddress) + { + Interlocked.CompareExchange(ref *(((long*)src_start) + index), Constants.kInvalidAddress, entry.word); + } + } + + if (*(((long*)src_start) + Constants.kOverflowBucketIndex) == 0) break; + src_start = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(*(((long*)src_start) + Constants.kOverflowBucketIndex)); + } while (true); + } + } + #endregion + + #region Split Index + private void SplitBuckets(long hash) 
+ { + long masked_bucket_index = hash & state[1 - resizeInfo.version].size_mask; + int offset = (int)(masked_bucket_index >> Constants.kSizeofChunkBits); + + int numChunks = (int)(state[1 - resizeInfo.version].size / Constants.kSizeofChunk); + if (numChunks == 0) numChunks = 1; // at least one chunk + + + if (!Utility.IsPowerOfTwo(numChunks)) + { + throw new Exception("Invalid number of chunks: " + numChunks); + } + for (int i = offset; i < offset + numChunks; i++) + { + if (0 == Interlocked.CompareExchange(ref splitStatus[i & (numChunks - 1)], 1, 0)) + { + long chunkSize = state[1 - resizeInfo.version].size / numChunks; + long ptr = chunkSize * (i & (numChunks - 1)); + + HashBucket* src_start = state[1 - resizeInfo.version].tableAligned + ptr; + HashBucket* dest_start0 = state[resizeInfo.version].tableAligned + ptr; + HashBucket* dest_start1 = state[resizeInfo.version].tableAligned + state[1 - resizeInfo.version].size + ptr; + + SplitChunk(src_start, dest_start0, dest_start1, chunkSize); + + // split for chunk is done + splitStatus[i & (numChunks - 1)] = 2; + + if (Interlocked.Decrement(ref numPendingChunksToBeSplit) == 0) + { + // GC old version of hash table + state[1 - resizeInfo.version] = default(InternalHashTable); + + long context = 0; + GlobalMoveToNextState(_systemState, SystemState.Make(Phase.REST, _systemState.version), ref context); + return; + } + break; + } + } + + while (Interlocked.Read(ref splitStatus[offset & (numChunks - 1)]) == 1) + { + + } + + } + + private void SplitChunk( + HashBucket* _src_start, + HashBucket* _dest_start0, + HashBucket* _dest_start1, + long chunkSize) + { + for (int i = 0; i < chunkSize; i++) + { + var src_start = _src_start + i; + + long* left = (long*)(_dest_start0 + i); + long* right = (long*)(_dest_start1 + i); + long* left_end = left + Constants.kOverflowBucketIndex; + long* right_end = right + Constants.kOverflowBucketIndex; + + HashBucketEntry entry = default(HashBucketEntry); + do + { + for (int index = 0; index < Constants.kOverflowBucketIndex; ++index) + { + entry.word = *(((long*)src_start) + index); + if (Constants.kInvalidEntry == entry.word) + { + continue; + } + + var logicalAddress = entry.Address; + if (logicalAddress >= hlog.HeadAddress) + { + var physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + var hash = comparer.GetHashCode64(ref hlog.GetKey(physicalAddress)); + if ((hash & state[resizeInfo.version].size_mask) >> (state[resizeInfo.version].size_bits - 1) == 0) + { + // Insert in left + if (left == left_end) + { + var new_bucket = (HashBucket*)overflowBucketsAllocator.Allocate(); + *left = (long)new_bucket; + left = (long*)new_bucket; + left_end = left + Constants.kOverflowBucketIndex; + } + + *left = entry.word; + left++; + + // Insert previous address in right + entry.Address = TraceBackForOtherChainStart(hlog.GetInfo(physicalAddress).PreviousAddress, 1); + if (entry.Address != Constants.kInvalidAddress) + { + if (right == right_end) + { + var new_bucket = (HashBucket*)overflowBucketsAllocator.Allocate(); + *right = (long)new_bucket; + right = (long*)new_bucket; + right_end = right + Constants.kOverflowBucketIndex; + } + + *right = entry.word; + right++; + } + } + else + { + // Insert in right + if (right == right_end) + { + var new_bucket = (HashBucket*)overflowBucketsAllocator.Allocate(); + *right = (long)new_bucket; + right = (long*)new_bucket; + right_end = right + Constants.kOverflowBucketIndex; + } + + *right = entry.word; + right++; + + // Insert previous address in left + entry.Address = 
TraceBackForOtherChainStart(hlog.GetInfo(physicalAddress).PreviousAddress, 0); + if (entry.Address != Constants.kInvalidAddress) + { + if (left == left_end) + { + var new_bucket = (HashBucket*)overflowBucketsAllocator.Allocate(); + *left = (long)new_bucket; + left = (long*)new_bucket; + left_end = left + Constants.kOverflowBucketIndex; + } + + *left = entry.word; + left++; + } + } + } + else + { + // Insert in both new locations + + // Insert in left + if (left == left_end) + { + var new_bucket = (HashBucket*)overflowBucketsAllocator.Allocate(); + *left = (long)new_bucket; + left = (long*)new_bucket; + left_end = left + Constants.kOverflowBucketIndex; + } + + *left = entry.word; + left++; + + // Insert in right + if (right == right_end) + { + var new_bucket = (HashBucket*)overflowBucketsAllocator.Allocate(); + *right = (long)new_bucket; + right = (long*)new_bucket; + right_end = right + Constants.kOverflowBucketIndex; + } + + *right = entry.word; + right++; + } + } + + if (*(((long*)src_start) + Constants.kOverflowBucketIndex) == 0) break; + src_start = (HashBucket*)overflowBucketsAllocator.GetPhysicalAddress(*(((long*)src_start) + Constants.kOverflowBucketIndex)); + } while (true); + } + } + + private long TraceBackForOtherChainStart(long logicalAddress, int bit) + { + while (logicalAddress >= hlog.HeadAddress) + { + var physicalAddress = hlog.GetPhysicalAddress(logicalAddress); + var hash = comparer.GetHashCode64(ref hlog.GetKey(physicalAddress)); + if ((hash & state[resizeInfo.version].size_mask) >> (state[resizeInfo.version].size_bits - 1) == bit) + { + return logicalAddress; + } + logicalAddress = hlog.GetInfo(physicalAddress).PreviousAddress; + } + return logicalAddress; + } + #endregion + + #region Read Cache + private bool ReadFromCache(ref Key key, ref long logicalAddress, ref long physicalAddress, ref int latestRecordVersion) + { + HashBucketEntry entry = default(HashBucketEntry); + entry.word = logicalAddress; + if (!entry.ReadCache) return false; + + physicalAddress = readcache.GetPhysicalAddress(logicalAddress & ~Constants.kReadCacheBitMask); + latestRecordVersion = readcache.GetInfo(physicalAddress).Version; + + while (true) + { + if (!readcache.GetInfo(physicalAddress).Invalid && comparer.Equals(ref key, ref readcache.GetKey(physicalAddress))) + { + if ((logicalAddress & ~Constants.kReadCacheBitMask) >= readcache.SafeReadOnlyAddress) + { + return true; + } + Debug.Assert((logicalAddress & ~Constants.kReadCacheBitMask) >= readcache.SafeHeadAddress); + // TODO: copy to tail of read cache + // and return new cache entry + } + + logicalAddress = readcache.GetInfo(physicalAddress).PreviousAddress; + entry.word = logicalAddress; + if (!entry.ReadCache) break; + physicalAddress = readcache.GetPhysicalAddress(logicalAddress & ~Constants.kReadCacheBitMask); + } + physicalAddress = 0; + return false; + } + + private void SkipReadCache(ref long logicalAddress, ref int latestRecordVersion) + { + HashBucketEntry entry = default(HashBucketEntry); + entry.word = logicalAddress; + if (!entry.ReadCache) return; + + var physicalAddress = readcache.GetPhysicalAddress(logicalAddress & ~Constants.kReadCacheBitMask); + latestRecordVersion = readcache.GetInfo(physicalAddress).Version; + + while (true) + { + logicalAddress = readcache.GetInfo(physicalAddress).PreviousAddress; + entry.word = logicalAddress; + if (!entry.ReadCache) return; + physicalAddress = readcache.GetPhysicalAddress(logicalAddress & ~Constants.kReadCacheBitMask); + } + } + + private void SkipAndInvalidateReadCache(ref long 
logicalAddress, ref int latestRecordVersion, ref Key key) + { + HashBucketEntry entry = default(HashBucketEntry); + entry.word = logicalAddress; + if (!entry.ReadCache) return; + + var physicalAddress = readcache.GetPhysicalAddress(logicalAddress & ~Constants.kReadCacheBitMask); + latestRecordVersion = readcache.GetInfo(physicalAddress).Version; + + while (true) + { + // Invalidate read cache entry if key found + if (comparer.Equals(ref key, ref readcache.GetKey(physicalAddress))) + { + readcache.GetInfo(physicalAddress).Invalid = true; + } + + logicalAddress = readcache.GetInfo(physicalAddress).PreviousAddress; + entry.word = logicalAddress; + if (!entry.ReadCache) return; + physicalAddress = readcache.GetPhysicalAddress(logicalAddress & ~Constants.kReadCacheBitMask); + } + } + + private void ReadCacheEvict(long fromHeadAddress, long toHeadAddress) + { + var bucket = default(HashBucket*); + var slot = default(int); + var logicalAddress = Constants.kInvalidAddress; + var physicalAddress = default(long); + + HashBucketEntry entry = default(HashBucketEntry); + logicalAddress = fromHeadAddress; + + while (logicalAddress < toHeadAddress) + { + physicalAddress = readcache.GetPhysicalAddress(logicalAddress); + var recordSize = readcache.GetRecordSize(physicalAddress); + ref RecordInfo info = ref readcache.GetInfo(physicalAddress); + if (!info.Invalid) + { + ref Key key = ref readcache.GetKey(physicalAddress); + entry.word = info.PreviousAddress; + if (!entry.ReadCache) + { + var hash = comparer.GetHashCode64(ref key); + var tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + entry = default(HashBucketEntry); + var tagExists = FindTag(hash, tag, ref bucket, ref slot, ref entry); + while (tagExists && entry.ReadCache) + { + var updatedEntry = default(HashBucketEntry); + updatedEntry.Tag = tag; + updatedEntry.Address = info.PreviousAddress; + updatedEntry.Pending = entry.Pending; + updatedEntry.Tentative = false; + + if (entry.word == Interlocked.CompareExchange + (ref bucket->bucket_entries[slot], updatedEntry.word, entry.word)) + break; + + tagExists = FindTag(hash, tag, ref bucket, ref slot, ref entry); + } + } + } + logicalAddress += recordSize; + if ((logicalAddress & readcache.PageSizeMask) + recordSize > readcache.PageSize) + { + logicalAddress = (1 + (logicalAddress >> readcache.LogPageSizeBits)) << readcache.LogPageSizeBits; + continue; + } + } + } + #endregion + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FASTER/FASTERThread.cs b/ZeroLevel/Services/FASTER/Index/FASTER/FASTERThread.cs new file mode 100644 index 0000000..26cdea7 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FASTER/FASTERThread.cs @@ -0,0 +1,364 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
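The methods in this file back FasterKV's per-thread session protocol. A minimal caller-side sketch, assuming the public wrappers StartSession / Refresh / CompletePending / StopSession in FASTER.cs delegate to InternalAcquire / InternalRefresh / InternalCompletePending / InternalRelease (StartSession, Refresh and StopSession appear elsewhere in this patch; CompletePending is an assumption):

    Guid session = fht.StartSession();                    // InternalAcquire: join epoch, create thread context
    fht.Upsert(ref key, ref value, default(Context), 0);  // operations run under the session
    fht.Refresh();                                        // InternalRefresh: cooperate with checkpoint/grow phases
    fht.CompletePending(true);                            // InternalCompletePending: drain retry and pending-IO queues
    fht.StopSession();                                    // InternalRelease: leave epoch, dispose thread context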
+ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.core +{ + public unsafe partial class FasterKV : FasterBase, IFasterKV + where Key : new() + where Value : new() + where Functions : IFunctions + { + internal Guid InternalAcquire() + { + epoch.Acquire(); + overflowBucketsAllocator.Acquire(); + threadCtx.InitializeThread(); + prevThreadCtx.InitializeThread(); + Phase phase = _systemState.phase; + if (phase != Phase.REST) + { + throw new Exception("Can acquire only in REST phase!"); + } + Guid guid = Guid.NewGuid(); + InitLocalContext(guid); + InternalRefresh(); + return threadCtx.Value.guid; + } + + internal long InternalContinue(Guid guid) + { + epoch.Acquire(); + overflowBucketsAllocator.Acquire(); + threadCtx.InitializeThread(); + prevThreadCtx.InitializeThread(); + if (_recoveredSessions != null) + { + if (_recoveredSessions.TryGetValue(guid, out long serialNum)) + { + // We have recovered the corresponding session. + // Now obtain the session by first locking the rest phase + var currentState = SystemState.Copy(ref _systemState); + if(currentState.phase == Phase.REST) + { + var intermediateState = SystemState.Make(Phase.INTERMEDIATE, currentState.version); + if(MakeTransition(currentState,intermediateState)) + { + // No one can change from REST phase + if(_recoveredSessions.TryRemove(guid, out serialNum)) + { + // We have atomically removed session details. + // No one else can continue this session + InitLocalContext(guid); + threadCtx.Value.serialNum = serialNum; + InternalRefresh(); + } + else + { + // Someone else continued this session + serialNum = -1; + Debug.WriteLine("Session already continued by another thread!"); + } + + MakeTransition(intermediateState, currentState); + return serialNum; + } + } + + // Need to try again when in REST + Debug.WriteLine("Can continue only in REST phase"); + return -1; + } + } + + Debug.WriteLine("No recovered sessions!"); + return -1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal void InternalRefresh() + { + epoch.ProtectAndDrain(); + + // We check if we are in normal mode + var newPhaseInfo = SystemState.Copy(ref _systemState); + if (threadCtx.Value.phase == Phase.REST && newPhaseInfo.phase == Phase.REST) + { + return; + } + + // Moving to non-checkpointing phases + if (newPhaseInfo.phase == Phase.GC || newPhaseInfo.phase == Phase.PREPARE_GROW || newPhaseInfo.phase == Phase.IN_PROGRESS_GROW) + { + threadCtx.Value.phase = newPhaseInfo.phase; + return; + } + + HandleCheckpointingPhases(); + } + + internal void InternalRelease() + { + Debug.Assert(threadCtx.Value.retryRequests.Count == 0 && + threadCtx.Value.ioPendingRequests.Count == 0); + if (prevThreadCtx.Value != default(FasterExecutionContext)) + { + Debug.Assert(prevThreadCtx.Value.retryRequests.Count == 0 && + prevThreadCtx.Value.ioPendingRequests.Count == 0); + } + Debug.Assert(threadCtx.Value.phase == Phase.REST); + threadCtx.DisposeThread(); + prevThreadCtx.DisposeThread(); + epoch.Release(); + overflowBucketsAllocator.Release(); + } + + internal void InitLocalContext(Guid token) + { + var ctx = + new FasterExecutionContext + { + phase = Phase.REST, + version = _systemState.version, + markers = new bool[8], + serialNum = 0, + totalPending = 0, + guid = token, + retryRequests = new Queue(), + readyResponses = new BlockingCollection>(), + ioPendingRequests = new Dictionary() + }; + + for(int i = 
0; i < 8; i++) + { + ctx.markers[i] = false; + } + + threadCtx.Value = ctx; + } + + internal bool InternalCompletePending(bool wait = false) + { + do + { + bool done = true; + + #region Previous pending requests + if (threadCtx.Value.phase == Phase.IN_PROGRESS + || + threadCtx.Value.phase == Phase.WAIT_PENDING) + { + CompleteIOPendingRequests(prevThreadCtx.Value); + Refresh(); + CompleteRetryRequests(prevThreadCtx.Value); + + done &= (prevThreadCtx.Value.ioPendingRequests.Count == 0); + done &= (prevThreadCtx.Value.retryRequests.Count == 0); + } + #endregion + + if (!(threadCtx.Value.phase == Phase.IN_PROGRESS + || + threadCtx.Value.phase == Phase.WAIT_PENDING)) + { + CompleteIOPendingRequests(threadCtx.Value); + } + InternalRefresh(); + CompleteRetryRequests(threadCtx.Value); + + done &= (threadCtx.Value.ioPendingRequests.Count == 0); + done &= (threadCtx.Value.retryRequests.Count == 0); + + if (done) + { + return true; + } + + if (wait) + { + // Yield before checking again + Thread.Yield(); + } + } while (wait); + + return false; + } + + internal void CompleteRetryRequests(FasterExecutionContext context) + { + int count = context.retryRequests.Count; + for (int i = 0; i < count; i++) + { + var pendingContext = context.retryRequests.Dequeue(); + InternalRetryRequestAndCallback(context, pendingContext); + } + } + + internal void CompleteIOPendingRequests(FasterExecutionContext context) + { + if (context.readyResponses.Count == 0) return; + + while (context.readyResponses.TryTake(out AsyncIOContext request)) + { + InternalContinuePendingRequestAndCallback(context, request); + } + } + + internal void InternalRetryRequestAndCallback( + FasterExecutionContext ctx, + PendingContext pendingContext) + { + var status = default(Status); + var internalStatus = default(OperationStatus); + ref Key key = ref pendingContext.key.Get(); + ref Value value = ref pendingContext.value.Get(); + + #region Entry latch operation + var handleLatches = false; + if ((ctx.version < threadCtx.Value.version) // Thread has already shifted to (v+1) + || + (threadCtx.Value.phase == Phase.PREPARE)) // Thread still in version v, but acquired shared-latch + { + handleLatches = true; + } + #endregion + + // Issue retry command + switch(pendingContext.type) + { + case OperationType.RMW: + internalStatus = InternalRetryPendingRMW(ctx, ref pendingContext); + break; + case OperationType.UPSERT: + internalStatus = InternalUpsert(ref key, + ref value, + ref pendingContext.userContext, + ref pendingContext); + break; + case OperationType.DELETE: + internalStatus = InternalDelete(ref key, + ref pendingContext.userContext, + ref pendingContext); + break; + case OperationType.READ: + throw new Exception("Cannot happen!"); + } + + + // Handle operation status + if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND) + { + status = (Status)internalStatus; + } + else + { + status = HandleOperationStatus(ctx, pendingContext, internalStatus); + } + + // If done, callback user code. 
+ if (status == Status.OK || status == Status.NOTFOUND) + { + if (handleLatches) + ReleaseSharedLatch(key); + + switch (pendingContext.type) + { + case OperationType.RMW: + functions.RMWCompletionCallback(ref key, + ref pendingContext.input, + pendingContext.userContext, status); + break; + case OperationType.UPSERT: + functions.UpsertCompletionCallback(ref key, + ref value, + pendingContext.userContext); + break; + case OperationType.DELETE: + functions.DeleteCompletionCallback(ref key, + pendingContext.userContext); + break; + default: + throw new Exception("Operation type not allowed for retry"); + } + + } + } + + internal void InternalContinuePendingRequestAndCallback( + FasterExecutionContext ctx, + AsyncIOContext request) + { + var handleLatches = false; + if ((ctx.version < threadCtx.Value.version) // Thread has already shifted to (v+1) + || + (threadCtx.Value.phase == Phase.PREPARE)) // Thread still in version v, but acquired shared-latch + { + handleLatches = true; + } + + if (ctx.ioPendingRequests.TryGetValue(request.id, out PendingContext pendingContext)) + { + var status = default(Status); + var internalStatus = default(OperationStatus); + ref Key key = ref pendingContext.key.Get(); + + // Remove from pending dictionary + ctx.ioPendingRequests.Remove(request.id); + + // Issue the continue command + if (pendingContext.type == OperationType.READ) + { + internalStatus = InternalContinuePendingRead(ctx, request, ref pendingContext); + } + else + { + internalStatus = InternalContinuePendingRMW(ctx, request, ref pendingContext); ; + } + + request.Dispose(); + + // Handle operation status + if (internalStatus == OperationStatus.SUCCESS || internalStatus == OperationStatus.NOTFOUND) + { + status = (Status)internalStatus; + } + else + { + status = HandleOperationStatus(ctx, pendingContext, internalStatus); + } + + // If done, callback user code + if(status == Status.OK || status == Status.NOTFOUND) + { + if (handleLatches) + ReleaseSharedLatch(key); + + if (pendingContext.type == OperationType.READ) + { + functions.ReadCompletionCallback(ref key, + ref pendingContext.input, + ref pendingContext.output, + pendingContext.userContext, + status); + } + else + { + functions.RMWCompletionCallback(ref key, + ref pendingContext.input, + pendingContext.userContext, + status); + } + } + pendingContext.Dispose(); + } + } + + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FASTER/LogAccessor.cs b/ZeroLevel/Services/FASTER/Index/FASTER/LogAccessor.cs new file mode 100644 index 0000000..0bbb560 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FASTER/LogAccessor.cs @@ -0,0 +1,347 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma warning disable 0162 + + +using System; + +namespace FASTER.core +{ + /// + /// Wrapper to process log-related commands + /// + /// + /// + /// + /// + /// + public class LogAccessor : IObservable> + where Key : new() + where Value : new() + { + private readonly IFasterKV fht; + private readonly AllocatorBase allocator; + + /// + /// Constructor + /// + /// + /// + public LogAccessor(IFasterKV fht, AllocatorBase allocator) + { + this.fht = fht; + this.allocator = allocator; + } + + /// + /// Tail address of log + /// + public long TailAddress => allocator.GetTailAddress(); + + /// + /// Read-only address of log, i.e. boundary between read-only region and mutable region + /// + public long ReadOnlyAddress => allocator.ReadOnlyAddress; + + /// + /// Safe read-only address of log, i.e. 
boundary between read-only region and mutable region + /// + public long SafeReadOnlyAddress => allocator.SafeReadOnlyAddress; + + /// + /// Head address of log, i.e. beginning of in-memory regions + /// + public long HeadAddress => allocator.HeadAddress; + + /// + /// Beginning address of log + /// + public long BeginAddress => allocator.BeginAddress; + + /// + /// Truncate the log until, but not including, untilAddress + /// + /// + public void ShiftBeginAddress(long untilAddress) + { + allocator.ShiftBeginAddress(untilAddress); + } + + + /// + /// Shift log head address to prune memory foorprint of hybrid log + /// + /// Address to shift head until + /// Wait to ensure shift is registered (may involve page flushing) + /// When wait is false, this tells whether the shift to newHeadAddress was successfully registered with FASTER + public bool ShiftHeadAddress(long newHeadAddress, bool wait) + { + // First shift read-only + ShiftReadOnlyAddress(newHeadAddress, wait); + + // Then shift head address + var updatedHeadAddress = allocator.ShiftHeadAddress(newHeadAddress); + + return updatedHeadAddress >= newHeadAddress; + } + + /// + /// Subscribe to records (in batches) as they become read-only in the log + /// Currently, we support only one subscriber to the log (easy to extend) + /// Subscriber only receives new log updates from the time of subscription onwards + /// To scan the historical part of the log, use the Scan(...) method + /// + /// Observer to which scan iterator is pushed + public IDisposable Subscribe(IObserver> readOnlyObserver) + { + allocator.OnReadOnlyObserver = readOnlyObserver; + return new LogSubscribeDisposable(allocator); + } + + /// + /// Wrapper to help dispose the subscription + /// + class LogSubscribeDisposable : IDisposable + { + private AllocatorBase allocator; + + public LogSubscribeDisposable(AllocatorBase allocator) + { + this.allocator = allocator; + } + + public void Dispose() + { + allocator.OnReadOnlyObserver = null; + } + } + + /// + /// Shift log read-only address + /// + /// Address to shift read-only until + /// Wait to ensure shift is complete (may involve page flushing) + public void ShiftReadOnlyAddress(long newReadOnlyAddress, bool wait) + { + allocator.ShiftReadOnlyAddress(newReadOnlyAddress); + + // Wait for flush to complete + while (wait && allocator.FlushedUntilAddress < newReadOnlyAddress) + fht.Refresh(); + } + + /// + /// Scan the log given address range + /// + /// + /// + /// + /// + public IFasterScanIterator Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode = ScanBufferingMode.DoublePageBuffering) + { + return allocator.Scan(beginAddress, endAddress, scanBufferingMode); + } + + /// + /// Flush log until current tail (records are still retained in memory) + /// + /// Synchronous wait for operation to complete + public void Flush(bool wait) + { + ShiftReadOnlyAddress(allocator.GetTailAddress(), wait); + } + + /// + /// Flush log and evict all records from memory + /// + /// Synchronous wait for operation to complete + /// When wait is false, this tells whether the full eviction was successfully registered with FASTER + public bool FlushAndEvict(bool wait) + { + return ShiftHeadAddress(allocator.GetTailAddress(), wait); + } + + /// + /// Delete log entirely from memory. Cannot allocate on the log + /// after this point. This is a synchronous operation. 
+ /// + public void DisposeFromMemory() + { + // Ensure we have flushed and evicted + FlushAndEvict(true); + + // Delete from memory + allocator.DeleteFromMemory(); + } + + /// + /// Compact the log until specified address, moving active + /// records to the tail of the log + /// + /// + public void Compact(long untilAddress) + { + var variableLengthStructSettings = default(VariableLengthStructSettings); + if (allocator is VariableLengthBlittableAllocator varLen) + { + var functions = new LogVariableCompactFunctions(varLen); + variableLengthStructSettings = new VariableLengthStructSettings + { + keyLength = varLen.KeyLength, + valueLength = varLen.ValueLength, + }; + + Compact(functions, untilAddress, variableLengthStructSettings); + } + else + { + Compact(new LogCompactFunctions(), untilAddress, null); + } + } + + private void Compact(T functions, long untilAddress, VariableLengthStructSettings variableLengthStructSettings) + where T : IFunctions + { + var originalUntilAddress = untilAddress; + + var tempKv = new FasterKV + (fht.IndexSize, functions, new LogSettings(), comparer: fht.Comparer, variableLengthStructSettings: variableLengthStructSettings); + tempKv.StartSession(); + + int cnt = 0; + + using (var iter1 = fht.Log.Scan(fht.Log.BeginAddress, untilAddress)) + { + while (iter1.GetNext(out RecordInfo recordInfo)) + { + ref var key = ref iter1.GetKey(); + ref var value = ref iter1.GetValue(); + + if (recordInfo.Tombstone) + tempKv.Delete(ref key, default(Context), 0); + else + tempKv.Upsert(ref key, ref value, default(Context), 0); + + if (++cnt % 1000 == 0) + { + fht.Refresh(); + tempKv.Refresh(); + } + } + } + + // TODO: Scan until SafeReadOnlyAddress + long scanUntil = untilAddress; + LogScanForValidity(ref untilAddress, ref scanUntil, ref tempKv); + + // Make sure key wasn't inserted between SafeReadOnlyAddress and TailAddress + + cnt = 0; + using (var iter3 = tempKv.Log.Scan(tempKv.Log.BeginAddress, tempKv.Log.TailAddress)) + { + while (iter3.GetNext(out RecordInfo recordInfo)) + { + ref var key = ref iter3.GetKey(); + ref var value = ref iter3.GetValue(); + + if (!recordInfo.Tombstone) + { + if (fht.ContainsKeyInMemory(ref key, scanUntil) == Status.NOTFOUND) + fht.Upsert(ref key, ref value, default(Context), 0); + } + if (++cnt % 1000 == 0) + { + fht.Refresh(); + tempKv.Refresh(); + } + if (scanUntil < fht.Log.SafeReadOnlyAddress) + { + LogScanForValidity(ref untilAddress, ref scanUntil, ref tempKv); + } + } + } + tempKv.StopSession(); + tempKv.Dispose(); + + ShiftBeginAddress(originalUntilAddress); + } + + private void LogScanForValidity(ref long untilAddress, ref long scanUntil, ref FasterKV tempKv) + where T : IFunctions + { + while (scanUntil < fht.Log.SafeReadOnlyAddress) + { + untilAddress = scanUntil; + scanUntil = fht.Log.SafeReadOnlyAddress; + int cnt = 0; + using (var iter2 = fht.Log.Scan(untilAddress, scanUntil)) + { + while (iter2.GetNext(out RecordInfo recordInfo)) + { + ref var key = ref iter2.GetKey(); + ref var value = ref iter2.GetValue(); + + tempKv.Delete(ref key, default(Context), 0); + + if (++cnt % 1000 == 0) + { + fht.Refresh(); + tempKv.Refresh(); + } + } + } + fht.Refresh(); + } + } + + private class LogVariableCompactFunctions : IFunctions + { + private VariableLengthBlittableAllocator allocator; + + public LogVariableCompactFunctions(VariableLengthBlittableAllocator allocator) + { + this.allocator = allocator; + } + + public void CheckpointCompletionCallback(Guid sessionId, long serialNum) { } + public void ConcurrentReader(ref Key key, ref Input 
input, ref Value value, ref Output dst) { } + public bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst) + { + var srcLength = allocator.ValueLength.GetLength(ref src); + var dstLength = allocator.ValueLength.GetLength(ref dst); + + if (srcLength != dstLength) + return false; + + allocator.ShallowCopy(ref src, ref dst); + return true; + } + public void CopyUpdater(ref Key key, ref Input input, ref Value oldValue, ref Value newValue) { } + public void InitialUpdater(ref Key key, ref Input input, ref Value value) { } + public bool InPlaceUpdater(ref Key key, ref Input input, ref Value value) => false; + public void ReadCompletionCallback(ref Key key, ref Input input, ref Output output, Context ctx, Status status) { } + public void RMWCompletionCallback(ref Key key, ref Input input, Context ctx, Status status) { } + public void SingleReader(ref Key key, ref Input input, ref Value value, ref Output dst) { } + public void SingleWriter(ref Key key, ref Value src, ref Value dst) { allocator.ShallowCopy(ref src, ref dst); } + public void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx) { } + public void DeleteCompletionCallback(ref Key key, Context ctx) { } + } + + private class LogCompactFunctions : IFunctions + { + public void CheckpointCompletionCallback(Guid sessionId, long serialNum) { } + public void ConcurrentReader(ref Key key, ref Input input, ref Value value, ref Output dst) { } + public bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst) { dst = src; return true; } + public void CopyUpdater(ref Key key, ref Input input, ref Value oldValue, ref Value newValue) { } + public void InitialUpdater(ref Key key, ref Input input, ref Value value) { } + public bool InPlaceUpdater(ref Key key, ref Input input, ref Value value) { return true; } + public void ReadCompletionCallback(ref Key key, ref Input input, ref Output output, Context ctx, Status status) { } + public void RMWCompletionCallback(ref Key key, ref Input input, Context ctx, Status status) { } + public void SingleReader(ref Key key, ref Input input, ref Value value, ref Output dst) { } + public void SingleWriter(ref Key key, ref Value src, ref Value dst) { dst = src; } + public void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx) { } + public void DeleteCompletionCallback(ref Key key, Context ctx) { } + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FasterLog/CommitFailureException.cs b/ZeroLevel/Services/FASTER/Index/FasterLog/CommitFailureException.cs new file mode 100644 index 0000000..c637480 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FasterLog/CommitFailureException.cs @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
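A minimal sketch of the LogAccessor surface defined above, reached through fht.Log (the address choices are illustrative; Scan, Flush, FlushAndEvict and Compact are taken directly from the accessor in this patch):

    // Scan the stable (safe read-only) prefix of the hybrid log
    using (var iter = fht.Log.Scan(fht.Log.BeginAddress, fht.Log.SafeReadOnlyAddress))
    {
        while (iter.GetNext(out RecordInfo recordInfo))
        {
            ref var key = ref iter.GetKey();
            ref var value = ref iter.GetValue();
            // process record; recordInfo.Tombstone marks deleted entries
        }
    }

    fht.Log.Flush(true);                            // flush tail to device, keep records in memory
    fht.Log.FlushAndEvict(true);                    // flush and evict all in-memory records
    fht.Log.Compact(fht.Log.SafeReadOnlyAddress);   // copy live records to the tail, then ShiftBeginAddress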
+ +#pragma warning disable 0162 + +using System; +using System.Threading.Tasks; + +namespace FASTER.core +{ + /// + /// Exception thrown when commit fails + /// + public class CommitFailureException : Exception + { + /// + /// Commit info and next commit task in chain + /// + public LinkedCommitInfo LinkedCommitInfo { get; private set; } + + internal CommitFailureException(LinkedCommitInfo linkedCommitInfo, string message) + : base(message) + => LinkedCommitInfo = linkedCommitInfo; + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FasterLog/CommitInfo.cs b/ZeroLevel/Services/FASTER/Index/FasterLog/CommitInfo.cs new file mode 100644 index 0000000..70401ed --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FasterLog/CommitInfo.cs @@ -0,0 +1,51 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma warning disable 0162 + +using System.Threading.Tasks; + +namespace FASTER.core +{ + /// + /// Info contained in task associated with commit + /// + public struct CommitInfo + { + /// + /// Begin address + /// + public long BeginAddress; + + /// + /// From address of commit range + /// + public long FromAddress; + + /// + /// Until address of commit range + /// + public long UntilAddress; + + /// + /// Error code (0 = success) + /// + public uint ErrorCode; + } + + /// + /// Linked list (chain) of commit info + /// + public struct LinkedCommitInfo + { + /// + /// Commit info + /// + public CommitInfo CommitInfo; + + /// + /// Next task in commit chain + /// + public Task NextTask; + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLog.cs b/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLog.cs new file mode 100644 index 0000000..3a46cfb --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLog.cs @@ -0,0 +1,941 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma warning disable 0162 + +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Runtime.CompilerServices; +using System.Threading; +using System.Threading.Tasks; + +namespace FASTER.core +{ + /// + /// FASTER log + /// + public class FasterLog : IDisposable + { + private readonly BlittableAllocator allocator; + private readonly LightEpoch epoch; + private readonly ILogCommitManager logCommitManager; + private readonly GetMemory getMemory; + private readonly int headerSize; + private readonly LogChecksumType logChecksum; + private readonly Dictionary RecoveredIterators; + private TaskCompletionSource commitTcs + = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + + /// + /// Beginning address of log + /// + public long BeginAddress => allocator.BeginAddress; + + /// + /// Tail address of log + /// + public long TailAddress => allocator.GetTailAddress(); + + /// + /// Log flushed until address + /// + public long FlushedUntilAddress => allocator.FlushedUntilAddress; + + /// + /// Log committed until address + /// + public long CommittedUntilAddress; + + /// + /// Log committed begin address + /// + public long CommittedBeginAddress; + + /// + /// Task notifying commit completions + /// + internal Task CommitTask => commitTcs.Task; + + /// + /// Create new log instance + /// + /// + public FasterLog(FasterLogSettings logSettings) + { + logCommitManager = logSettings.LogCommitManager ?? + new LocalLogCommitManager(logSettings.LogCommitFile ?? 
+ logSettings.LogDevice.FileName + ".commit"); + + // Reserve 8 byte checksum in header if requested + logChecksum = logSettings.LogChecksum; + headerSize = logChecksum == LogChecksumType.PerEntry ? 12 : 4; + getMemory = logSettings.GetMemory; + epoch = new LightEpoch(); + CommittedUntilAddress = Constants.kFirstValidAddress; + CommittedBeginAddress = Constants.kFirstValidAddress; + + allocator = new BlittableAllocator( + logSettings.GetLogSettings(), null, + null, epoch, CommitCallback); + allocator.Initialize(); + Restore(out RecoveredIterators); + } + + /// + /// Dispose + /// + public void Dispose() + { + allocator.Dispose(); + epoch.Dispose(); + commitTcs.TrySetException(new ObjectDisposedException("Log has been disposed")); + } + + #region Enqueue + /// + /// Enqueue entry to log (in memory) - no guarantee of flush/commit + /// + /// Entry to be enqueued to log + /// Logical address of added entry + public long Enqueue(byte[] entry) + { + long logicalAddress; + while (!TryEnqueue(entry, out logicalAddress)) ; + return logicalAddress; + } + + /// + /// Enqueue entry to log (in memory) - no guarantee of flush/commit + /// + /// Entry to be enqueued to log + /// Logical address of added entry + public long Enqueue(ReadOnlySpan entry) + { + long logicalAddress; + while (!TryEnqueue(entry, out logicalAddress)) ; + return logicalAddress; + } + + /// + /// Enqueue batch of entries to log (in memory) - no guarantee of flush/commit + /// + /// Batch of entries to be enqueued to log + /// Logical address of added entry + public long Enqueue(IReadOnlySpanBatch readOnlySpanBatch) + { + long logicalAddress; + while (!TryEnqueue(readOnlySpanBatch, out logicalAddress)) ; + return logicalAddress; + } + #endregion + + #region TryEnqueue + /// + /// Try to enqueue entry to log (in memory). If it returns true, we are + /// done. If it returns false, we need to retry. + /// + /// Entry to be enqueued to log + /// Logical address of added entry + /// Whether the append succeeded + public unsafe bool TryEnqueue(byte[] entry, out long logicalAddress) + { + logicalAddress = 0; + + epoch.Resume(); + + var length = entry.Length; + logicalAddress = allocator.TryAllocate(headerSize + Align(length)); + if (logicalAddress == 0) + { + epoch.Suspend(); + return false; + } + + var physicalAddress = allocator.GetPhysicalAddress(logicalAddress); + fixed (byte* bp = entry) + Buffer.MemoryCopy(bp, (void*)(headerSize + physicalAddress), length, length); + SetHeader(length, (byte*)physicalAddress); + epoch.Suspend(); + return true; + } + + /// + /// Try to append entry to log. If it returns true, we are + /// done. If it returns false, we need to retry. + /// + /// Entry to be appended to log + /// Logical address of added entry + /// Whether the append succeeded + public unsafe bool TryEnqueue(ReadOnlySpan entry, out long logicalAddress) + { + logicalAddress = 0; + + epoch.Resume(); + + var length = entry.Length; + logicalAddress = allocator.TryAllocate(headerSize + Align(length)); + if (logicalAddress == 0) + { + epoch.Suspend(); + return false; + } + + var physicalAddress = allocator.GetPhysicalAddress(logicalAddress); + fixed (byte* bp = &entry.GetPinnableReference()) + Buffer.MemoryCopy(bp, (void*)(headerSize + physicalAddress), length, length); + SetHeader(length, (byte*)physicalAddress); + epoch.Suspend(); + return true; + } + + /// + /// Try to enqueue batch of entries as a single atomic unit (to memory). Entire + /// batch needs to fit on one log page. 
+ /// + /// Batch to be appended to log + /// Logical address of first added entry + /// Whether the append succeeded + public bool TryEnqueue(IReadOnlySpanBatch readOnlySpanBatch, out long logicalAddress) + { + return TryAppend(readOnlySpanBatch, out logicalAddress, out _); + } + #endregion + + #region EnqueueAsync + /// + /// Enqueue entry to log in memory (async) - completes after entry is + /// appended to memory, NOT committed to storage. + /// + /// + /// + public async ValueTask EnqueueAsync(byte[] entry) + { + long logicalAddress; + + while (true) + { + var task = CommitTask; + if (TryEnqueue(entry, out logicalAddress)) + break; + if (NeedToWait(CommittedUntilAddress, TailAddress)) + { + // Wait for *some* commit - failure can be ignored + try + { + await task; + } + catch { } + } + } + + return logicalAddress; + } + + /// + /// Enqueue entry to log in memory (async) - completes after entry is + /// appended to memory, NOT committed to storage. + /// + /// + /// + public async ValueTask EnqueueAsync(ReadOnlyMemory entry) + { + long logicalAddress; + + while (true) + { + var task = CommitTask; + if (TryEnqueue(entry.Span, out logicalAddress)) + break; + if (NeedToWait(CommittedUntilAddress, TailAddress)) + { + // Wait for *some* commit - failure can be ignored + try + { + await task; + } + catch { } + } + } + + return logicalAddress; + } + + /// + /// Enqueue batch of entries to log in memory (async) - completes after entry is + /// appended to memory, NOT committed to storage. + /// + /// + /// + public async ValueTask EnqueueAsync(IReadOnlySpanBatch readOnlySpanBatch) + { + long logicalAddress; + + while (true) + { + var task = CommitTask; + if (TryEnqueue(readOnlySpanBatch, out logicalAddress)) + break; + if (NeedToWait(CommittedUntilAddress, TailAddress)) + { + // Wait for *some* commit - failure can be ignored + try + { + await task; + } + catch { } + } + } + + return logicalAddress; + } + #endregion + + #region WaitForCommit and WaitForCommitAsync + + /// + /// Spin-wait for enqueues, until tail or specified address, to commit to + /// storage. Does NOT itself issue a commit, just waits for commit. So you should + /// ensure that someone else causes the commit to happen. + /// + /// Address until which we should wait for commit, default 0 for tail of log + /// + public void WaitForCommit(long untilAddress = 0) + { + var tailAddress = untilAddress; + if (tailAddress == 0) tailAddress = allocator.GetTailAddress(); + + while (CommittedUntilAddress < tailAddress) ; + } + + /// + /// Wait for appends (in memory), until tail or specified address, to commit to + /// storage. Does NOT itself issue a commit, just waits for commit. So you should + /// ensure that someone else causes the commit to happen. + /// + /// Address until which we should wait for commit, default 0 for tail of log + /// + public async ValueTask WaitForCommitAsync(long untilAddress = 0) + { + var task = CommitTask; + var tailAddress = untilAddress; + if (tailAddress == 0) tailAddress = allocator.GetTailAddress(); + + while (true) + { + var linkedCommitInfo = await task; + if (linkedCommitInfo.CommitInfo.UntilAddress < tailAddress) + task = linkedCommitInfo.NextTask; + else + break; + } + } + #endregion + + #region Commit + + /// + /// Issue commit request for log (until tail) + /// + /// If true, spin-wait until commit completes. Otherwise, issue commit and return immediately. 
+ /// + public void Commit(bool spinWait = false) + { + CommitInternal(spinWait); + } + + /// + /// Async commit log (until tail), completes only when we + /// complete the commit. Throws exception if this or any + /// ongoing commit fails. + /// + /// + public async ValueTask CommitAsync() + { + var task = CommitTask; + var tailAddress = CommitInternal(); + + while (true) + { + var linkedCommitInfo = await task; + if (linkedCommitInfo.CommitInfo.UntilAddress < tailAddress) + task = linkedCommitInfo.NextTask; + else + break; + } + } + + /// + /// Async commit log (until tail), completes only when we + /// complete the commit. Throws exception if any commit + /// from prevCommitTask to current fails. + /// + /// + public async ValueTask> CommitAsync(Task prevCommitTask) + { + if (prevCommitTask == null) prevCommitTask = commitTcs.Task; + var tailAddress = CommitInternal(); + + while (true) + { + var linkedCommitInfo = await prevCommitTask; + if (linkedCommitInfo.CommitInfo.UntilAddress < tailAddress) + prevCommitTask = linkedCommitInfo.NextTask; + else + return linkedCommitInfo.NextTask; + } + } + + #endregion + + #region EnqueueAndWaitForCommit + + /// + /// Append entry to log - spin-waits until entry is committed to storage. + /// Does NOT itself issue flush! + /// + /// + /// + public long EnqueueAndWaitForCommit(byte[] entry) + { + long logicalAddress; + while (!TryEnqueue(entry, out logicalAddress)) ; + while (CommittedUntilAddress < logicalAddress + 1) ; + return logicalAddress; + } + + /// + /// Append entry to log - spin-waits until entry is committed to storage. + /// Does NOT itself issue flush! + /// + /// + /// + public long EnqueueAndWaitForCommit(ReadOnlySpan entry) + { + long logicalAddress; + while (!TryEnqueue(entry, out logicalAddress)) ; + while (CommittedUntilAddress < logicalAddress + 1) ; + return logicalAddress; + } + + /// + /// Append batch of entries to log - spin-waits until entry is committed to storage. + /// Does NOT itself issue flush! + /// + /// + /// + public long EnqueueAndWaitForCommit(IReadOnlySpanBatch readOnlySpanBatch) + { + long logicalAddress; + while (!TryEnqueue(readOnlySpanBatch, out logicalAddress)) ; + while (CommittedUntilAddress < logicalAddress + 1) ; + return logicalAddress; + } + + #endregion + + #region EnqueueAndWaitForCommitAsync + + /// + /// Append entry to log (async) - completes after entry is committed to storage. + /// Does NOT itself issue flush! + /// + /// + /// + public async ValueTask EnqueueAndWaitForCommitAsync(byte[] entry) + { + long logicalAddress; + Task task; + + // Phase 1: wait for commit to memory + while (true) + { + task = CommitTask; + if (TryEnqueue(entry, out logicalAddress)) + break; + if (NeedToWait(CommittedUntilAddress, TailAddress)) + { + // Wait for *some* commit - failure can be ignored + try + { + await task; + } + catch { } + } + } + + // Phase 2: wait for commit/flush to storage + while (true) + { + LinkedCommitInfo linkedCommitInfo; + try + { + linkedCommitInfo = await task; + } + catch (CommitFailureException e) + { + linkedCommitInfo = e.LinkedCommitInfo; + if (logicalAddress >= linkedCommitInfo.CommitInfo.FromAddress && logicalAddress < linkedCommitInfo.CommitInfo.UntilAddress) + throw e; + } + if (linkedCommitInfo.CommitInfo.UntilAddress < logicalAddress + 1) + task = linkedCommitInfo.NextTask; + else + break; + } + + return logicalAddress; + } + + /// + /// Append entry to log (async) - completes after entry is committed to storage. + /// Does NOT itself issue flush! 
+ /// + /// + /// + public async ValueTask EnqueueAndWaitForCommitAsync(ReadOnlyMemory entry) + { + long logicalAddress; + Task task; + + // Phase 1: wait for commit to memory + while (true) + { + task = CommitTask; + if (TryEnqueue(entry.Span, out logicalAddress)) + break; + if (NeedToWait(CommittedUntilAddress, TailAddress)) + { + // Wait for *some* commit - failure can be ignored + try + { + await task; + } + catch { } + } + } + + // Phase 2: wait for commit/flush to storage + while (true) + { + LinkedCommitInfo linkedCommitInfo; + try + { + linkedCommitInfo = await task; + } + catch (CommitFailureException e) + { + linkedCommitInfo = e.LinkedCommitInfo; + if (logicalAddress >= linkedCommitInfo.CommitInfo.FromAddress && logicalAddress < linkedCommitInfo.CommitInfo.UntilAddress) + throw e; + } + if (linkedCommitInfo.CommitInfo.UntilAddress < logicalAddress + 1) + task = linkedCommitInfo.NextTask; + else + break; + } + + return logicalAddress; + } + + /// + /// Append batch of entries to log (async) - completes after batch is committed to storage. + /// Does NOT itself issue flush! + /// + /// + /// + public async ValueTask EnqueueAndWaitForCommitAsync(IReadOnlySpanBatch readOnlySpanBatch) + { + long logicalAddress; + Task task; + + // Phase 1: wait for commit to memory + while (true) + { + task = CommitTask; + if (TryEnqueue(readOnlySpanBatch, out logicalAddress)) + break; + if (NeedToWait(CommittedUntilAddress, TailAddress)) + { + // Wait for *some* commit - failure can be ignored + try + { + await task; + } + catch { } + } + } + + // Phase 2: wait for commit/flush to storage + while (true) + { + LinkedCommitInfo linkedCommitInfo; + try + { + linkedCommitInfo = await task; + } + catch (CommitFailureException e) + { + linkedCommitInfo = e.LinkedCommitInfo; + if (logicalAddress >= linkedCommitInfo.CommitInfo.FromAddress && logicalAddress < linkedCommitInfo.CommitInfo.UntilAddress) + throw e; + } + if (linkedCommitInfo.CommitInfo.UntilAddress < logicalAddress + 1) + task = linkedCommitInfo.NextTask; + else + break; + } + + return logicalAddress; + } + #endregion + + /// + /// Truncate the log until, but not including, untilAddress + /// + /// + public void TruncateUntil(long untilAddress) + { + allocator.ShiftBeginAddress(untilAddress); + } + + /// + /// Pull-based iterator interface for scanning FASTER log + /// + /// Begin address for scan. + /// End address for scan (or long.MaxValue for tailing). + /// Name of iterator, if we need to persist/recover it (default null - do not persist). + /// Whether to recover named iterator from latest commit (if exists). If false, iterator starts from beginAddress. 
+ /// Use single or double buffering + /// + public FasterLogScanIterator Scan(long beginAddress, long endAddress, string name = null, bool recover = true, ScanBufferingMode scanBufferingMode = ScanBufferingMode.DoublePageBuffering) + { + FasterLogScanIterator iter; + if (recover && name != null && RecoveredIterators != null && RecoveredIterators.ContainsKey(name)) + iter = new FasterLogScanIterator(this, allocator, RecoveredIterators[name], endAddress, getMemory, scanBufferingMode, epoch, headerSize, name); + else + iter = new FasterLogScanIterator(this, allocator, beginAddress, endAddress, getMemory, scanBufferingMode, epoch, headerSize, name); + + if (name != null) + { + if (name.Length > 20) + throw new Exception("Max length of iterator name is 20 characters"); + if (FasterLogScanIterator.PersistedIterators.ContainsKey(name)) + Debug.WriteLine("Iterator name exists, overwriting"); + FasterLogScanIterator.PersistedIterators[name] = iter; + } + + return iter; + } + + /// + /// Random read record from log, at given address + /// + /// Logical address to read from + /// Estimated length of entry, if known + /// + public async ValueTask<(byte[], int)> ReadAsync(long address, int estimatedLength = 0) + { + epoch.Resume(); + if (address >= CommittedUntilAddress || address < BeginAddress) + { + epoch.Suspend(); + return default; + } + var ctx = new SimpleReadContext + { + logicalAddress = address, + completedRead = new SemaphoreSlim(0) + }; + unsafe + { + allocator.AsyncReadRecordToMemory(address, headerSize + estimatedLength, AsyncGetFromDiskCallback, ref ctx); + } + epoch.Suspend(); + await ctx.completedRead.WaitAsync(); + return GetRecordAndFree(ctx.record); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private int Align(int length) + { + return (length + 3) & ~3; + } + + /// + /// Commit log + /// + private void CommitCallback(CommitInfo commitInfo) + { + TaskCompletionSource _commitTcs = default; + + // We can only allow serial monotonic synchronous commit + lock (this) + { + if (CommittedBeginAddress > commitInfo.BeginAddress) + commitInfo.BeginAddress = CommittedBeginAddress; + if (CommittedUntilAddress > commitInfo.FromAddress) + commitInfo.FromAddress = CommittedUntilAddress; + if (CommittedUntilAddress > commitInfo.UntilAddress) + commitInfo.UntilAddress = CommittedUntilAddress; + + FasterLogRecoveryInfo info = new FasterLogRecoveryInfo + { + BeginAddress = commitInfo.BeginAddress, + FlushedUntilAddress = commitInfo.UntilAddress + }; + info.PopulateIterators(); + + logCommitManager.Commit(info.BeginAddress, info.FlushedUntilAddress, info.ToByteArray()); + CommittedBeginAddress = info.BeginAddress; + CommittedUntilAddress = info.FlushedUntilAddress; + + _commitTcs = commitTcs; + // If task is not faulted, create new task + // If task is faulted due to commit exception, create new task + if (commitTcs.Task.Status != TaskStatus.Faulted || commitTcs.Task.Exception.InnerException as CommitFailureException != null) + { + commitTcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + } + } + var lci = new LinkedCommitInfo + { + CommitInfo = commitInfo, + NextTask = commitTcs.Task + }; + + if (commitInfo.ErrorCode == 0) + _commitTcs?.TrySetResult(lci); + else + _commitTcs.TrySetException(new CommitFailureException(lci, $"Commit of address range [{commitInfo.FromAddress}-{commitInfo.UntilAddress}] failed with error code {commitInfo.ErrorCode}")); + } + + /// + /// Restore log + /// + private void Restore(out Dictionary recoveredIterators) + { + 
recoveredIterators = null; + FasterLogRecoveryInfo info = new FasterLogRecoveryInfo(); + var commitInfo = logCommitManager.GetCommitMetadata(); + + if (commitInfo == null) return; + + using (var r = new BinaryReader(new MemoryStream(commitInfo))) + { + info.Initialize(r); + } + + var headAddress = info.FlushedUntilAddress - allocator.GetOffsetInPage(info.FlushedUntilAddress); + if (headAddress == 0) headAddress = Constants.kFirstValidAddress; + + recoveredIterators = info.Iterators; + + allocator.RestoreHybridLog(info.FlushedUntilAddress, headAddress, info.BeginAddress); + CommittedUntilAddress = info.FlushedUntilAddress; + CommittedBeginAddress = info.BeginAddress; + } + + /// + /// Try to append batch of entries as a single atomic unit. Entire batch + /// needs to fit on one page. + /// + /// Batch to be appended to log + /// Logical address of first added entry + /// Actual allocated length + /// Whether the append succeeded + private unsafe bool TryAppend(IReadOnlySpanBatch readOnlySpanBatch, out long logicalAddress, out int allocatedLength) + { + logicalAddress = 0; + + int totalEntries = readOnlySpanBatch.TotalEntries(); + allocatedLength = 0; + for (int i = 0; i < totalEntries; i++) + { + allocatedLength += Align(readOnlySpanBatch.Get(i).Length) + headerSize; + } + + epoch.Resume(); + + logicalAddress = allocator.TryAllocate(allocatedLength); + if (logicalAddress == 0) + { + epoch.Suspend(); + return false; + } + + var physicalAddress = allocator.GetPhysicalAddress(logicalAddress); + for (int i = 0; i < totalEntries; i++) + { + var span = readOnlySpanBatch.Get(i); + var entryLength = span.Length; + fixed (byte* bp = &span.GetPinnableReference()) + Buffer.MemoryCopy(bp, (void*)(headerSize + physicalAddress), entryLength, entryLength); + SetHeader(entryLength, (byte*)physicalAddress); + physicalAddress += Align(entryLength) + headerSize; + } + + epoch.Suspend(); + return true; + } + + private unsafe void AsyncGetFromDiskCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + var ctx = (SimpleReadContext)Overlapped.Unpack(overlap).AsyncResult; + + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + ctx.record.Return(); + ctx.record = null; + ctx.completedRead.Release(); + } + else + { + var record = ctx.record.GetValidPointer(); + var length = GetLength(record); + + if (length < 0 || length > allocator.PageSize) + { + Debug.WriteLine("Invalid record length found: " + length); + ctx.record.Return(); + ctx.record = null; + ctx.completedRead.Release(); + } + else + { + int requiredBytes = headerSize + length; + if (ctx.record.available_bytes >= requiredBytes) + { + ctx.completedRead.Release(); + } + else + { + ctx.record.Return(); + allocator.AsyncReadRecordToMemory(ctx.logicalAddress, requiredBytes, AsyncGetFromDiskCallback, ref ctx); + } + } + } + Overlapped.Free(overlap); + } + + private (byte[], int) GetRecordAndFree(SectorAlignedMemory record) + { + if (record == null) + return (null, 0); + + byte[] result; + int length; + unsafe + { + var ptr = record.GetValidPointer(); + length = GetLength(ptr); + if (!VerifyChecksum(ptr, length)) + { + throw new Exception("Checksum failed for read"); + } + result = getMemory != null ? 
getMemory(length) : new byte[length]; + fixed (byte* bp = result) + { + Buffer.MemoryCopy(ptr + headerSize, bp, length, length); + } + } + record.Return(); + return (result, length); + } + + private long CommitInternal(bool spinWait = false) + { + epoch.Resume(); + if (allocator.ShiftReadOnlyToTail(out long tailAddress)) + { + if (spinWait) + { + while (CommittedUntilAddress < tailAddress) + { + epoch.ProtectAndDrain(); + Thread.Yield(); + } + } + epoch.Suspend(); + } + else + { + // May need to commit begin address and/or iterators + epoch.Suspend(); + var beginAddress = allocator.BeginAddress; + if (beginAddress > CommittedBeginAddress || FasterLogScanIterator.PersistedIterators.Count > 0) + CommitCallback(new CommitInfo { BeginAddress = beginAddress, + FromAddress = CommittedUntilAddress, + UntilAddress = CommittedUntilAddress, + ErrorCode = 0 }); + } + + return tailAddress; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal unsafe int GetLength(byte* ptr) + { + if (logChecksum == LogChecksumType.None) + return *(int*)ptr; + else if (logChecksum == LogChecksumType.PerEntry) + return *(int*)(ptr + 8); + return 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal unsafe bool VerifyChecksum(byte* ptr, int length) + { + if (logChecksum == LogChecksumType.PerEntry) + { + var cs = Utility.XorBytes(ptr + 8, length + 4); + if (cs != *(ulong*)ptr) + { + return false; + } + } + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal unsafe ulong GetChecksum(byte* ptr) + { + if (logChecksum == LogChecksumType.PerEntry) + { + return *(ulong*)ptr; + } + return 0; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private unsafe void SetHeader(int length, byte* dest) + { + if (logChecksum == LogChecksumType.None) + { + *(int*)dest = length; + return; + } + else if (logChecksum == LogChecksumType.PerEntry) + { + *(int*)(dest + 8) = length; + *(ulong*)dest = Utility.XorBytes(dest + 8, length + 4); + } + } + + /// + /// Do we need to await a commit to make forward progress? + /// + /// + /// + /// + private bool NeedToWait(long committedUntilAddress, long tailAddress) + { + Thread.Yield(); + return + allocator.GetPage(committedUntilAddress) <= + (allocator.GetPage(tailAddress) - allocator.BufferSize); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogIterator.cs b/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogIterator.cs new file mode 100644 index 0000000..385823a --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogIterator.cs @@ -0,0 +1,425 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
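(Editorial note: FasterLog.cs above completes the append/commit/scan surface. The following hedged sketch shows the intended call pattern; the device path is illustrative, and Devices.CreateLogDevice is assumed to be the factory from the Device folder added earlier in this patch.)

    var device = Devices.CreateLogDevice(@"D:\logs\hlog.log");          // illustrative path
    var log = new FasterLog(new FasterLogSettings { LogDevice = device });

    for (int i = 0; i < 5; i++)
        log.Enqueue(System.Text.Encoding.UTF8.GetBytes("entry-" + i));  // in-memory append only
    log.Commit(spinWait: true);                                         // flush + write commit metadata, wait until durable

    using (var iter = log.Scan(log.BeginAddress, log.TailAddress))
    {
        while (iter.GetNext(out byte[] entry, out int length))
            Console.WriteLine(System.Text.Encoding.UTF8.GetString(entry, 0, length));
    }
    log.Dispose();
    device.Close();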
+ +using System; +using System.Threading; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Threading.Tasks; +using System.Buffers; +using System.Collections.Generic; +using System.Collections.Concurrent; + +namespace FASTER.core +{ + /// + /// Scan iterator for hybrid log + /// + public class FasterLogScanIterator : IDisposable + { + private readonly int frameSize; + private readonly string name; + private readonly FasterLog fasterLog; + private readonly BlittableAllocator allocator; + private readonly long endAddress; + private readonly BlittableFrame frame; + private readonly CountdownEvent[] loaded; + private readonly CancellationTokenSource[] loadedCancel; + private readonly long[] loadedPage; + private readonly LightEpoch epoch; + private readonly GetMemory getMemory; + private readonly int headerSize; + private long currentAddress, nextAddress; + + /// + /// Current address + /// + public long CurrentAddress => currentAddress; + + /// + /// Next address + /// + public long NextAddress => nextAddress; + + internal static readonly ConcurrentDictionary PersistedIterators + = new ConcurrentDictionary(); + + /// + /// Constructor + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + internal unsafe FasterLogScanIterator(FasterLog fasterLog, BlittableAllocator hlog, long beginAddress, long endAddress, GetMemory getMemory, ScanBufferingMode scanBufferingMode, LightEpoch epoch, int headerSize, string name) + { + this.fasterLog = fasterLog; + this.allocator = hlog; + this.getMemory = getMemory; + this.epoch = epoch; + this.headerSize = headerSize; + + if (beginAddress == 0) + beginAddress = hlog.GetFirstValidLogicalAddress(0); + + this.name = name; + this.endAddress = endAddress; + currentAddress = beginAddress; + nextAddress = beginAddress; + + if (scanBufferingMode == ScanBufferingMode.SinglePageBuffering) + frameSize = 1; + else if (scanBufferingMode == ScanBufferingMode.DoublePageBuffering) + frameSize = 2; + else if (scanBufferingMode == ScanBufferingMode.NoBuffering) + { + frameSize = 0; + return; + } + + frame = new BlittableFrame(frameSize, hlog.PageSize, hlog.GetDeviceSectorSize()); + loaded = new CountdownEvent[frameSize]; + loadedCancel = new CancellationTokenSource[frameSize]; + loadedPage = new long[frameSize]; + for (int i = 0; i < frameSize; i++) + { + loadedPage[i] = -1; + loadedCancel[i] = new CancellationTokenSource(); + } + } + +#if DOTNETCORE + /// + /// Async enumerable for iterator + /// + /// Entry and entry length + public async IAsyncEnumerable<(byte[], int)> GetAsyncEnumerable() + { + while (true) + { + byte[] result; + int length; + while (!GetNext(out result, out length)) + { + if (currentAddress >= endAddress) + yield break; + await WaitAsync(); + } + yield return (result, length); + } + } + + /// + /// Async enumerable for iterator (memory pool based version) + /// + /// Entry and entry length + public async IAsyncEnumerable<(IMemoryOwner, int)> GetAsyncEnumerable(MemoryPool pool) + { + while (true) + { + IMemoryOwner result; + int length; + while (!GetNext(pool, out result, out length)) + { + if (currentAddress >= endAddress) + yield break; + await WaitAsync(); + } + yield return (result, length); + } + } +#endif + + /// + /// Wait for iteration to be ready to continue + /// + /// + public async ValueTask WaitAsync() + { + while (true) + { + var commitTask = fasterLog.CommitTask; + if (nextAddress >= fasterLog.CommittedUntilAddress) + { + // Ignore commit exceptions + try + { + await commitTask; + } + catch { } + } + else 
+ break; + } + } + + /// + /// Get next record in iterator + /// + /// Copy of entry, if found + /// Actual length of entry + /// + public unsafe bool GetNext(out byte[] entry, out int entryLength) + { + if (GetNextInternal(out long physicalAddress, out entryLength, out bool epochTaken)) + { + if (getMemory != null) + { + // Use user delegate to allocate memory + entry = getMemory(entryLength); + if (entry.Length < entryLength) + throw new Exception("Byte array provided has invalid length"); + } + else + { + // We allocate a byte array from heap + entry = new byte[entryLength]; + } + + fixed (byte* bp = entry) + Buffer.MemoryCopy((void*)(headerSize + physicalAddress), bp, entryLength, entryLength); + + if (epochTaken) + epoch.Suspend(); + + return true; + } + entry = default; + return false; + } + + /// + /// GetNext supporting memory pools + /// + /// + /// + /// + /// + public unsafe bool GetNext(MemoryPool pool, out IMemoryOwner entry, out int entryLength) + { + if (GetNextInternal(out long physicalAddress, out entryLength, out bool epochTaken)) + { + entry = pool.Rent(entryLength); + + fixed (byte* bp = &entry.Memory.Span.GetPinnableReference()) + Buffer.MemoryCopy((void*)(headerSize + physicalAddress), bp, entryLength, entryLength); + + if (epochTaken) + epoch.Suspend(); + + return true; + } + entry = default; + entryLength = default; + return false; + } + + /// + /// Dispose the iterator + /// + public void Dispose() + { + frame?.Dispose(); + if (name != null) + PersistedIterators.TryRemove(name, out _); + } + + private unsafe void BufferAndLoad(long currentAddress, long currentPage, long currentFrame) + { + if (loadedPage[currentFrame] != currentPage) + { + if (loadedPage[currentFrame] != -1) + { + WaitForFrameLoad(currentFrame); + } + + allocator.AsyncReadPagesFromDeviceToFrame(currentAddress >> allocator.LogPageSizeBits, 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[currentFrame], 0, null, null, loadedCancel[currentFrame]); + loadedPage[currentFrame] = currentAddress >> allocator.LogPageSizeBits; + } + + if (frameSize == 2) + { + var nextPage = currentPage + 1; + var nextFrame = (currentFrame + 1) % frameSize; + + if (loadedPage[nextFrame] != nextPage) + { + if (loadedPage[nextFrame] != -1) + { + WaitForFrameLoad(nextFrame); + } + + allocator.AsyncReadPagesFromDeviceToFrame(1 + (currentAddress >> allocator.LogPageSizeBits), 1, endAddress, AsyncReadPagesCallback, Empty.Default, frame, out loaded[nextFrame], 0, null, null, loadedCancel[nextFrame]); + loadedPage[nextFrame] = 1 + (currentAddress >> allocator.LogPageSizeBits); + } + } + + WaitForFrameLoad(currentFrame); + } + + private void WaitForFrameLoad(long frame) + { + if (loaded[frame].IsSet) return; + + try + { + loaded[frame].Wait(loadedCancel[frame].Token); // Ensure we have completed ongoing load + } + catch (Exception e) + { + loadedPage[frame] = -1; + loadedCancel[frame] = new CancellationTokenSource(); + nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits; + throw new Exception("Page read from storage failed, skipping page. 
Inner exception: " + e.ToString()); + } + } + + private unsafe void AsyncReadPagesCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + var result = (PageAsyncReadResult)Overlapped.Unpack(overlap).AsyncResult; + + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + result.cts?.Cancel(); + } + + if (result.freeBuffer1 != null) + { + if (errorCode == 0) + allocator.PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, result.page); + result.freeBuffer1.Return(); + result.freeBuffer1 = null; + } + + if (errorCode == 0) + result.handle?.Signal(); + + Interlocked.MemoryBarrier(); + Overlapped.Free(overlap); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private int Align(int length) + { + return (length + 3) & ~3; + } + + /// + /// Retrieve physical address of next iterator value + /// (under epoch protection if it is from main page buffer) + /// + /// + /// + /// + /// + private unsafe bool GetNextInternal(out long physicalAddress, out int entryLength, out bool epochTaken) + { + physicalAddress = 0; + entryLength = 0; + epochTaken = false; + + currentAddress = nextAddress; + while (true) + { + // Check for boundary conditions + if (currentAddress < allocator.BeginAddress) + { + Debug.WriteLine("Iterator address is less than log BeginAddress " + allocator.BeginAddress + ", adjusting iterator address"); + currentAddress = allocator.BeginAddress; + } + + if ((currentAddress >= endAddress) || (currentAddress >= fasterLog.CommittedUntilAddress)) + { + nextAddress = currentAddress; + return false; + } + + if (frameSize == 0 && currentAddress < allocator.HeadAddress) + { + throw new Exception("Iterator address is less than log HeadAddress in memory-scan mode"); + } + + var currentPage = currentAddress >> allocator.LogPageSizeBits; + var offset = currentAddress & allocator.PageSizeMask; + + var headAddress = allocator.HeadAddress; + + if (currentAddress < headAddress) + { + BufferAndLoad(currentAddress, currentPage, currentPage % frameSize); + physicalAddress = frame.GetPhysicalAddress(currentPage % frameSize, offset); + } + else + { + epoch.Resume(); + headAddress = allocator.HeadAddress; + if (currentAddress < headAddress) // rare case + { + epoch.Suspend(); + continue; + } + + physicalAddress = allocator.GetPhysicalAddress(currentAddress); + } + + // Get and check entry length + entryLength = fasterLog.GetLength((byte*)physicalAddress); + if (entryLength == 0) + { + if (currentAddress >= headAddress) + epoch.Suspend(); + + nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits; + if (0 != fasterLog.GetChecksum((byte*)physicalAddress)) + { + var curPage = currentAddress >> allocator.LogPageSizeBits; + throw new Exception("Invalid checksum found during scan, skipping page " + curPage); + } + else + { + // We are likely at end of page, skip to next + currentAddress = nextAddress; + continue; + } + } + + int recordSize = headerSize + Align(entryLength); + if ((currentAddress & allocator.PageSizeMask) + recordSize > allocator.PageSize) + { + if (currentAddress >= headAddress) + epoch.Suspend(); + nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits; + throw new Exception("Invalid length of record found: " + entryLength + ", skipping page"); + } + + // Verify checksum if needed + if (currentAddress < headAddress) + { + if (!fasterLog.VerifyChecksum((byte*)physicalAddress, entryLength)) + { + var 
curPage = currentAddress >> allocator.LogPageSizeBits; + nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits; + throw new Exception("Invalid checksum found during scan, skipping page " + curPage); + } + } + + if ((currentAddress & allocator.PageSizeMask) + recordSize == allocator.PageSize) + nextAddress = (1 + (currentAddress >> allocator.LogPageSizeBits)) << allocator.LogPageSizeBits; + else + nextAddress = currentAddress + recordSize; + + epochTaken = currentAddress >= headAddress; + return true; + } + } + + } +} + + diff --git a/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogRecoveryInfo.cs b/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogRecoveryInfo.cs new file mode 100644 index 0000000..4dd46d4 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogRecoveryInfo.cs @@ -0,0 +1,160 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#pragma warning disable 0162 + +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; + +namespace FASTER.core +{ + /// + /// Recovery info for FASTER Log + /// + internal struct FasterLogRecoveryInfo + { + /// + /// Begin address + /// + public long BeginAddress; + + /// + /// Flushed logical address + /// + public long FlushedUntilAddress; + + /// + /// Persisted iterators + /// + public Dictionary Iterators; + + /// + /// Initialize + /// + public void Initialize() + { + BeginAddress = 0; + FlushedUntilAddress = 0; + } + + /// + /// Initialize from stream + /// + /// + public void Initialize(BinaryReader reader) + { + int version; + long checkSum; + try + { + version = reader.ReadInt32(); + checkSum = reader.ReadInt64(); + BeginAddress = reader.ReadInt64(); + FlushedUntilAddress = reader.ReadInt64(); + } + catch (Exception e) + { + throw new Exception("Unable to recover from previous commit. 
Inner exception: " + e.ToString()); + } + if (version != 0) + throw new Exception("Invalid version found during commit recovery"); + + if (checkSum != (BeginAddress ^ FlushedUntilAddress)) + throw new Exception("Invalid checksum found during commit recovery"); + + var count = 0; + try + { + count = reader.ReadInt32(); + } + catch { } + + if (count > 0) + { + Iterators = new Dictionary(); + for (int i = 0; i < count; i++) + { + Iterators.Add(reader.ReadString(), reader.ReadInt64()); + } + } + } + + /// + /// Recover info from token + /// + /// + /// + internal void Recover(ILogCommitManager logCommitManager) + { + var metadata = logCommitManager.GetCommitMetadata(); + if (metadata == null) + throw new Exception("Invalid log commit metadata during recovery"); + + Initialize(new BinaryReader(new MemoryStream(metadata))); + } + + /// + /// Reset + /// + public void Reset() + { + Initialize(); + } + + /// + /// Write info to byte array + /// + public byte[] ToByteArray() + { + using (var ms = new MemoryStream()) + { + using (var writer = new BinaryWriter(ms)) + { + writer.Write(0); // version + writer.Write(BeginAddress ^ FlushedUntilAddress); // checksum + writer.Write(BeginAddress); + writer.Write(FlushedUntilAddress); + if (Iterators?.Count > 0) + { + writer.Write(Iterators.Count); + foreach (var kvp in Iterators) + { + writer.Write(kvp.Key); + writer.Write(kvp.Value); + } + } + } + return ms.ToArray(); + } + } + + /// + /// Take snapshot of persisted iterators + /// + public void PopulateIterators() + { + if (FasterLogScanIterator.PersistedIterators.Count > 0) + { + Iterators = new Dictionary(); + + foreach (var kvp in FasterLogScanIterator.PersistedIterators) + { + Iterators.Add(kvp.Key, kvp.Value.CurrentAddress); + } + } + } + + /// + /// Print checkpoint info for debugging purposes + /// + public void DebugPrint() + { + Debug.WriteLine("******** Log Commit Info ********"); + + Debug.WriteLine("BeginAddress: {0}", BeginAddress); + Debug.WriteLine("FlushedUntilAddress: {0}", FlushedUntilAddress); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogSettings.cs b/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogSettings.cs new file mode 100644 index 0000000..8f02aad --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FasterLog/FasterLogSettings.cs @@ -0,0 +1,99 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
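(Editorial note: the FasterLogRecoveryInfo format just above is four fixed fields followed by an optional iterator table. For debugging, a commit blob can be decoded by hand exactly as Initialize does; 'metadata' below stands for the byte[] returned by ILogCommitManager.GetCommitMetadata and is an assumed input.)

    using (var r = new System.IO.BinaryReader(new System.IO.MemoryStream(metadata)))
    {
        int version = r.ReadInt32();          // 0 in this version of the format
        long checksum = r.ReadInt64();        // BeginAddress XOR FlushedUntilAddress
        long begin = r.ReadInt64();
        long flushedUntil = r.ReadInt64();
        bool valid = (version == 0) && (checksum == (begin ^ flushedUntil));
        // An optional Int32 count followed by (string, long) pairs encodes persisted iterators.
    }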
+ +#pragma warning disable 0162 + +using System; +using System.Diagnostics; +using System.IO; + +namespace FASTER.core +{ + /// + /// Delegate for getting memory from user + /// + /// Minimum length of returned byte array + /// + public delegate byte[] GetMemory(int minLength); + + /// + /// Type of checksum to add to log + /// + public enum LogChecksumType + { + /// + /// No checksums + /// + None, + /// + /// Checksum per entry + /// + PerEntry + } + + /// + /// FASTER Log Settings + /// + public class FasterLogSettings + { + /// + /// Device used for log + /// + public IDevice LogDevice = new NullDevice(); + + /// + /// Size of a page, in bits + /// + public int PageSizeBits = 22; + + /// + /// Total size of in-memory part of log, in bits + /// Should be at least one page long + /// Num pages = 2^(MemorySizeBits-PageSizeBits) + /// + public int MemorySizeBits = 23; + + /// + /// Size of a segment (group of pages), in bits + /// This is the granularity of files on disk + /// + public int SegmentSizeBits = 30; + + /// + /// Log commit manager + /// + public ILogCommitManager LogCommitManager = null; + + /// + /// Use specified directory for storing and retrieving checkpoints + /// This is a shortcut to providing the following: + /// FasterLogSettings.LogCommitManager = new LocalLogCommitManager(LogCommitFile) + /// + public string LogCommitFile = null; + + /// + /// User callback to allocate memory for read entries + /// + public GetMemory GetMemory = null; + + /// + /// Type of checksum to add to log + /// + public LogChecksumType LogChecksum = LogChecksumType.None; + + internal LogSettings GetLogSettings() + { + return new LogSettings + { + LogDevice = LogDevice, + PageSizeBits = PageSizeBits, + SegmentSizeBits = SegmentSizeBits, + MemorySizeBits = MemorySizeBits, + CopyReadsToTail = false, + MutableFraction = 0, + ObjectLogDevice = null, + ReadCacheSettings = null + }; + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FasterLog/ILogCommitManager.cs b/ZeroLevel/Services/FASTER/Index/FasterLog/ILogCommitManager.cs new file mode 100644 index 0000000..f3282ed --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FasterLog/ILogCommitManager.cs @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System.IO; + +namespace FASTER.core +{ + /// + /// Log commit manager + /// + public interface ILogCommitManager + { + /// + /// Perform (synchronous) commit with specified metadata + /// + /// Committed begin address (for information only, not necessary to persist) + /// Address committed until (for information only, not necessary to persist) + /// Commit metadata - should be persisted + void Commit(long beginAddress, long untilAddress, byte[] commitMetadata); + + /// + /// Return prior commit metadata during recovery + /// + /// + byte[] GetCommitMetadata(); + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/FasterLog/IReadOnlySpanBatch.cs b/ZeroLevel/Services/FASTER/Index/FasterLog/IReadOnlySpanBatch.cs new file mode 100644 index 0000000..15f61b1 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FasterLog/IReadOnlySpanBatch.cs @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
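(Editorial note: the defaults in FasterLogSettings above give 2^22 = 4 MB pages, 2^23 = 8 MB of in-memory log, i.e. two resident pages, and 2^30 = 1 GB on-disk segments. A hedged example of a smaller configuration; the device path is illustrative.)

    var settings = new FasterLogSettings
    {
        LogDevice = Devices.CreateLogDevice(@"D:\logs\small.log"),  // illustrative path
        PageSizeBits = 20,       // 1 MB pages
        MemorySizeBits = 22,     // 4 MB in memory => 2^(22-20) = 4 resident pages
        SegmentSizeBits = 28,    // 256 MB per on-disk segment file
        LogChecksum = LogChecksumType.PerEntry
    };
    var log = new FasterLog(settings);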
+ +using System; + +namespace FASTER.core +{ + /// + /// Interface to provide a batch of ReadOnlySpan[byte] data to FASTER + /// + public interface IReadOnlySpanBatch + { + /// + /// Number of entries in provided batch + /// + /// Number of entries + int TotalEntries(); + + /// + /// Retrieve batch entry at specified index + /// + /// Index + /// + ReadOnlySpan Get(int index); + } +} diff --git a/ZeroLevel/Services/FASTER/Index/FasterLog/LocalLogCommitManager.cs b/ZeroLevel/Services/FASTER/Index/FasterLog/LocalLogCommitManager.cs new file mode 100644 index 0000000..f3cdc90 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/FasterLog/LocalLogCommitManager.cs @@ -0,0 +1,64 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System.IO; + +namespace FASTER.core +{ + /// + /// Implementation of checkpoint interface for local file storage + /// + public class LocalLogCommitManager : ILogCommitManager + { + private string CommitFile; + + /// + /// Create new instance of local checkpoint manager at given base directory + /// + /// + public LocalLogCommitManager(string CommitFile) + { + this.CommitFile = CommitFile; + } + + /// + /// Perform (synchronous) commit with specified metadata + /// + /// Committed begin address (for information only, not necessary to persist) + /// Address committed until (for information only, not necessary to persist) + /// Commit metadata + public void Commit(long beginAddress, long untilAddress, byte[] commitMetadata) + { + // Two phase to ensure we write metadata in single Write operation + using (var ms = new MemoryStream()) + { + using (var writer = new BinaryWriter(ms)) + { + writer.Write(commitMetadata.Length); + writer.Write(commitMetadata); + } + using (var writer = new BinaryWriter(new FileStream(CommitFile, FileMode.OpenOrCreate))) + { + writer.Write(ms.ToArray()); + writer.Flush(); + } + } + } + + /// + /// Retrieve commit metadata + /// + /// Metadata, or null if invalid + public byte[] GetCommitMetadata() + { + if (!File.Exists(CommitFile)) + return null; + + using (var reader = new BinaryReader(new FileStream(CommitFile, FileMode.Open))) + { + var len = reader.ReadInt32(); + return reader.ReadBytes(len); + } + } + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Interfaces/FunctionsBase.cs b/ZeroLevel/Services/FASTER/Index/Interfaces/FunctionsBase.cs new file mode 100644 index 0000000..195450b --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Interfaces/FunctionsBase.cs @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
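(Editorial note: IReadOnlySpanBatch above is the hook for atomic multi-entry appends. A minimal hedged implementation over an array of byte arrays; recall from TryAppend in FasterLog.cs that the entire batch must fit on one log page.)

    class ByteArrayBatch : IReadOnlySpanBatch
    {
        private readonly byte[][] entries;
        public ByteArrayBatch(params byte[][] entries) { this.entries = entries; }
        public int TotalEntries() => entries.Length;
        public ReadOnlySpan<byte> Get(int index) => entries[index];
    }

    // Usage (illustrative): log.Enqueue(new ByteArrayBatch(e1, e2, e3));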
+ +#pragma warning disable 1591 + +using System; + +namespace FASTER.core +{ + /// + /// Default empty functions base class to make it easy for users to provide + /// their own implementation + /// + /// + /// + /// + /// + /// + public abstract class FunctionsBase : IFunctions + { + public virtual void ConcurrentReader(ref Key key, ref Input input, ref Value value, ref Output dst) { } + public virtual void SingleReader(ref Key key, ref Input input, ref Value value, ref Output dst) { } + + public virtual bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst) { dst = src; return true; } + public virtual void SingleWriter(ref Key key, ref Value src, ref Value dst) => dst = src; + + public virtual void InitialUpdater(ref Key key, ref Input input, ref Value value) { } + public virtual void CopyUpdater(ref Key key, ref Input input, ref Value oldValue, ref Value newValue) { } + public virtual bool InPlaceUpdater(ref Key key, ref Input input, ref Value value) { return true; } + + public virtual void ReadCompletionCallback(ref Key key, ref Input input, ref Output output, Context ctx, Status status) { } + public virtual void RMWCompletionCallback(ref Key key, ref Input input, Context ctx, Status status) { } + public virtual void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx) { } + public virtual void DeleteCompletionCallback(ref Key key, Context ctx) { } + public virtual void CheckpointCompletionCallback(Guid sessionId, long serialNum) { } + } + + /// + /// Default empty functions base class to make it easy for users to provide + /// their own implementation + /// + /// + /// + /// + public class SimpleFunctions : FunctionsBase + { + private readonly Func merger; + public SimpleFunctions() => merger = (l, r) => l; + public SimpleFunctions(Func merger) => this.merger = merger; + + public override void ConcurrentReader(ref Key key, ref Value input, ref Value value, ref Value dst) => dst = value; + public override void SingleReader(ref Key key, ref Value input, ref Value value, ref Value dst) => dst = value; + + public override bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst) { dst = src; return true; } + public override void SingleWriter(ref Key key, ref Value src, ref Value dst) => dst = src; + + public override void InitialUpdater(ref Key key, ref Value input, ref Value value) => value = input; + public override void CopyUpdater(ref Key key, ref Value input, ref Value oldValue, ref Value newValue) => newValue = merger(input, oldValue); + public override bool InPlaceUpdater(ref Key key, ref Value input, ref Value value) { value = merger(input, value); return true; } + + public override void ReadCompletionCallback(ref Key key, ref Value input, ref Value output, Context ctx, Status status) { } + public override void RMWCompletionCallback(ref Key key, ref Value input, Context ctx, Status status) { } + public override void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx) { } + public override void DeleteCompletionCallback(ref Key key, Context ctx) { } + public override void CheckpointCompletionCallback(Guid sessionId, long serialNum) { } + } + + public class SimpleFunctions : SimpleFunctions + { + public SimpleFunctions() : base() { } + public SimpleFunctions(Func merger) : base(merger) { } + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Interfaces/IFasterEqualityComparer.cs b/ZeroLevel/Services/FASTER/Index/Interfaces/IFasterEqualityComparer.cs new file mode 100644 index 0000000..20be0c8 --- /dev/null +++ 
b/ZeroLevel/Services/FASTER/Index/Interfaces/IFasterEqualityComparer.cs @@ -0,0 +1,26 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +namespace FASTER.core +{ + /// + /// Key interface + /// + /// + public interface IFasterEqualityComparer + { + /// + /// Get 64-bit hash code + /// + /// + long GetHashCode64(ref T k); + + /// + /// Equality comparison + /// + /// Left side + /// Right side + /// + bool Equals(ref T k1, ref T k2); + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Interfaces/IFasterKV.cs b/ZeroLevel/Services/FASTER/Index/Interfaces/IFasterKV.cs new file mode 100644 index 0000000..a1e75da --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Interfaces/IFasterKV.cs @@ -0,0 +1,202 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Interface to FASTER key-value store + /// (customized for sample types Key, Value, Input, Output, Context) + /// Since there are pointers in the API, we cannot automatically create a + /// generic version covering arbitrary blittable types. Instead, the + /// user defines the customized interface and provides it to FASTER + /// so it can return a (generated) instance for that interface. + /// + public interface IFasterKV : IDisposable + where Key : new() + where Value : new() + { + /* Thread-related operations */ + + /// + /// Start a session with FASTER. FASTER sessions correspond to threads issuing + /// operations to FASTER. + /// + /// Session identifier + Guid StartSession(); + + /// + /// Continue a session after recovery. Provide FASTER with the identifier of the + /// session that is being continued. + /// + /// + /// Sequence number for resuming operations + long ContinueSession(Guid guid); + + /// + /// Stop a session and de-register the thread from FASTER. + /// + void StopSession(); + + /// + /// Refresh the session epoch. The caller is required to invoke Refresh periodically + /// in order to guarantee system liveness. 
+ /// + void Refresh(); + + /* Store Interface */ + + /// + /// Read operation + /// + /// Key of read + /// Input argument used by Reader to select what part of value to read + /// Reader stores the read result in output + /// User context to identify operation in asynchronous callback + /// Increasing sequence number of operation (used for recovery) + /// Status of operation + Status Read(ref Key key, ref Input input, ref Output output, Context context, long lsn); + + /// + /// (Blind) upsert operation + /// + /// Key of read + /// Value being upserted + /// User context to identify operation in asynchronous callback + /// Increasing sequence number of operation (used for recovery) + /// Status of operation + Status Upsert(ref Key key, ref Value value, Context context, long lsn); + + /// + /// Atomic read-modify-write operation + /// + /// Key of read + /// Input argument used by RMW callback to perform operation + /// User context to identify operation in asynchronous callback + /// Increasing sequence number of operation (used for recovery) + /// Status of operation + Status RMW(ref Key key, ref Input input, Context context, long lsn); + + /// + /// Delete entry (use tombstone if necessary) + /// Hash entry is removed as a best effort (if key is in memory and at + /// the head of hash chain. + /// Value is set to null (using ConcurrentWrite) if it is in mutable region + /// + /// + /// + /// + /// + Status Delete(ref Key key, Context userContext, long monotonicSerialNum); + + /// + /// Complete all pending operations issued by this session + /// + /// Whether we spin-wait for pending operations to complete + /// Whether all pending operations have completed + bool CompletePending(bool wait); + + + /* Recovery */ + + /// + /// Take full checkpoint of FASTER + /// + /// Token describing checkpoint + /// Whether checkpoint was initiated + bool TakeFullCheckpoint(out Guid token); + + /// + /// Take checkpoint of FASTER index only (not log) + /// + /// Token describing checkpoin + /// Whether checkpoint was initiated + bool TakeIndexCheckpoint(out Guid token); + + /// + /// Take checkpoint of FASTER log only (not index) + /// + /// Token describing checkpoin + /// Whether checkpoint was initiated + bool TakeHybridLogCheckpoint(out Guid token); + + /// + /// Recover from last successfuly checkpoints + /// + void Recover(); + + /// + /// Recover using full checkpoint token + /// + /// + void Recover(Guid fullcheckpointToken); + + /// + /// Recover using a separate index and log checkpoint token + /// + /// + /// + void Recover(Guid indexToken, Guid hybridLogToken); + + /// + /// Complete ongoing checkpoint (spin-wait) + /// + /// + /// Whether checkpoint has completed + bool CompleteCheckpoint(bool wait); + + /// + /// Grow the hash index + /// + /// + bool GrowIndex(); + + /// + /// Get number of (non-zero) hash entries in FASTER + /// + long EntryCount { get; } + + /// + /// Get size of index in #cache lines (64 bytes each) + /// + long IndexSize { get; } + + /// + /// Get comparer used by this instance of FASTER + /// + IFasterEqualityComparer Comparer { get; } + + /// + /// Dump distribution of #entries in hash table + /// + string DumpDistribution(); + + /// + /// Experimental feature + /// Check if FASTER contains key in memory (between HeadAddress + /// and tail), or between the specified fromAddress (after + /// HeadAddress) and tail + /// + /// + /// + /// + Status ContainsKeyInMemory(ref Key key, long fromAddress = -1); + + /// + /// Get accessor for FASTER hybrid log + /// + 
LogAccessor Log { get; } + + /// + /// Get accessor for FASTER read cache + /// + LogAccessor ReadCache { get; } + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Interfaces/IFunctions.cs b/ZeroLevel/Services/FASTER/Index/Interfaces/IFunctions.cs new file mode 100644 index 0000000..a6d728a --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Interfaces/IFunctions.cs @@ -0,0 +1,115 @@ +using System; + +namespace FASTER.core +{ + /// + /// Callback functions to FASTER + /// + /// + /// + /// + /// + /// + public interface IFunctions + { + /// + /// Read completion + /// + /// + /// + /// + /// + /// + void ReadCompletionCallback(ref Key key, ref Input input, ref Output output, Context ctx, Status status); + + /// + /// Upsert completion + /// + /// + /// + /// + void UpsertCompletionCallback(ref Key key, ref Value value, Context ctx); + + /// + /// RMW completion + /// + /// + /// + /// + /// + void RMWCompletionCallback(ref Key key, ref Input input, Context ctx, Status status); + + /// + /// Delete completion + /// + /// + /// + void DeleteCompletionCallback(ref Key key, Context ctx); + + /// + /// Checkpoint completion callback (called per client session) + /// + /// Session ID reporting persistence + /// Checkpoint offset (CPR point) for session + void CheckpointCompletionCallback(Guid sessionId, long serialNum); + + /// + /// Initial update for RMW + /// + /// + /// + /// + void InitialUpdater(ref Key key, ref Input input, ref Value value); + + /// + /// Copy-update for RMW + /// + /// + /// + /// + /// + void CopyUpdater(ref Key key, ref Input input, ref Value oldValue, ref Value newValue); + + /// + /// In-place update for RMW + /// + /// + /// + /// + bool InPlaceUpdater(ref Key key, ref Input input, ref Value value); + + /// + /// Single reader + /// + /// + /// + /// + /// + void SingleReader(ref Key key, ref Input input, ref Value value, ref Output dst); + + /// + /// Conncurrent reader + /// + /// + /// + /// + /// + void ConcurrentReader(ref Key key, ref Input input, ref Value value, ref Output dst); + + /// + /// Single writer + /// + /// + /// + /// + void SingleWriter(ref Key key, ref Value src, ref Value dst); + + /// + /// Concurrent writer + /// + /// + /// + /// + bool ConcurrentWriter(ref Key key, ref Value src, ref Value dst); + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Interfaces/IObjectSerializer.cs b/ZeroLevel/Services/FASTER/Index/Interfaces/IObjectSerializer.cs new file mode 100644 index 0000000..6669c80 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Interfaces/IObjectSerializer.cs @@ -0,0 +1,111 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
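(Editorial note: IFunctions above is the full callback contract for the key-value store; FunctionsBase earlier in this patch provides empty defaults. A hedged sketch of RMW-style counter callbacks for a long-to-long store; Empty is the no-op context struct used elsewhere in this patch.)

    class CounterFunctions : FunctionsBase<long, long, long, long, Empty>
    {
        public override void SingleReader(ref long key, ref long input, ref long value, ref long dst) => dst = value;
        public override void ConcurrentReader(ref long key, ref long input, ref long value, ref long dst) => dst = value;
        public override void InitialUpdater(ref long key, ref long input, ref long value) => value = input;
        public override void CopyUpdater(ref long key, ref long input, ref long oldValue, ref long newValue) => newValue = oldValue + input;
        public override bool InPlaceUpdater(ref long key, ref long input, ref long value) { value += input; return true; }
    }

    // Session-based usage per IFasterKV above (illustrative):
    //   store.StartSession(); store.RMW(ref key, ref delta, Empty.Default, 0); store.StopSession();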
+ +using System; +using System.IO; + +namespace FASTER.core +{ + /// + /// Object serializer interface + /// + /// + public interface IObjectSerializer + { + /// + /// Begin serialization to given stream + /// + /// + void BeginSerialize(Stream stream); + + /// + /// Serialize object + /// + /// + void Serialize(ref T obj); + + /// + /// End serialization to given stream + /// + void EndSerialize(); + + /// + /// Begin deserialization from given stream + /// + /// + void BeginDeserialize(Stream stream); + + /// + /// Deserialize object + /// + /// + void Deserialize(ref T obj); + + /// + /// End deserialization from given stream + /// + void EndDeserialize(); + } + + /// + /// Serializer base class for binary reader and writer + /// + /// + public abstract class BinaryObjectSerializer : IObjectSerializer + { + /// + /// Binary reader + /// + protected BinaryReader reader; + + /// + /// Binary writer + /// + protected BinaryWriter writer; + + /// + /// Begin deserialization + /// + /// + public void BeginDeserialize(Stream stream) + { + reader = new BinaryReader(stream); + } + + /// + /// Deserialize + /// + /// + public abstract void Deserialize(ref T obj); + + /// + /// End deserialize + /// + public void EndDeserialize() + { + } + + /// + /// Begin serialize + /// + /// + public void BeginSerialize(Stream stream) + { + writer = new BinaryWriter(stream); + } + + /// + /// Serialize + /// + /// + public abstract void Serialize(ref T obj); + + /// + /// End serialize + /// + public void EndSerialize() + { + writer.Dispose(); + } + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Recovery/Checkpoint.cs b/ZeroLevel/Services/FASTER/Index/Recovery/Checkpoint.cs new file mode 100644 index 0000000..2f07de3 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Recovery/Checkpoint.cs @@ -0,0 +1,729 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
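// ---------------------------------------------------------------------------
// Illustrative sketch only; not part of the copied FASTER sources. A concrete
// serializer built on the BinaryObjectSerializer base class above: override
// Serialize/Deserialize and use the protected reader/writer that
// BeginSerialize/BeginDeserialize attach to the stream. The generic parameter
// <T> is assumed (angle brackets were stripped from the copied doc comments),
// and MyBlob is a hypothetical class introduced only for this example.
// ---------------------------------------------------------------------------
using FASTER.core;

public sealed class MyBlob
{
    public int Id;
    public string Payload = string.Empty;
}

public sealed class MyBlobSerializer : BinaryObjectSerializer<MyBlob>
{
    public override void Serialize(ref MyBlob obj)
    {
        // Write the fields in a fixed order using the protected BinaryWriter.
        writer.Write(obj.Id);
        writer.Write(obj.Payload);
    }

    public override void Deserialize(ref MyBlob obj)
    {
        // Read the fields back in the same order using the protected BinaryReader.
        obj = new MyBlob
        {
            Id = reader.ReadInt32(),
            Payload = reader.ReadString()
        };
    }
}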
+ +#pragma warning disable 0162 + +//#define WAIT_FOR_INDEX_CHECKPOINT + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.core +{ + + /// + /// Checkpoint related function of FASTER + /// + public unsafe partial class FasterKV : FasterBase, IFasterKV + where Key : new() + where Value : new() + where Functions : IFunctions + { + private class EpochPhaseIdx + { + public const int PrepareForIndexCheckpt = 0; + + public const int Prepare = 1; + + public const int InProgress = 2; + + public const int WaitPending = 3; + + public const int WaitFlush = 4; + + public const int CheckpointCompletionCallback = 5; + } + + #region Starting points + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool InternalTakeCheckpoint(CheckpointType type) + { + if (_systemState.phase == Phase.GC) + { + Debug.WriteLine("Forcing completion of GC"); + GarbageCollectBuckets(0, true); + } + + if (_systemState.phase == Phase.REST) + { + var context = (long)type; + var currentState = SystemState.Make(Phase.REST, _systemState.version); + var nextState = GetNextState(currentState, type); + return GlobalMoveToNextState(currentState, nextState, ref context); + } + else + { + return false; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool InternalGrowIndex() + { + if (_systemState.phase == Phase.GC) + { + Debug.WriteLine("Forcing completion of GC"); + GarbageCollectBuckets(0, true); + } + + if (_systemState.phase == Phase.REST) + { + var version = _systemState.version; + long context = 0; + SystemState nextState = SystemState.Make(Phase.PREPARE_GROW, version); + if (GlobalMoveToNextState(SystemState.Make(Phase.REST, version), nextState, ref context)) + { + return true; + } + } + + return false; + } + #endregion + + + /// + /// Global transition function that coordinates various state machines. + /// A few characteristics about the state machine: + /// + /// + /// + /// Transitions happen atomically using a compare-and-swap operation. So, multiple threads can try to do the same transition. Only one will succeed. + /// + /// + /// + /// + /// Transition from state A to B happens via an intermediate state (INTERMEDIATE). This serves as a lock by a thread to perform the transition. + /// Some transitions are accompanied by actions that must be performed before the transitions such as initializing contexts, etc. + /// + /// + /// + /// + /// States can be part of multiple state machines. For example: PREP_INDEX_CHECKPOINT is part of both index-only and full checkpoints. + /// + /// + /// + /// + /// We currently support 5 different state machines: + /// + /// + /// Index-Only Checkpoint + /// REST -> PREP_INDEX_CHECKPOINT -> INDEX_CHECKPOINT -> REST + /// + /// + /// HybridLog-Only Checkpoint + /// REST -> PREPARE -> IN_PROGRESS -> WAIT_PENDING -> WAIT_FLUSH -> PERSISTENCE_CALLBACK -> REST + /// + /// + /// Full Checkpoint + /// REST -> PREP_INDEX_CHECKPOINT -> PREPARE -> IN_PROGRESS -> WAIT_PENDING -> WAIT_FLUSH -> PERSISTENCE_CALLBACK -> REST + /// + /// + /// GC + /// + /// + /// + /// Grow + /// + /// + /// + /// + /// from state of the transition. + /// to state of the transition. + /// optional additioanl parameter for transition. + /// true if transition succeeds. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool GlobalMoveToNextState(SystemState currentState, SystemState nextState, ref long context) + { + var intermediateState = SystemState.Make(Phase.INTERMEDIATE, currentState.version); + + // Move from S1 to I + if (MakeTransition(currentState, intermediateState)) + { + // Acquired ownership to make the transition from S1 to S2 + switch (nextState.phase) + { + case Phase.PREP_INDEX_CHECKPOINT: + { + _checkpointType = (CheckpointType)context; + switch (_checkpointType) + { + case CheckpointType.INDEX_ONLY: + { + _indexCheckpointToken = Guid.NewGuid(); + InitializeIndexCheckpoint(_indexCheckpointToken); + break; + } + case CheckpointType.FULL: + { + var fullCheckpointToken = Guid.NewGuid(); + _indexCheckpointToken = fullCheckpointToken; + _hybridLogCheckpointToken = fullCheckpointToken; + InitializeIndexCheckpoint(_indexCheckpointToken); + InitializeHybridLogCheckpoint(_hybridLogCheckpointToken, currentState.version); + break; + } + default: + throw new Exception(); + } + + ObtainCurrentTailAddress(ref _indexCheckpoint.info.startLogicalAddress); + + MakeTransition(intermediateState, nextState); + break; + } + case Phase.INDEX_CHECKPOINT: + { + if (UseReadCache && this.ReadCache.BeginAddress != this.ReadCache.TailAddress) + { + throw new Exception("Index checkpoint with read cache is not supported"); + } + TakeIndexFuzzyCheckpoint(); + + MakeTransition(intermediateState, nextState); + break; + } + case Phase.PREPARE: + { + switch (currentState.phase) + { + case Phase.REST: + { + _checkpointType = (CheckpointType)context; + Debug.Assert(_checkpointType == CheckpointType.HYBRID_LOG_ONLY); + _hybridLogCheckpointToken = Guid.NewGuid(); + InitializeHybridLogCheckpoint(_hybridLogCheckpointToken, currentState.version); + break; + } + case Phase.PREP_INDEX_CHECKPOINT: + { + if (UseReadCache && this.ReadCache.BeginAddress != this.ReadCache.TailAddress) + { + throw new Exception("Index checkpoint with read cache is not supported"); + } + TakeIndexFuzzyCheckpoint(); + break; + } + default: + throw new Exception(); + } + + ObtainCurrentTailAddress(ref _hybridLogCheckpoint.info.startLogicalAddress); + + if (!FoldOverSnapshot) + { + _hybridLogCheckpoint.info.flushedLogicalAddress = hlog.FlushedUntilAddress; + _hybridLogCheckpoint.info.useSnapshotFile = 1; + } + + MakeTransition(intermediateState, nextState); + break; + } + case Phase.IN_PROGRESS: + { + MakeTransition(intermediateState, nextState); + break; + } + case Phase.WAIT_PENDING: + { + var seg = hlog.GetSegmentOffsets(); + if (seg != null) + { + _hybridLogCheckpoint.info.objectLogSegmentOffsets = new long[seg.Length]; + Array.Copy(seg, _hybridLogCheckpoint.info.objectLogSegmentOffsets, seg.Length); + } + MakeTransition(intermediateState, nextState); + break; + } + case Phase.WAIT_FLUSH: + { + if (_checkpointType == CheckpointType.FULL) + { + _indexCheckpoint.info.num_buckets = overflowBucketsAllocator.GetMaxValidAddress(); + ObtainCurrentTailAddress(ref _indexCheckpoint.info.finalLogicalAddress); + } + + _hybridLogCheckpoint.info.headAddress = hlog.HeadAddress; + _hybridLogCheckpoint.info.beginAddress = hlog.BeginAddress; + + if (FoldOverSnapshot) + { + hlog.ShiftReadOnlyToTail(out long tailAddress); + + _hybridLogCheckpoint.info.finalLogicalAddress = tailAddress; + } + else + { + ObtainCurrentTailAddress(ref _hybridLogCheckpoint.info.finalLogicalAddress); + + _hybridLogCheckpoint.snapshotFileDevice = checkpointManager.GetSnapshotLogDevice(_hybridLogCheckpointToken); + 
_hybridLogCheckpoint.snapshotFileObjectLogDevice = checkpointManager.GetSnapshotObjectLogDevice(_hybridLogCheckpointToken); + _hybridLogCheckpoint.snapshotFileDevice.Initialize(hlog.GetSegmentSize()); + _hybridLogCheckpoint.snapshotFileObjectLogDevice.Initialize(hlog.GetSegmentSize()); + + long startPage = hlog.GetPage(_hybridLogCheckpoint.info.flushedLogicalAddress); + long endPage = hlog.GetPage(_hybridLogCheckpoint.info.finalLogicalAddress); + if (_hybridLogCheckpoint.info.finalLogicalAddress > hlog.GetStartLogicalAddress(endPage)) + { + endPage++; + } + + // This can be run on a new thread if we want to immediately parallelize + // the rest of the log flush + hlog.AsyncFlushPagesToDevice(startPage, + endPage, + _hybridLogCheckpoint.info.finalLogicalAddress, + _hybridLogCheckpoint.snapshotFileDevice, + _hybridLogCheckpoint.snapshotFileObjectLogDevice, + out _hybridLogCheckpoint.flushed); + } + + + MakeTransition(intermediateState, nextState); + break; + } + case Phase.PERSISTENCE_CALLBACK: + { + WriteHybridLogMetaInfo(); + + if (_checkpointType == CheckpointType.FULL) + WriteIndexMetaInfo(); + + MakeTransition(intermediateState, nextState); + break; + } + case Phase.GC: + { + hlog.ShiftBeginAddress(context); + + int numChunks = (int)(state[resizeInfo.version].size / Constants.kSizeofChunk); + if (numChunks == 0) numChunks = 1; // at least one chunk + + numPendingChunksToBeGCed = numChunks; + gcStatus = new long[numChunks]; + + MakeTransition(intermediateState, nextState); + break; + } + case Phase.PREPARE_GROW: + { + // Note that the transition must be done before bumping epoch here! + MakeTransition(intermediateState, nextState); + epoch.BumpCurrentEpoch(() => + { + long _context = 0; + GlobalMoveToNextState(nextState, SystemState.Make(Phase.IN_PROGRESS_GROW, nextState.version), ref _context); + }); + break; + } + case Phase.IN_PROGRESS_GROW: + { + // Set up the transition to new version of HT + int numChunks = (int)(state[resizeInfo.version].size / Constants.kSizeofChunk); + if (numChunks == 0) numChunks = 1; // at least one chunk + + numPendingChunksToBeSplit = numChunks; + splitStatus = new long[numChunks]; + + Initialize(1 - resizeInfo.version, state[resizeInfo.version].size * 2, sectorSize); + + resizeInfo.version = 1 - resizeInfo.version; + + MakeTransition(intermediateState, nextState); + break; + } + case Phase.REST: + { + switch (_checkpointType) + { + case CheckpointType.INDEX_ONLY: + { + _indexCheckpoint.info.num_buckets = overflowBucketsAllocator.GetMaxValidAddress(); + ObtainCurrentTailAddress(ref _indexCheckpoint.info.finalLogicalAddress); + WriteIndexMetaInfo(); + _indexCheckpoint.Reset(); + break; + } + case CheckpointType.FULL: + { + _indexCheckpoint.Reset(); + _hybridLogCheckpoint.Reset(); + break; + } + case CheckpointType.HYBRID_LOG_ONLY: + { + _hybridLogCheckpoint.Reset(); + break; + } + case CheckpointType.NONE: + break; + default: + throw new Exception(); + } + + _checkpointType = CheckpointType.NONE; + + MakeTransition(intermediateState, nextState); + break; + } + } + return true; + } + else + { + return false; + } + } + + /// + /// Corresponding thread-local actions that must be performed when any state machine is active + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void HandleCheckpointingPhases() + { + var previousState = SystemState.Make(threadCtx.Value.phase, threadCtx.Value.version); + var finalState = SystemState.Copy(ref _systemState); + + // Don't play around when system state is being changed + if (finalState.phase == 
Phase.INTERMEDIATE) + { + return; + } + + // We need to move from previousState to finalState one step at a time + do + { + var currentState = default(SystemState); + if (previousState.word == finalState.word) + { + currentState.word = previousState.word; + } + else + { + currentState = GetNextState(previousState, _checkpointType); + } + + switch (currentState.phase) + { + case Phase.PREP_INDEX_CHECKPOINT: + { + if (!threadCtx.Value.markers[EpochPhaseIdx.PrepareForIndexCheckpt]) + { + if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.PrepareForIndexCheckpt, threadCtx.Value.version)) + { + GlobalMoveToNextCheckpointState(currentState); + } + threadCtx.Value.markers[EpochPhaseIdx.PrepareForIndexCheckpt] = true; + } + break; + } + case Phase.INDEX_CHECKPOINT: + { + if (_checkpointType == CheckpointType.INDEX_ONLY) + { + // Reseting the marker for a potential FULL or INDEX_ONLY checkpoint in the future + threadCtx.Value.markers[EpochPhaseIdx.PrepareForIndexCheckpt] = false; + } + + if (IsIndexFuzzyCheckpointCompleted()) + { + GlobalMoveToNextCheckpointState(currentState); + } + break; + } + case Phase.PREPARE: + { + if (!threadCtx.Value.markers[EpochPhaseIdx.Prepare]) + { + // Thread local action + AcquireSharedLatchesForAllPendingRequests(); + + var idx = Interlocked.Increment(ref _hybridLogCheckpoint.info.numThreads); + idx -= 1; + + _hybridLogCheckpoint.info.guids[idx] = threadCtx.Value.guid; + + if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.Prepare, threadCtx.Value.version)) + { + GlobalMoveToNextCheckpointState(currentState); + } + + threadCtx.Value.markers[EpochPhaseIdx.Prepare] = true; + } + break; + } + case Phase.IN_PROGRESS: + { + // Need to be very careful here as threadCtx is changing + FasterExecutionContext ctx; + if (previousState.phase == Phase.PREPARE) + { + ctx = threadCtx.Value; + } + else + { + ctx = prevThreadCtx.Value; + } + + if (!ctx.markers[EpochPhaseIdx.InProgress]) + { + prevThreadCtx.Value = threadCtx.Value; + + InitLocalContext(prevThreadCtx.Value.guid); + + if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.InProgress, ctx.version)) + { + GlobalMoveToNextCheckpointState(currentState); + } + prevThreadCtx.Value.markers[EpochPhaseIdx.InProgress] = true; + } + break; + } + case Phase.WAIT_PENDING: + { + if (!prevThreadCtx.Value.markers[EpochPhaseIdx.WaitPending]) + { + var notify = (prevThreadCtx.Value.ioPendingRequests.Count == 0); + notify = notify && (prevThreadCtx.Value.retryRequests.Count == 0); + + if (notify) + { + if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.WaitPending, threadCtx.Value.version)) + { + GlobalMoveToNextCheckpointState(currentState); + } + prevThreadCtx.Value.markers[EpochPhaseIdx.WaitPending] = true; + } + + } + break; + } + case Phase.WAIT_FLUSH: + { + if (!prevThreadCtx.Value.markers[EpochPhaseIdx.WaitFlush]) + { + var notify = false; + if (FoldOverSnapshot) + { + notify = (hlog.FlushedUntilAddress >= _hybridLogCheckpoint.info.finalLogicalAddress); + } + else + { + notify = (_hybridLogCheckpoint.flushed != null) && _hybridLogCheckpoint.flushed.IsSet; + } + + if (_checkpointType == CheckpointType.FULL) + { + notify = notify && IsIndexFuzzyCheckpointCompleted(); + } + + if (notify) + { + _hybridLogCheckpoint.info.checkpointTokens.TryAdd(prevThreadCtx.Value.guid, prevThreadCtx.Value.serialNum); + + if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.WaitFlush, prevThreadCtx.Value.version)) + { + GlobalMoveToNextCheckpointState(currentState); + } + + prevThreadCtx.Value.markers[EpochPhaseIdx.WaitFlush] = true; + } + } + break; + } + + case 
Phase.PERSISTENCE_CALLBACK: + { + if (!prevThreadCtx.Value.markers[EpochPhaseIdx.CheckpointCompletionCallback]) + { + // Thread local action + functions.CheckpointCompletionCallback(threadCtx.Value.guid, prevThreadCtx.Value.serialNum); + + if (epoch.MarkAndCheckIsComplete(EpochPhaseIdx.CheckpointCompletionCallback, prevThreadCtx.Value.version)) + { + GlobalMoveToNextCheckpointState(currentState); + } + + prevThreadCtx.Value.markers[EpochPhaseIdx.CheckpointCompletionCallback] = true; + } + break; + } + case Phase.REST: + { + break; + } + default: + Debug.WriteLine("Error!"); + break; + } + + // update thread local variables + threadCtx.Value.phase = currentState.phase; + threadCtx.Value.version = currentState.version; + + previousState.word = currentState.word; + } while (previousState.word != finalState.word); + } + + #region Helper functions + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool GlobalMoveToNextCheckpointState(SystemState currentState) + { + long context = 0; + return GlobalMoveToNextState(currentState, GetNextState(currentState, _checkpointType), ref context); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool MakeTransition(SystemState currentState, SystemState nextState) + { + // Move from I to P2 + if (Interlocked.CompareExchange(ref _systemState.word, nextState.word, currentState.word) == currentState.word) + { + Debug.WriteLine("Moved to {0}, {1}", nextState.phase, nextState.version); + return true; + } + else + { + return false; + } + } + + private void AcquireSharedLatchesForAllPendingRequests() + { + foreach (var ctx in threadCtx.Value.retryRequests) + { + AcquireSharedLatch(ctx.key.Get()); + } + foreach (var ctx in threadCtx.Value.ioPendingRequests.Values) + { + AcquireSharedLatch(ctx.key.Get()); + } + } + + /* + * We have several state machines supported by this function. 
+ * Full Checkpoint: + * REST -> PREP_INDEX_CHECKPOINT -> PREPARE -> IN_PROGRESS + * -> WAIT_PENDING -> WAIT_FLUSH -> PERSISTENCE_CALLBACK -> REST + * + * Index Checkpoint: + * REST -> PREP_INDEX_CHECKPOINT -> INDEX_CHECKPOINT -> REST + * + * Hybrid Log Checkpoint: + * REST -> PREPARE -> IN_PROGRESS -> WAIT_PENDING -> WAIT_FLUSH -> + * -> PERSISTENCE_CALLBACK -> REST + * + * Grow : + * REST -> PREPARE_GROW -> IN_PROGRESS_GROW -> REST + * + * GC: + * REST -> GC -> REST + */ + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private SystemState GetNextState(SystemState start, CheckpointType type = CheckpointType.FULL) + { + + var nextState = default(SystemState); + nextState.word = start.word; + switch (start.phase) + { + case Phase.REST: + switch (type) + { + case CheckpointType.HYBRID_LOG_ONLY: + nextState.phase = Phase.PREPARE; + break; + case CheckpointType.FULL: + case CheckpointType.INDEX_ONLY: + nextState.phase = Phase.PREP_INDEX_CHECKPOINT; + break; + } + break; + case Phase.PREP_INDEX_CHECKPOINT: + switch (type) + { + case CheckpointType.INDEX_ONLY: + nextState.phase = Phase.INDEX_CHECKPOINT; + break; + case CheckpointType.FULL: + nextState.phase = Phase.PREPARE; + break; + } + break; + case Phase.INDEX_CHECKPOINT: + switch(type) + { + case CheckpointType.FULL: + nextState.phase = Phase.PREPARE; + break; + default: + nextState.phase = Phase.REST; + break; + } + break; + case Phase.PREPARE: + nextState.phase = Phase.IN_PROGRESS; + nextState.version = start.version + 1; + break; + case Phase.IN_PROGRESS: + nextState.phase = Phase.WAIT_PENDING; + break; + case Phase.WAIT_PENDING: + nextState.phase = Phase.WAIT_FLUSH; + break; + case Phase.WAIT_FLUSH: + nextState.phase = Phase.PERSISTENCE_CALLBACK; + break; + case Phase.PERSISTENCE_CALLBACK: + nextState.phase = Phase.REST; + break; + + case Phase.GC: + nextState.phase = Phase.REST; + break; + case Phase.PREPARE_GROW: + nextState.phase = Phase.IN_PROGRESS_GROW; + break; + case Phase.IN_PROGRESS_GROW: + nextState.phase = Phase.REST; + break; + } + return nextState; + } + + private void WriteHybridLogMetaInfo() + { + checkpointManager.CommitLogCheckpoint(_hybridLogCheckpointToken, _hybridLogCheckpoint.info.ToByteArray()); + } + + private void WriteIndexMetaInfo() + { + checkpointManager.CommitIndexCheckpoint(_indexCheckpointToken, _indexCheckpoint.info.ToByteArray()); + } + + private bool ObtainCurrentTailAddress(ref long location) + { + var tailAddress = hlog.GetTailAddress(); + return Interlocked.CompareExchange(ref location, tailAddress, 0) == 0; + } + + private void InitializeIndexCheckpoint(Guid indexToken) + { + _indexCheckpoint.Initialize(indexToken, state[resizeInfo.version].size, checkpointManager); + } + + private void InitializeHybridLogCheckpoint(Guid hybridLogToken, int version) + { + _hybridLogCheckpoint.Initialize(hybridLogToken, version, checkpointManager); + } + + #endregion + } +} diff --git a/ZeroLevel/Services/FASTER/Index/Recovery/DirectoryConfiguration.cs b/ZeroLevel/Services/FASTER/Index/Recovery/DirectoryConfiguration.cs new file mode 100644 index 0000000..5868b21 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Recovery/DirectoryConfiguration.cs @@ -0,0 +1,134 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
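// ---------------------------------------------------------------------------
// Illustrative sketch only; not part of the copied FASTER sources. Driving the
// checkpoint state machine implemented above from user code: a full checkpoint
// moves the system through
//   REST -> PREP_INDEX_CHECKPOINT -> PREPARE -> IN_PROGRESS -> WAIT_PENDING
//        -> WAIT_FLUSH -> PERSISTENCE_CALLBACK -> REST,
// and every active session must keep calling Refresh()/CompletePending() so the
// epoch, and hence the phases, can advance. Only TakeFullCheckpoint,
// CompleteCheckpoint and Recover come from IFasterKV as copied in this patch;
// the dynamically typed 'store' parameter stands in for a concrete FasterKV
// instantiation, whose construction is assumed.
// ---------------------------------------------------------------------------
using System;

public static class CheckpointSketch
{
    public static void CheckpointThenRecover(dynamic store)
    {
        // Initiate a full checkpoint (index + hybrid log); the token names it.
        Guid token;
        if (store.TakeFullCheckpoint(out token))
        {
            // Spin-wait until all phases, including PERSISTENCE_CALLBACK,
            // have completed and the checkpoint metadata has been committed.
            store.CompleteCheckpoint(true);
        }

        // After a restart, a store built over the same devices and checkpoint
        // directory can recover from that specific full checkpoint ...
        store.Recover(token);
        // ... or from the latest successful checkpoints:
        // store.Recover();
    }
}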
+ +using System; +using System.IO; + +namespace FASTER.core +{ + class DirectoryConfiguration + { + private readonly string checkpointDir; + + public DirectoryConfiguration(string checkpointDir) + { + this.checkpointDir = checkpointDir; + } + + public const string index_base_folder = "index-checkpoints"; + public const string index_meta_file = "info"; + public const string hash_table_file = "ht"; + public const string overflow_buckets_file = "ofb"; + public const string snapshot_file = "snapshot"; + + public const string cpr_base_folder = "cpr-checkpoints"; + public const string cpr_meta_file = "info"; + + public void CreateIndexCheckpointFolder(Guid token) + { + var directory = GetIndexCheckpointFolder(token); + Directory.CreateDirectory(directory); + DirectoryInfo directoryInfo = new System.IO.DirectoryInfo(directory); + foreach (System.IO.FileInfo file in directoryInfo.GetFiles()) + file.Delete(); + } + public void CreateHybridLogCheckpointFolder(Guid token) + { + var directory = GetHybridLogCheckpointFolder(token); + Directory.CreateDirectory(directory); + DirectoryInfo directoryInfo = new System.IO.DirectoryInfo(directory); + foreach (System.IO.FileInfo file in directoryInfo.GetFiles()) + file.Delete(); + } + + public string GetIndexCheckpointFolder(Guid token = default(Guid)) + { + if (token != default(Guid)) + return GetMergedFolderPath(checkpointDir, index_base_folder, token.ToString()); + else + return GetMergedFolderPath(checkpointDir, index_base_folder); + } + + public string GetHybridLogCheckpointFolder(Guid token = default(Guid)) + { + if (token != default(Guid)) + return GetMergedFolderPath(checkpointDir, cpr_base_folder, token.ToString()); + else + return GetMergedFolderPath(checkpointDir, cpr_base_folder); + } + + public string GetIndexCheckpointMetaFileName(Guid token) + { + return GetMergedFolderPath(checkpointDir, + index_base_folder, + token.ToString(), + index_meta_file, + ".dat"); + } + + public string GetPrimaryHashTableFileName(Guid token) + { + return GetMergedFolderPath(checkpointDir, + index_base_folder, + token.ToString(), + hash_table_file, + ".dat"); + } + + public string GetOverflowBucketsFileName(Guid token) + { + return GetMergedFolderPath(checkpointDir, + index_base_folder, + token.ToString(), + overflow_buckets_file, + ".dat"); + } + + public string GetHybridLogCheckpointMetaFileName(Guid token) + { + return GetMergedFolderPath(checkpointDir, + cpr_base_folder, + token.ToString(), + cpr_meta_file, + ".dat"); + } + + public string GetHybridLogCheckpointContextFileName(Guid checkpointToken, Guid sessionToken) + { + return GetMergedFolderPath(checkpointDir, + cpr_base_folder, + checkpointToken.ToString(), + sessionToken.ToString(), + ".dat"); + } + + public string GetLogSnapshotFileName(Guid token) + { + return GetMergedFolderPath(checkpointDir, cpr_base_folder, token.ToString(), snapshot_file, ".dat"); + } + + public string GetObjectLogSnapshotFileName(Guid token) + { + return GetMergedFolderPath(checkpointDir, cpr_base_folder, token.ToString(), snapshot_file, ".obj.dat"); + } + + private static string GetMergedFolderPath(params String[] paths) + { + String fullPath = paths[0]; + + for (int i = 1; i < paths.Length; i++) + { + if (i == paths.Length - 1 && paths[i].Contains(".")) + { + fullPath += paths[i]; + } + else + { + fullPath += Path.DirectorySeparatorChar + paths[i]; + } + } + + return fullPath; + } + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Recovery/ICheckpointManager.cs 
b/ZeroLevel/Services/FASTER/Index/Recovery/ICheckpointManager.cs new file mode 100644 index 0000000..947d06d --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Recovery/ICheckpointManager.cs @@ -0,0 +1,111 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Interface for users to control creation and retrieval of checkpoint-related data + /// FASTER calls this interface during checkpoint/recovery in this sequence: + /// + /// Checkpoint: + /// InitializeIndexCheckpoint (for index checkpoints) -> + /// GetIndexDevice (for index checkpoints) -> + /// InitializeLogCheckpoint (for log checkpoints) -> + /// GetSnapshotLogDevice (for log checkpoints in snapshot mode) -> + /// GetSnapshotObjectLogDevice (for log checkpoints in snapshot mode with objects) -> + /// CommitLogCheckpoint (for log checkpoints) -> + /// CommitIndexCheckpoint (for index checkpoints) -> + /// + /// Recovery: + /// GetLatestCheckpoint (if request to recover to latest checkpoint) -> + /// GetIndexCommitMetadata -> + /// GetLogCommitMetadata -> + /// GetIndexDevice -> + /// GetSnapshotLogDevice (for recovery in snapshot mode) -> + /// GetSnapshotObjectLogDevice (for recovery in snapshot mode with objects) + /// + /// Provided devices will be closed directly by FASTER when done. + /// + public interface ICheckpointManager + { + /// + /// Initialize index checkpoint + /// + /// + void InitializeIndexCheckpoint(Guid indexToken); + + /// + /// Initialize log checkpoint (snapshot and fold-over) + /// + /// + void InitializeLogCheckpoint(Guid logToken); + + /// + /// Commit index checkpoint + /// + /// + /// + /// + void CommitIndexCheckpoint(Guid indexToken, byte[] commitMetadata); + + /// + /// Commit log checkpoint (snapshot and fold-over) + /// + /// + /// + /// + void CommitLogCheckpoint(Guid logToken, byte[] commitMetadata); + + /// + /// Retrieve commit metadata for specified index checkpoint + /// + /// Token + /// Metadata, or null if invalid + byte[] GetIndexCommitMetadata(Guid indexToken); + + /// + /// Retrieve commit metadata for specified log checkpoint + /// + /// Token + /// Metadata, or null if invalid + byte[] GetLogCommitMetadata(Guid logToken); + + /// + /// Provide device to store index checkpoint (including overflow buckets) + /// + /// + /// + IDevice GetIndexDevice(Guid indexToken); + + /// + /// Provide device to store snapshot of log (required only for snapshot checkpoints) + /// + /// + /// + IDevice GetSnapshotLogDevice(Guid token); + + /// + /// Provide device to store snapshot of object log (required only for snapshot checkpoints) + /// + /// + /// + IDevice GetSnapshotObjectLogDevice(Guid token); + + /// + /// Get latest valid checkpoint for recovery + /// + /// + /// + /// true if latest valid checkpoint found, false otherwise + bool GetLatestCheckpoint(out Guid indexToken, out Guid logToken); + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Recovery/IndexCheckpoint.cs b/ZeroLevel/Services/FASTER/Index/Recovery/IndexCheckpoint.cs new file mode 100644 index 0000000..b0d228f --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Recovery/IndexCheckpoint.cs @@ -0,0 +1,133 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
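// ---------------------------------------------------------------------------
// Illustrative sketch only; not part of the copied FASTER sources. A minimal
// ICheckpointManager (e.g. for tests) that keeps commit metadata in memory and
// places index/snapshot files under a base directory. It implements exactly the
// interface copied above; the file layout chosen here is an assumption of this
// sketch and is not the layout used by LocalCheckpointManager.
// ---------------------------------------------------------------------------
using System;
using System.Collections.Concurrent;
using System.IO;
using FASTER.core;

public sealed class InMemoryCheckpointManager : ICheckpointManager
{
    private readonly string baseDir;
    private readonly ConcurrentDictionary<Guid, byte[]> indexCommits = new ConcurrentDictionary<Guid, byte[]>();
    private readonly ConcurrentDictionary<Guid, byte[]> logCommits = new ConcurrentDictionary<Guid, byte[]>();
    private Guid latestIndexToken, latestLogToken;

    public InMemoryCheckpointManager(string baseDir) { this.baseDir = baseDir; }

    public void InitializeIndexCheckpoint(Guid indexToken) => Directory.CreateDirectory(Path.Combine(baseDir, indexToken.ToString()));
    public void InitializeLogCheckpoint(Guid logToken) => Directory.CreateDirectory(Path.Combine(baseDir, logToken.ToString()));

    // Commit = remember the metadata blob; FASTER asks for it again on recovery.
    public void CommitIndexCheckpoint(Guid indexToken, byte[] commitMetadata)
    { indexCommits[indexToken] = commitMetadata; latestIndexToken = indexToken; }
    public void CommitLogCheckpoint(Guid logToken, byte[] commitMetadata)
    { logCommits[logToken] = commitMetadata; latestLogToken = logToken; }

    public byte[] GetIndexCommitMetadata(Guid indexToken) => indexCommits.TryGetValue(indexToken, out var m) ? m : null;
    public byte[] GetLogCommitMetadata(Guid logToken) => logCommits.TryGetValue(logToken, out var m) ? m : null;

    // Devices for the primary hash table, log snapshot and object-log snapshot;
    // Devices.CreateLogDevice is used the same way as in LocalCheckpointManager.
    public IDevice GetIndexDevice(Guid indexToken) => Devices.CreateLogDevice(Path.Combine(baseDir, indexToken.ToString(), "ht.dat"), false);
    public IDevice GetSnapshotLogDevice(Guid token) => Devices.CreateLogDevice(Path.Combine(baseDir, token.ToString(), "snapshot.dat"), false);
    public IDevice GetSnapshotObjectLogDevice(Guid token) => Devices.CreateLogDevice(Path.Combine(baseDir, token.ToString(), "snapshot.obj.dat"), false);

    public bool GetLatestCheckpoint(out Guid indexToken, out Guid logToken)
    {
        indexToken = latestIndexToken;
        logToken = latestLogToken;
        return indexCommits.ContainsKey(indexToken) && logCommits.ContainsKey(logToken);
    }
}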
+ +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text; +using System.Threading; +using System.Threading.Tasks; + +namespace FASTER.core +{ + + + public unsafe partial class FasterBase + { + // Derived class facing persistence API + internal IndexCheckpointInfo _indexCheckpoint; + + internal void TakeIndexFuzzyCheckpoint() + { + var ht_version = resizeInfo.version; + + TakeMainIndexCheckpoint(ht_version, + _indexCheckpoint.main_ht_device, + out ulong ht_num_bytes_written); + + var sectorSize = _indexCheckpoint.main_ht_device.SectorSize; + var alignedIndexSize = (uint)((ht_num_bytes_written + (sectorSize - 1)) & ~(sectorSize - 1)); + overflowBucketsAllocator.TakeCheckpoint(_indexCheckpoint.main_ht_device, alignedIndexSize, out ulong ofb_num_bytes_written); + _indexCheckpoint.info.num_ht_bytes = ht_num_bytes_written; + _indexCheckpoint.info.num_ofb_bytes = ofb_num_bytes_written; + } + + internal void TakeIndexFuzzyCheckpoint(int ht_version, IDevice device, + out ulong numBytesWritten, IDevice ofbdevice, + out ulong ofbnumBytesWritten, out int num_ofb_buckets) + { + TakeMainIndexCheckpoint(ht_version, device, out numBytesWritten); + var sectorSize = device.SectorSize; + var alignedIndexSize = (uint)((numBytesWritten + (sectorSize - 1)) & ~(sectorSize - 1)); + overflowBucketsAllocator.TakeCheckpoint(ofbdevice, alignedIndexSize, out ofbnumBytesWritten); + num_ofb_buckets = overflowBucketsAllocator.GetMaxValidAddress(); + } + + internal bool IsIndexFuzzyCheckpointCompleted(bool waitUntilComplete = false) + { + bool completed1 = IsMainIndexCheckpointCompleted(waitUntilComplete); + bool completed2 = overflowBucketsAllocator.IsCheckpointCompleted(waitUntilComplete); + return completed1 && completed2; + } + + + // Implementation of an asynchronous checkpointing scheme + // for main hash index of FASTER + private CountdownEvent mainIndexCheckpointEvent; + + private void TakeMainIndexCheckpoint(int tableVersion, + IDevice device, + out ulong numBytes) + { + BeginMainIndexCheckpoint(tableVersion, device, out numBytes); + } + + private void BeginMainIndexCheckpoint( + int version, + IDevice device, + out ulong numBytesWritten) + { + int numChunks = 1; + long totalSize = state[version].size * sizeof(HashBucket); + Debug.Assert(totalSize < (long)uint.MaxValue); // required since numChunks = 1 + + uint chunkSize = (uint)(totalSize / numChunks); + mainIndexCheckpointEvent = new CountdownEvent(numChunks); + HashBucket* start = state[version].tableAligned; + + numBytesWritten = 0; + for (int index = 0; index < numChunks; index++) + { + long chunkStartBucket = (long)start + (index * chunkSize); + HashIndexPageAsyncFlushResult result = default(HashIndexPageAsyncFlushResult); + result.chunkIndex = index; + device.WriteAsync((IntPtr)chunkStartBucket, numBytesWritten, chunkSize, AsyncPageFlushCallback, result); + numBytesWritten += chunkSize; + } + } + + + private bool IsMainIndexCheckpointCompleted(bool waitUntilComplete = false) + { + bool completed = mainIndexCheckpointEvent.IsSet; + if (!completed && waitUntilComplete) + { + mainIndexCheckpointEvent.Wait(); + return true; + } + return completed; + } + + private void AsyncPageFlushCallback( + uint errorCode, + uint numBytes, + NativeOverlapped* overlap) + { + //Set the page status to flushed + var result = (HashIndexPageAsyncFlushResult)Overlapped.Unpack(overlap).AsyncResult; + + try + { + if (errorCode 
!= 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + } + catch (Exception ex) + { + Trace.TraceError("Completion Callback error, {0}", ex.Message); + } + finally + { + mainIndexCheckpointEvent.Signal(); + Overlapped.Free(overlap); + } + } + + } + +} diff --git a/ZeroLevel/Services/FASTER/Index/Recovery/IndexRecovery.cs b/ZeroLevel/Services/FASTER/Index/Recovery/IndexRecovery.cs new file mode 100644 index 0000000..d1f0bb9 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Recovery/IndexRecovery.cs @@ -0,0 +1,144 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text; +using System.Threading; +using System.Threading.Tasks; + +namespace FASTER.core +{ + /// + /// + /// + public unsafe partial class FasterBase + { + internal ICheckpointManager checkpointManager; + + // Derived class exposed API + internal void RecoverFuzzyIndex(IndexCheckpointInfo info) + { + var token = info.info.token; + var ht_version = resizeInfo.version; + Debug.Assert(state[ht_version].size == info.info.table_size); + + // Create devices to read from using Async API + info.main_ht_device = checkpointManager.GetIndexDevice(token); + + BeginMainIndexRecovery(ht_version, + info.main_ht_device, + info.info.num_ht_bytes); + + var sectorSize = info.main_ht_device.SectorSize; + var alignedIndexSize = (uint)((info.info.num_ht_bytes + (sectorSize - 1)) & ~(sectorSize - 1)); + + overflowBucketsAllocator.Recover(info.main_ht_device, alignedIndexSize, info.info.num_buckets, info.info.num_ofb_bytes); + + // Wait until reading is complete + IsFuzzyIndexRecoveryComplete(true); + + // close index checkpoint files appropriately + info.main_ht_device.Close(); + + // Delete all tentative entries! 
+ DeleteTentativeEntries(); + } + + internal void RecoverFuzzyIndex(int ht_version, IDevice device, ulong num_ht_bytes, IDevice ofbdevice, int num_buckets, ulong num_ofb_bytes) + { + BeginMainIndexRecovery(ht_version, device, num_ht_bytes); + var sectorSize = device.SectorSize; + var alignedIndexSize = (uint)((num_ht_bytes + (sectorSize - 1)) & ~(sectorSize - 1)); + overflowBucketsAllocator.Recover(ofbdevice, alignedIndexSize, num_buckets, num_ofb_bytes); + } + + internal bool IsFuzzyIndexRecoveryComplete(bool waitUntilComplete = false) + { + bool completed1 = IsMainIndexRecoveryCompleted(waitUntilComplete); + bool completed2 = overflowBucketsAllocator.IsRecoveryCompleted(waitUntilComplete); + return completed1 && completed2; + } + + //Main Index Recovery Functions + private CountdownEvent mainIndexRecoveryEvent; + + private void BeginMainIndexRecovery( + int version, + IDevice device, + ulong num_bytes) + { + int numChunksToBeRecovered = 1; + long totalSize = state[version].size * sizeof(HashBucket); + Debug.Assert(totalSize < (long)uint.MaxValue); // required since numChunks = 1 + + uint chunkSize = (uint)(totalSize / numChunksToBeRecovered); + mainIndexRecoveryEvent = new CountdownEvent(numChunksToBeRecovered); + HashBucket* start = state[version].tableAligned; + + ulong numBytesRead = 0; + for (int index = 0; index < numChunksToBeRecovered; index++) + { + long chunkStartBucket = (long)start + (index * chunkSize); + HashIndexPageAsyncReadResult result = default(HashIndexPageAsyncReadResult); + result.chunkIndex = index; + device.ReadAsync(numBytesRead, (IntPtr)chunkStartBucket, chunkSize, AsyncPageReadCallback, result); + numBytesRead += chunkSize; + } + Debug.Assert(numBytesRead == num_bytes); + } + + private bool IsMainIndexRecoveryCompleted( + bool waitUntilComplete = false) + { + bool completed = mainIndexRecoveryEvent.IsSet; + if (!completed && waitUntilComplete) + { + mainIndexRecoveryEvent.Wait(); + return true; + } + return completed; + } + + private unsafe void AsyncPageReadCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + mainIndexRecoveryEvent.Signal(); + Overlapped.Free(overlap); + } + + internal void DeleteTentativeEntries() + { + HashBucketEntry entry = default(HashBucketEntry); + + int version = resizeInfo.version; + var table_size_ = state[version].size; + var ptable_ = state[version].tableAligned; + + for (long bucket = 0; bucket < table_size_; ++bucket) + { + HashBucket b = *(ptable_ + bucket); + while (true) + { + for (int bucket_entry = 0; bucket_entry < Constants.kOverflowBucketIndex; ++bucket_entry) + { + entry.word = b.bucket_entries[bucket_entry]; + if (entry.Tentative) + b.bucket_entries[bucket_entry] = 0; + } + + if (b.bucket_entries[Constants.kOverflowBucketIndex] == 0) break; + b = *((HashBucket*)overflowBucketsAllocator.GetPhysicalAddress((b.bucket_entries[Constants.kOverflowBucketIndex]))); + } + } + } + } +} diff --git a/ZeroLevel/Services/FASTER/Index/Recovery/LocalCheckpointManager.cs b/ZeroLevel/Services/FASTER/Index/Recovery/LocalCheckpointManager.cs new file mode 100644 index 0000000..9eb270d --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Recovery/LocalCheckpointManager.cs @@ -0,0 +1,206 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
+ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Implementation of checkpoint interface for local file storage + /// + public class LocalCheckpointManager : ICheckpointManager + { + private DirectoryConfiguration directoryConfiguration; + + /// + /// Create new instance of local checkpoint manager at given base directory + /// + /// + public LocalCheckpointManager(string CheckpointDir) + { + directoryConfiguration = new DirectoryConfiguration(CheckpointDir); + } + + /// + /// Initialize index checkpoint + /// + /// + public void InitializeIndexCheckpoint(Guid indexToken) + { + directoryConfiguration.CreateIndexCheckpointFolder(indexToken); + } + + /// + /// Initialize log checkpoint (snapshot and fold-over) + /// + /// + public void InitializeLogCheckpoint(Guid logToken) + { + directoryConfiguration.CreateHybridLogCheckpointFolder(logToken); + } + + /// + /// Commit index checkpoint + /// + /// + /// + public void CommitIndexCheckpoint(Guid indexToken, byte[] commitMetadata) + { + string filename = directoryConfiguration.GetIndexCheckpointMetaFileName(indexToken); + using (var writer = new BinaryWriter(new FileStream(filename, FileMode.Create))) + { + writer.Write(commitMetadata.Length); + writer.Write(commitMetadata); + writer.Flush(); + } + + string completed_filename = directoryConfiguration.GetIndexCheckpointFolder(indexToken); + completed_filename += Path.DirectorySeparatorChar + "completed.dat"; + using (var file = new FileStream(completed_filename, FileMode.Create)) + { + file.Flush(); + } + } + + /// + /// Commit log checkpoint (snapshot and fold-over) + /// + /// + /// + public void CommitLogCheckpoint(Guid logToken, byte[] commitMetadata) + { + string filename = directoryConfiguration.GetHybridLogCheckpointMetaFileName(logToken); + using (var writer = new BinaryWriter(new FileStream(filename, FileMode.Create))) + { + writer.Write(commitMetadata.Length); + writer.Write(commitMetadata); + writer.Flush(); + } + + string completed_filename = directoryConfiguration.GetHybridLogCheckpointFolder(logToken); + completed_filename += Path.DirectorySeparatorChar + "completed.dat"; + using (var file = new FileStream(completed_filename, FileMode.Create)) + { + file.Flush(); + } + } + + /// + /// Retrieve commit metadata for specified index checkpoint + /// + /// Token + /// Metadata, or null if invalid + public byte[] GetIndexCommitMetadata(Guid indexToken) + { + var dir = new DirectoryInfo(directoryConfiguration.GetIndexCheckpointFolder(indexToken)); + if (!File.Exists(dir.FullName + Path.DirectorySeparatorChar + "completed.dat")) + return null; + + string filename = directoryConfiguration.GetIndexCheckpointMetaFileName(indexToken); + using (var reader = new BinaryReader(new FileStream(filename, FileMode.Open))) + { + var len = reader.ReadInt32(); + return reader.ReadBytes(len); + } + } + + /// + /// Retrieve commit metadata for specified log checkpoint + /// + /// Token + /// Metadata, or null if invalid + public byte[] GetLogCommitMetadata(Guid logToken) + { + var dir = new DirectoryInfo(directoryConfiguration.GetHybridLogCheckpointFolder(logToken)); + if (!File.Exists(dir.FullName + Path.DirectorySeparatorChar + "completed.dat")) + return null; + + string checkpointInfoFile = directoryConfiguration.GetHybridLogCheckpointMetaFileName(logToken); + using (var reader = new 
BinaryReader(new FileStream(checkpointInfoFile, FileMode.Open))) + { + var len = reader.ReadInt32(); + return reader.ReadBytes(len); + } + } + + /// + /// Provide device to store index checkpoint (including overflow buckets) + /// + /// + /// + public IDevice GetIndexDevice(Guid indexToken) + { + return Devices.CreateLogDevice(directoryConfiguration.GetPrimaryHashTableFileName(indexToken), false); + } + + /// + /// Provide device to store snapshot of log (required only for snapshot checkpoints) + /// + /// + /// + public IDevice GetSnapshotLogDevice(Guid token) + { + return Devices.CreateLogDevice(directoryConfiguration.GetLogSnapshotFileName(token), false); + } + + /// + /// Provide device to store snapshot of object log (required only for snapshot checkpoints) + /// + /// + /// + public IDevice GetSnapshotObjectLogDevice(Guid token) + { + return Devices.CreateLogDevice(directoryConfiguration.GetObjectLogSnapshotFileName(token), false); + } + + /// + /// Get latest valid checkpoint for recovery + /// + /// + /// + /// + public bool GetLatestCheckpoint(out Guid indexToken, out Guid logToken) + { + var indexCheckpointDir = new DirectoryInfo(directoryConfiguration.GetIndexCheckpointFolder()); + var dirs = indexCheckpointDir.GetDirectories(); + foreach (var dir in dirs) + { + // Remove incomplete checkpoints + if (!File.Exists(dir.FullName + Path.DirectorySeparatorChar + "completed.dat")) + { + Directory.Delete(dir.FullName, true); + } + } + var latestICFolder = indexCheckpointDir.GetDirectories().OrderByDescending(f => f.LastWriteTime).First(); + if (latestICFolder == null || !Guid.TryParse(latestICFolder.Name, out indexToken)) + { + throw new Exception("No valid index checkpoint to recover from"); + } + + + var hlogCheckpointDir = new DirectoryInfo(directoryConfiguration.GetHybridLogCheckpointFolder()); + dirs = hlogCheckpointDir.GetDirectories(); + foreach (var dir in dirs) + { + // Remove incomplete checkpoints + if (!File.Exists(dir.FullName + Path.DirectorySeparatorChar + "completed.dat")) + { + Directory.Delete(dir.FullName, true); + } + } + var latestHLCFolder = hlogCheckpointDir.GetDirectories().OrderByDescending(f => f.LastWriteTime).First(); + if (latestHLCFolder == null || !Guid.TryParse(latestHLCFolder.Name, out logToken)) + { + throw new Exception("No valid hybrid log checkpoint to recover from"); + } + return true; + } + } +} \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Index/Recovery/Recovery.cs b/ZeroLevel/Services/FASTER/Index/Recovery/Recovery.cs new file mode 100644 index 0000000..4ae0388 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Index/Recovery/Recovery.cs @@ -0,0 +1,500 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
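// ---------------------------------------------------------------------------
// Orientation note, not part of the copied FASTER sources: the on-disk layout
// implied by DirectoryConfiguration and LocalCheckpointManager above, with GUID
// tokens naming each checkpoint (file roles inferred from the helpers that
// build these paths):
//
//   <checkpointDir>/
//     index-checkpoints/
//       <index-token>/
//         info.dat            index commit metadata (CommitIndexCheckpoint)
//         ht.dat              primary hash table device (GetIndexDevice)
//         ofb.dat             overflow buckets file (GetOverflowBucketsFileName)
//         completed.dat       empty marker written on successful commit
//     cpr-checkpoints/
//       <log-token>/
//         info.dat            hybrid log commit metadata (CommitLogCheckpoint)
//         snapshot.dat        log snapshot (snapshot checkpoints only)
//         snapshot.obj.dat    object log snapshot (snapshot checkpoints only)
//         <session-guid>.dat  per-session CPR context
//         completed.dat       empty marker written on successful commit
//
// GetLatestCheckpoint treats any token folder without completed.dat as an
// incomplete checkpoint, deletes it, and then picks the most recently written
// folder on each side as the pair to recover from.
// ---------------------------------------------------------------------------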
+ +#pragma warning disable 0162 + +using System; +using System.Diagnostics; +using System.IO; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Threading; +using System.Linq; +using System.Collections.Generic; + +namespace FASTER.core +{ + + internal enum ReadStatus { Pending, Done }; + internal enum FlushStatus { Pending, Done }; + + internal class RecoveryStatus + { + public long startPage; + public long endPage; + public long untilAddress; + public int capacity; + + public IDevice recoveryDevice; + public long recoveryDevicePageOffset; + public IDevice objectLogRecoveryDevice; + + public ReadStatus[] readStatus; + public FlushStatus[] flushStatus; + + public RecoveryStatus(int capacity, + long startPage, + long endPage, long untilAddress) + { + this.capacity = capacity; + this.startPage = startPage; + this.endPage = endPage; + this.untilAddress = untilAddress; + readStatus = new ReadStatus[capacity]; + flushStatus = new FlushStatus[capacity]; + for (int i = 0; i < capacity; i++) + { + flushStatus[i] = FlushStatus.Done; + readStatus[i] = ReadStatus.Pending; + } + } + } + + + /// + /// Partial class for recovery code in FASTER + /// + public unsafe partial class FasterKV : FasterBase, IFasterKV + where Key : new() + where Value : new() + where Functions : IFunctions + { + + private void InternalRecoverFromLatestCheckpoints() + { + checkpointManager.GetLatestCheckpoint(out Guid indexCheckpointGuid, out Guid hybridLogCheckpointGuid); + InternalRecover(indexCheckpointGuid, hybridLogCheckpointGuid); + } + + private bool IsCompatible(IndexRecoveryInfo indexInfo, HybridLogRecoveryInfo recoveryInfo) + { + var l1 = indexInfo.finalLogicalAddress; + var l2 = recoveryInfo.finalLogicalAddress; + return l1 <= l2; + } + + private void InternalRecover(Guid indexToken, Guid hybridLogToken) + { + Debug.WriteLine("********* Primary Recovery Information ********"); + Debug.WriteLine("Index Checkpoint: {0}", indexToken); + Debug.WriteLine("HybridLog Checkpoint: {0}", hybridLogToken); + + // Recovery appropriate context information + var recoveredICInfo = new IndexCheckpointInfo(); + recoveredICInfo.Recover(indexToken, checkpointManager); + recoveredICInfo.info.DebugPrint(); + + var recoveredHLCInfo = new HybridLogCheckpointInfo(); + recoveredHLCInfo.Recover(hybridLogToken, checkpointManager); + recoveredHLCInfo.info.DebugPrint(); + + // Check if the two checkpoints are compatible for recovery + if (!IsCompatible(recoveredICInfo.info, recoveredHLCInfo.info)) + { + throw new Exception("Cannot recover from (" + indexToken.ToString() + "," + hybridLogToken.ToString() + ") checkpoint pair!\n"); + } + + // Set new system state after recovery + var v = recoveredHLCInfo.info.version; + _systemState.phase = Phase.REST; + _systemState.version = (v + 1); + + // Recover fuzzy index from checkpoint + RecoverFuzzyIndex(recoveredICInfo); + + // Recover segment offsets for object log + if (recoveredHLCInfo.info.objectLogSegmentOffsets != null) + Array.Copy(recoveredHLCInfo.info.objectLogSegmentOffsets, + hlog.GetSegmentOffsets(), + recoveredHLCInfo.info.objectLogSegmentOffsets.Length); + + + // Make index consistent for version v + if (FoldOverSnapshot) + { + RecoverHybridLog(recoveredICInfo.info, recoveredHLCInfo.info); + } + else + { + RecoverHybridLogFromSnapshotFile(recoveredICInfo.info, recoveredHLCInfo.info); + } + + + // Read appropriate hybrid log pages into memory + hlog.RestoreHybridLog(recoveredHLCInfo.info.finalLogicalAddress, recoveredHLCInfo.info.headAddress, 
recoveredHLCInfo.info.beginAddress); + + // Recover session information + _recoveredSessions = recoveredHLCInfo.info.continueTokens; + } + + private void RecoverHybridLog(IndexRecoveryInfo indexRecoveryInfo, + HybridLogRecoveryInfo recoveryInfo) + { + var fromAddress = indexRecoveryInfo.startLogicalAddress; + var untilAddress = recoveryInfo.finalLogicalAddress; + + var startPage = hlog.GetPage(fromAddress); + var endPage = hlog.GetPage(untilAddress); + if ((untilAddress > hlog.GetStartLogicalAddress(endPage)) && (untilAddress > fromAddress)) + { + endPage++; + } + + // By default first page has one extra record + var capacity = hlog.GetCapacityNumPages(); + var recoveryStatus = new RecoveryStatus(capacity, startPage, endPage, untilAddress); + + int totalPagesToRead = (int)(endPage - startPage); + int numPagesToReadFirst = Math.Min(capacity, totalPagesToRead); + + // Issue request to read pages as much as possible + hlog.AsyncReadPagesFromDevice(startPage, numPagesToReadFirst, untilAddress, hlog.AsyncReadPagesCallbackForRecovery, recoveryStatus); + + for (long page = startPage; page < endPage; page++) + { + // Ensure page has been read into memory + int pageIndex = hlog.GetPageIndexForPage(page); + while (recoveryStatus.readStatus[pageIndex] == ReadStatus.Pending) + { + Thread.Sleep(10); + } + + var startLogicalAddress = hlog.GetStartLogicalAddress(page); + var endLogicalAddress = hlog.GetStartLogicalAddress(page + 1); + + var pageFromAddress = 0L; + if (fromAddress > startLogicalAddress && fromAddress < endLogicalAddress) + { + pageFromAddress = hlog.GetOffsetInPage(fromAddress); + } + + var pageUntilAddress = hlog.GetPageSize(); + if (endLogicalAddress > untilAddress) + { + pageUntilAddress = hlog.GetOffsetInPage(untilAddress); + } + + var physicalAddress = hlog.GetPhysicalAddress(startLogicalAddress); + RecoverFromPage(fromAddress, pageFromAddress, pageUntilAddress, + startLogicalAddress, physicalAddress, recoveryInfo.version); + + // OS thread flushes current page and issues a read request if necessary + recoveryStatus.readStatus[pageIndex] = ReadStatus.Pending; + recoveryStatus.flushStatus[pageIndex] = FlushStatus.Pending; + + hlog.AsyncFlushPages(page, 1, AsyncFlushPageCallbackForRecovery, recoveryStatus); + } + + // Assert that all pages have been flushed + var done = false; + while (!done) + { + done = true; + for (long page = startPage; page < endPage; page++) + { + int pageIndex = hlog.GetPageIndexForPage(page); + if (recoveryStatus.flushStatus[pageIndex] == FlushStatus.Pending) + { + done = false; + break; + } + } + } + + + } + + private void RecoverHybridLogFromSnapshotFile( + IndexRecoveryInfo indexRecoveryInfo, + HybridLogRecoveryInfo recoveryInfo) + { + var fileStartAddress = recoveryInfo.flushedLogicalAddress; + var fromAddress = indexRecoveryInfo.startLogicalAddress; + var untilAddress = recoveryInfo.finalLogicalAddress; + + // Compute startPage and endPage + var startPage = hlog.GetPage(fileStartAddress); + var endPage = hlog.GetPage(untilAddress); + if (untilAddress > hlog.GetStartLogicalAddress(endPage)) + { + endPage++; + } + + // By default first page has one extra record + var capacity = hlog.GetCapacityNumPages(); + var recoveryDevice = checkpointManager.GetSnapshotLogDevice(recoveryInfo.guid); + var objectLogRecoveryDevice = checkpointManager.GetSnapshotObjectLogDevice(recoveryInfo.guid); + recoveryDevice.Initialize(hlog.GetSegmentSize()); + objectLogRecoveryDevice.Initialize(hlog.GetSegmentSize()); + var recoveryStatus = new RecoveryStatus(capacity, startPage, 
endPage, untilAddress) + { + recoveryDevice = recoveryDevice, + objectLogRecoveryDevice = objectLogRecoveryDevice, + recoveryDevicePageOffset = startPage + }; + + // Initially issue read request for all pages that can be held in memory + int totalPagesToRead = (int)(endPage - startPage); + int numPagesToReadFirst = Math.Min(capacity, totalPagesToRead); + + hlog.AsyncReadPagesFromDevice(startPage, numPagesToReadFirst, untilAddress, + hlog.AsyncReadPagesCallbackForRecovery, + recoveryStatus, + recoveryStatus.recoveryDevicePageOffset, + recoveryStatus.recoveryDevice, recoveryStatus.objectLogRecoveryDevice); + + + for (long page = startPage; page < endPage; page++) + { + + // Ensure the page is read from file + int pageIndex = hlog.GetPageIndexForPage(page); + while (recoveryStatus.readStatus[pageIndex] == ReadStatus.Pending) + { + Thread.Sleep(10); + } + + // Page at hand + var startLogicalAddress = hlog.GetStartLogicalAddress(page); + var endLogicalAddress = hlog.GetStartLogicalAddress(page + 1); + + // Perform recovery if page in fuzzy portion of the log + if ((fromAddress < endLogicalAddress) && (fromAddress < untilAddress)) + { + /* + * Handling corner-cases: + * ---------------------- + * When fromAddress is in the middle of the page, + * then start recovery only from corresponding offset + * in page. Similarly, if untilAddress falls in the + * middle of the page, perform recovery only until that + * offset. Otherwise, scan the entire page [0, PageSize) + */ + var pageFromAddress = 0L; + if (fromAddress > startLogicalAddress && fromAddress < endLogicalAddress) + { + pageFromAddress = hlog.GetOffsetInPage(fromAddress); + } + + var pageUntilAddress = hlog.GetPageSize(); + if (endLogicalAddress > untilAddress) + { + pageUntilAddress = hlog.GetOffsetInPage(untilAddress); + } + + var physicalAddress = hlog.GetPhysicalAddress(startLogicalAddress); + RecoverFromPage(fromAddress, pageFromAddress, pageUntilAddress, + startLogicalAddress, physicalAddress, recoveryInfo.version); + + } + + // OS thread flushes current page and issues a read request if necessary + recoveryStatus.readStatus[pageIndex] = ReadStatus.Pending; + recoveryStatus.flushStatus[pageIndex] = FlushStatus.Pending; + + // Write back records from snapshot to main hybrid log + hlog.AsyncFlushPages(page, 1, AsyncFlushPageCallbackForRecovery, recoveryStatus); + } + + // Assert and wait until all pages have been flushed + var done = false; + while (!done) + { + done = true; + for (long page = startPage; page < endPage; page++) + { + int pageIndex = hlog.GetPageIndexForPage(page); + if (recoveryStatus.flushStatus[pageIndex] == FlushStatus.Pending) + { + done = false; + break; + } + } + } + + recoveryStatus.recoveryDevice.Close(); + recoveryStatus.objectLogRecoveryDevice.Close(); + } + + private void RecoverFromPage(long startRecoveryAddress, + long fromLogicalAddressInPage, + long untilLogicalAddressInPage, + long pageLogicalAddress, + long pagePhysicalAddress, + int version) + { + var hash = default(long); + var tag = default(ushort); + var pointer = default(long); + var recordStart = default(long); + var bucket = default(HashBucket*); + var entry = default(HashBucketEntry); + var slot = default(int); + + pointer = fromLogicalAddressInPage; + while (pointer < untilLogicalAddressInPage) + { + recordStart = pagePhysicalAddress + pointer; + ref RecordInfo info = ref hlog.GetInfo(recordStart); + + if (info.IsNull()) + { + pointer += RecordInfo.GetLength(); + continue; + } + + if (!info.Invalid) + { + hash = comparer.GetHashCode64(ref 
hlog.GetKey(recordStart)); + tag = (ushort)((ulong)hash >> Constants.kHashTagShift); + + entry = default(HashBucketEntry); + FindOrCreateTag(hash, tag, ref bucket, ref slot, ref entry, hlog.BeginAddress); + + if (info.Version <= version) + { + entry.Address = pageLogicalAddress + pointer; + entry.Tag = tag; + entry.Pending = false; + entry.Tentative = false; + bucket->bucket_entries[slot] = entry.word; + } + else + { + info.Invalid = true; + if (info.PreviousAddress < startRecoveryAddress) + { + entry.Address = info.PreviousAddress; + entry.Tag = tag; + entry.Pending = false; + entry.Tentative = false; + bucket->bucket_entries[slot] = entry.word; + } + } + } + pointer += hlog.GetRecordSize(recordStart); + } + } + + + private void AsyncFlushPageCallbackForRecovery(uint errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + // Set the page status to flushed + var result = (PageAsyncFlushResult)Overlapped.Unpack(overlap).AsyncResult; + + if (Interlocked.Decrement(ref result.count) == 0) + { + int index = hlog.GetPageIndexForPage(result.page); + result.context.flushStatus[index] = FlushStatus.Done; + if (result.page + result.context.capacity < result.context.endPage) + { + long readPage = result.page + result.context.capacity; + if (FoldOverSnapshot) + { + hlog.AsyncReadPagesFromDevice(readPage, 1, result.context.untilAddress, hlog.AsyncReadPagesCallbackForRecovery, result.context); + } + else + { + hlog.AsyncReadPagesFromDevice(readPage, 1, result.context.untilAddress, hlog.AsyncReadPagesCallbackForRecovery, + result.context, + result.context.recoveryDevicePageOffset, + result.context.recoveryDevice, result.context.objectLogRecoveryDevice); + } + } + result.Free(); + } + Overlapped.Free(overlap); + } + } + + public unsafe abstract partial class AllocatorBase : IDisposable + where Key : new() + where Value : new() + { + /// + /// Restore log + /// + /// + /// + /// + public void RestoreHybridLog(long untilAddress, long headAddress, long beginAddress) + { + Debug.Assert(beginAddress <= headAddress); + Debug.Assert(headAddress <= untilAddress); + + // Special cases: we do not load any records into memory + if ( + (beginAddress == untilAddress) || // Empty log + ((headAddress == untilAddress) && (GetOffsetInPage(headAddress) == 0)) // Empty in-memory page + ) + { + if (!IsAllocated(GetPageIndexForAddress(headAddress))) + AllocatePage(GetPageIndexForAddress(headAddress)); + } + else + { + var tailPage = GetPage(untilAddress); + var headPage = GetPage(headAddress); + + var recoveryStatus = new RecoveryStatus(GetCapacityNumPages(), headPage, tailPage, untilAddress); + for (int i = 0; i < recoveryStatus.capacity; i++) + { + recoveryStatus.readStatus[i] = ReadStatus.Done; + } + + var numPages = 0; + for (var page = headPage; page <= tailPage; page++) + { + var pageIndex = GetPageIndexForPage(page); + recoveryStatus.readStatus[pageIndex] = ReadStatus.Pending; + numPages++; + } + + AsyncReadPagesFromDevice(headPage, numPages, untilAddress, AsyncReadPagesCallbackForRecovery, recoveryStatus); + + var done = false; + while (!done) + { + done = true; + for (long page = headPage; page <= tailPage; page++) + { + int pageIndex = GetPageIndexForPage(page); + if (recoveryStatus.readStatus[pageIndex] == ReadStatus.Pending) + { + done = false; + break; + } + } + } + } + + RecoveryReset(untilAddress, headAddress, beginAddress); + } + + internal void AsyncReadPagesCallbackForRecovery(uint 
errorCode, uint numBytes, NativeOverlapped* overlap) + { + if (errorCode != 0) + { + Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode); + } + + // Set the page status to flushed + var result = (PageAsyncReadResult)Overlapped.Unpack(overlap).AsyncResult; + + if (result.freeBuffer1 != null) + { + PopulatePage(result.freeBuffer1.GetValidPointer(), result.freeBuffer1.required_bytes, result.page); + result.freeBuffer1.Return(); + } + int index = GetPageIndexForPage(result.page); + result.context.readStatus[index] = ReadStatus.Done; + Interlocked.MemoryBarrier(); + Overlapped.Free(overlap); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Readme.txt b/ZeroLevel/Services/FASTER/Readme.txt new file mode 100644 index 0000000..724a867 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Readme.txt @@ -0,0 +1 @@ +Copied from https://github.com/microsoft/FASTER \ No newline at end of file diff --git a/ZeroLevel/Services/FASTER/Utilities/AsyncResultTypes.cs b/ZeroLevel/Services/FASTER/Utilities/AsyncResultTypes.cs new file mode 100644 index 0000000..2a4c06b --- /dev/null +++ b/ZeroLevel/Services/FASTER/Utilities/AsyncResultTypes.cs @@ -0,0 +1,90 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#define CALLOC + +using System; +using System.Threading; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; + +namespace FASTER.core +{ + internal struct AsyncGetFromDiskResult : IAsyncResult + { + public TContext context; + + public bool IsCompleted => throw new NotImplementedException(); + + public WaitHandle AsyncWaitHandle => throw new NotImplementedException(); + + public object AsyncState => throw new NotImplementedException(); + + public bool CompletedSynchronously => throw new NotImplementedException(); + } + + internal unsafe struct HashIndexPageAsyncFlushResult : IAsyncResult + { + public int chunkIndex; + + public bool IsCompleted => throw new NotImplementedException(); + + public WaitHandle AsyncWaitHandle => throw new NotImplementedException(); + + public object AsyncState => throw new NotImplementedException(); + + public bool CompletedSynchronously => throw new NotImplementedException(); + } + + internal unsafe struct HashIndexPageAsyncReadResult : IAsyncResult + { + public int chunkIndex; + + public bool IsCompleted => throw new NotImplementedException(); + + public WaitHandle AsyncWaitHandle => throw new NotImplementedException(); + + public object AsyncState => throw new NotImplementedException(); + + public bool CompletedSynchronously => throw new NotImplementedException(); + } + + internal struct OverflowPagesFlushAsyncResult : IAsyncResult + { + public bool IsCompleted => throw new NotImplementedException(); + + public WaitHandle AsyncWaitHandle => throw new NotImplementedException(); + + public object AsyncState => throw new NotImplementedException(); + + public bool CompletedSynchronously => throw new NotImplementedException(); + } + + internal struct OverflowPagesReadAsyncResult : IAsyncResult + { + + public bool IsCompleted => throw new NotImplementedException(); + + public WaitHandle AsyncWaitHandle => throw new NotImplementedException(); + + public object AsyncState => throw new NotImplementedException(); + + public bool CompletedSynchronously => throw new NotImplementedException(); + } + + internal struct CountdownEventAsyncResult : IAsyncResult + { + public CountdownEvent countdown; + public Action action; + + public bool IsCompleted => 
throw new NotImplementedException(); + + public WaitHandle AsyncWaitHandle => throw new NotImplementedException(); + + public object AsyncState => throw new NotImplementedException(); + + public bool CompletedSynchronously => throw new NotImplementedException(); + } +} diff --git a/ZeroLevel/Services/FASTER/Utilities/BufferPool.cs b/ZeroLevel/Services/FASTER/Utilities/BufferPool.cs new file mode 100644 index 0000000..cfcb4eb --- /dev/null +++ b/ZeroLevel/Services/FASTER/Utilities/BufferPool.cs @@ -0,0 +1,224 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using Microsoft.Win32.SafeHandles; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.ComponentModel; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text; +using System.Threading; +using System.Threading.Tasks; + +namespace FASTER.core +{ + /// + /// Sector aligned memory allocator + /// + public unsafe class SectorAlignedMemory + { + /// + /// Actual buffer + /// + public byte[] buffer; + + /// + /// Handle + /// + internal GCHandle handle; + + /// + /// Offset + /// + public int offset; + + /// + /// Aligned pointer + /// + public byte* aligned_pointer; + + /// + /// Valid offset + /// + public int valid_offset; + + /// + /// Required bytes + /// + public int required_bytes; + + /// + /// Available bytes + /// + public int available_bytes; + + internal int level; + internal SectorAlignedBufferPool pool; + + /// + /// Return + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Return() + { + pool.Return(this); + } + + /// + /// Get valid pointer + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public byte* GetValidPointer() + { + return aligned_pointer + valid_offset; + } + + /// + /// ToString + /// + /// + public override string ToString() + { + return string.Format("{0} {1} {2} {3} {4}", (long)aligned_pointer, offset, valid_offset, required_bytes, available_bytes); + } + } + + /// + /// SectorAlignedBufferPool is a pool of memory. + /// Internally, it is organized as an array of concurrent queues where each concurrent + /// queue represents a memory of size in particular range. queue[i] contains memory + /// segments each of size (2^i * sectorSize). + /// + public class SectorAlignedBufferPool + { + /// + /// Disable buffer pool + /// + public static bool Disabled = false; + + private const int levels = 32; + private readonly int recordSize; + private readonly int sectorSize; + private readonly ConcurrentQueue[] queue; + + /// + /// Constructor + /// + /// Record size + /// Sector size + public SectorAlignedBufferPool(int recordSize, int sectorSize) + { + queue = new ConcurrentQueue[levels]; + this.recordSize = recordSize; + this.sectorSize = sectorSize; + } + + /// + /// Return + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Return(SectorAlignedMemory page) + { + Debug.Assert(queue[page.level] != null); + page.available_bytes = 0; + page.required_bytes = 0; + page.valid_offset = 0; + Array.Clear(page.buffer, 0, page.buffer.Length); + if (!Disabled) + queue[page.level].Enqueue(page); + else + { + page.handle.Free(); + page.buffer = null; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int Position(int v) + { + if (v == 1) return 0; + v--; + + int r = 0; // r will be lg(v) + while (true) // unroll for more speed... 
+ { + v = v >> 1; + if (v == 0) break; + r++; + } + return r + 1; + } + + /// + /// Get buffer + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe SectorAlignedMemory Get(int numRecords) + { + int requiredSize = sectorSize + (((numRecords) * recordSize + (sectorSize - 1)) & ~(sectorSize - 1)); + int index = Position(requiredSize / sectorSize); + if (queue[index] == null) + { + var localPool = new ConcurrentQueue(); + Interlocked.CompareExchange(ref queue[index], localPool, null); + } + + if (!Disabled && queue[index].TryDequeue(out SectorAlignedMemory page)) + { + return page; + } + + page = new SectorAlignedMemory + { + level = index, + buffer = new byte[sectorSize * (1 << index)] + }; + page.handle = GCHandle.Alloc(page.buffer, GCHandleType.Pinned); + page.aligned_pointer = (byte*)(((long)page.handle.AddrOfPinnedObject() + (sectorSize - 1)) & ~(sectorSize - 1)); + page.offset = (int) ((long)page.aligned_pointer - (long)page.handle.AddrOfPinnedObject()); + page.pool = this; + return page; + } + + /// + /// Free buffer + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Free() + { + for (int i = 0; i < levels; i++) + { + if (queue[i] == null) continue; + while (queue[i].TryDequeue(out SectorAlignedMemory result)) + { + result.handle.Free(); + result.buffer = null; + } + } + } + + /// + /// Print pool contents + /// + public void Print() + { + for (int i = 0; i < levels; i++) + { + if (queue[i] == null) continue; + foreach (var item in queue[i]) + { + Console.WriteLine(" " + item.ToString()); + } + } + } + } +} diff --git a/ZeroLevel/Services/FASTER/Utilities/FasterEqualityComparer.cs b/ZeroLevel/Services/FASTER/Utilities/FasterEqualityComparer.cs new file mode 100644 index 0000000..1c7f52c --- /dev/null +++ b/ZeroLevel/Services/FASTER/Utilities/FasterEqualityComparer.cs @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System.Collections.Generic; + +namespace FASTER.core +{ + /// + /// Low-performance FASTER equality comparer wrapper around EqualityComparer.Default + /// + /// + internal sealed class FasterEqualityComparer : IFasterEqualityComparer + { + public static readonly FasterEqualityComparer Default = new FasterEqualityComparer(); + + private static readonly EqualityComparer DefaultEC = EqualityComparer.Default; + + public bool Equals(ref T k1, ref T k2) + { + return DefaultEC.Equals(k1, k2); + } + + public long GetHashCode64(ref T k) + { + return Utility.GetHashCode(DefaultEC.GetHashCode(k)); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Utilities/Native32.cs b/ZeroLevel/Services/FASTER/Utilities/Native32.cs new file mode 100644 index 0000000..45972b6 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Utilities/Native32.cs @@ -0,0 +1,332 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +namespace FASTER.core +{ + using System; + using System.Runtime.InteropServices; + using System.Security; + using Microsoft.Win32.SafeHandles; + using System.Threading; + using System.IO; + + /// + /// Interop with WINAPI for file I/O, threading, and NUMA functions. 
+ /// + public static unsafe class Native32 + { + #region Native structs + [StructLayout(LayoutKind.Sequential)] + private struct LUID + { + public uint lp; + public int hp; + } + + [StructLayout(LayoutKind.Sequential)] + private struct LUID_AND_ATTRIBUTES + { + public LUID Luid; + public uint Attributes; + } + + [StructLayout(LayoutKind.Sequential)] + private struct TOKEN_PRIVILEGES + { + public uint PrivilegeCount; + public LUID_AND_ATTRIBUTES Privileges; + } + + [StructLayout(LayoutKind.Sequential)] + private struct MARK_HANDLE_INFO + { + public uint UsnSourceInfo; + public IntPtr VolumeHandle; + public uint HandleInfo; + } + #endregion + + #region io constants and flags + internal const int ERROR_IO_PENDING = 997; + internal const uint GENERIC_READ = 0x80000000; + internal const uint GENERIC_WRITE = 0x40000000; + internal const uint FILE_FLAG_DELETE_ON_CLOSE = 0x04000000; + internal const uint FILE_FLAG_NO_BUFFERING = 0x20000000; + internal const uint FILE_FLAG_OVERLAPPED = 0x40000000; + + internal const uint FILE_SHARE_DELETE = 0x00000004; + #endregion + + #region io functions + + [DllImport("Kernel32.dll", CharSet = CharSet.Unicode, SetLastError = true)] + internal static extern SafeFileHandle CreateFileW( + [In] string lpFileName, + [In] UInt32 dwDesiredAccess, + [In] UInt32 dwShareMode, + [In] IntPtr lpSecurityAttributes, + [In] UInt32 dwCreationDisposition, + [In] UInt32 dwFlagsAndAttributes, + [In] IntPtr hTemplateFile); + + [DllImport("Kernel32.dll", SetLastError = true)] + internal static extern bool ReadFile( + [In] SafeFileHandle hFile, + [Out] IntPtr lpBuffer, + [In] UInt32 nNumberOfBytesToRead, + [Out] out UInt32 lpNumberOfBytesRead, + [In] NativeOverlapped* lpOverlapped); + + [DllImport("Kernel32.dll", SetLastError = true)] + internal static extern bool WriteFile( + [In] SafeFileHandle hFile, + [In] IntPtr lpBuffer, + [In] UInt32 nNumberOfBytesToWrite, + [Out] out UInt32 lpNumberOfBytesWritten, + [In] NativeOverlapped* lpOverlapped); + + + internal enum EMoveMethod : uint + { + Begin = 0, + Current = 1, + End = 2 + } + + [DllImport("kernel32.dll", SetLastError = true)] + internal static extern uint SetFilePointer( + [In] SafeFileHandle hFile, + [In] int lDistanceToMove, + [In, Out] ref int lpDistanceToMoveHigh, + [In] EMoveMethod dwMoveMethod); + + [DllImport("kernel32.dll", SetLastError = true)] + internal static extern bool SetEndOfFile( + [In] SafeFileHandle hFile); + + + [DllImport("kernel32.dll", SetLastError = true, CharSet = CharSet.Auto)] + internal static extern bool GetDiskFreeSpace(string lpRootPathName, + out uint lpSectorsPerCluster, + out uint lpBytesPerSector, + out uint lpNumberOfFreeClusters, + out uint lpTotalNumberOfClusters); + + [DllImport("kernel32.dll", SetLastError = true)] + internal static extern bool DeleteFileW([MarshalAs(UnmanagedType.LPWStr)]string lpFileName); +#endregion + + #region Thread and NUMA functions + [DllImport("kernel32.dll")] + private static extern IntPtr GetCurrentThread(); + [DllImport("kernel32")] + internal static extern uint GetCurrentThreadId(); + [DllImport("kernel32.dll", SetLastError = true)] + private static extern uint GetCurrentProcessorNumber(); + [DllImport("kernel32.dll", SetLastError = true)] + private static extern uint GetActiveProcessorCount(uint count); + [DllImport("kernel32.dll", SetLastError = true)] + private static extern ushort GetActiveProcessorGroupCount(); + [DllImport("kernel32.dll", SetLastError = true)] + private static extern int SetThreadGroupAffinity(IntPtr hThread, ref GROUP_AFFINITY 
GroupAffinity, ref GROUP_AFFINITY PreviousGroupAffinity); + [DllImport("kernel32.dll", SetLastError = true)] + private static extern int GetThreadGroupAffinity(IntPtr hThread, ref GROUP_AFFINITY PreviousGroupAffinity); + + private static readonly uint ALL_PROCESSOR_GROUPS = 0xffff; + + [System.Runtime.InteropServices.StructLayoutAttribute(System.Runtime.InteropServices.LayoutKind.Sequential)] + private struct GROUP_AFFINITY + { + public ulong Mask; + public uint Group; + public uint Reserved1; + public uint Reserved2; + public uint Reserved3; + } + + /// + /// Accepts thread id = 0, 1, 2, ... and sprays them round-robin + /// across all cores (viewed as a flat space). On NUMA machines, + /// this gives us [socket, core] ordering of affinitization. That is, + /// if there are N cores per socket, then thread indices of 0 to N-1 map + /// to the range [socket 0, core 0] to [socket 0, core N-1]. + /// + /// Index of thread (from 0 onwards) + public static void AffinitizeThreadRoundRobin(uint threadIdx) + { + uint nrOfProcessors = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); + ushort nrOfProcessorGroups = GetActiveProcessorGroupCount(); + uint nrOfProcsPerGroup = nrOfProcessors / nrOfProcessorGroups; + + GROUP_AFFINITY groupAffinityThread = default(GROUP_AFFINITY); + GROUP_AFFINITY oldAffinityThread = default(GROUP_AFFINITY); + + IntPtr thread = GetCurrentThread(); + GetThreadGroupAffinity(thread, ref groupAffinityThread); + + threadIdx = threadIdx % nrOfProcessors; + + groupAffinityThread.Mask = (ulong)1L << ((int)(threadIdx % (int)nrOfProcsPerGroup)); + groupAffinityThread.Group = (uint)(threadIdx / nrOfProcsPerGroup); + + if (SetThreadGroupAffinity(thread, ref groupAffinityThread, ref oldAffinityThread) == 0) + { + throw new Exception("Unable to affinitize thread"); + } + } + + /// + /// Accepts thread id = 0, 1, 2, ... and sprays them round-robin + /// across all cores (viewed as a flat space). On NUMA machines, + /// this gives us [core, socket] ordering of affinitization. That is, + /// if there are N cores per socket, then thread indices of 0 to N-1 map + /// to the range [socket 0, core 0] to [socket N-1, core 0]. 
+ /// + /// Index of thread (from 0 onwards) + /// Number of NUMA sockets + public static void AffinitizeThreadShardedNuma(uint threadIdx, ushort nrOfProcessorGroups) + { + uint nrOfProcessors = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); + uint nrOfProcsPerGroup = nrOfProcessors / nrOfProcessorGroups; + + threadIdx = nrOfProcsPerGroup * (threadIdx % nrOfProcessorGroups) + (threadIdx / nrOfProcessorGroups); + AffinitizeThreadRoundRobin(threadIdx); + return; + } + #endregion + + #region Advanced file ops + [DllImport("advapi32.dll", SetLastError = true)] + private static extern bool LookupPrivilegeValue(string lpSystemName, string lpName, ref LUID lpLuid); + + [DllImport("kernel32.dll", SetLastError = true)] + private static extern IntPtr GetCurrentProcess(); + + [DllImport("advapi32", SetLastError = true)] + private static extern bool OpenProcessToken(IntPtr ProcessHandle, uint DesiredAccess, out IntPtr TokenHandle); + + [DllImport("advapi32.dll", SetLastError = true)] + private static extern bool AdjustTokenPrivileges(IntPtr tokenhandle, int disableprivs, ref TOKEN_PRIVILEGES Newstate, int BufferLengthInBytes, int PreviousState, int ReturnLengthInBytes); + + [DllImport("kernel32.dll", SetLastError = true)] + private static extern bool CloseHandle(IntPtr hObject); + + [DllImport("Kernel32.dll", SetLastError = true)] + private static extern bool DeviceIoControl(SafeFileHandle hDevice, uint IoControlCode, void* InBuffer, int nInBufferSize, IntPtr OutBuffer, int nOutBufferSize, ref uint pBytesReturned, IntPtr Overlapped); + + [DllImport("kernel32.dll", SetLastError = true)] + private static extern bool SetFilePointerEx(SafeFileHandle hFile, long liDistanceToMove, out long lpNewFilePointer, uint dwMoveMethod); + + [DllImport("kernel32.dll", SetLastError = true)] + private static extern bool SetFileValidData(SafeFileHandle hFile, long ValidDataLength); + + [DllImport("kernel32.dll", SetLastError = true)] + private static extern SafeFileHandle CreateFile(string filename, uint access, uint share, IntPtr securityAttributes, uint creationDisposition, uint flagsAndAttributes, IntPtr templateFile); + + /// + /// Enable privilege for process + /// + /// + public static bool EnableProcessPrivileges() + { +#if DOTNETCORE + if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + return false; +#endif + + TOKEN_PRIVILEGES token_privileges = default(TOKEN_PRIVILEGES); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges.Attributes = 0x2; + + if (!LookupPrivilegeValue(null, "SeManageVolumePrivilege", + ref token_privileges.Privileges.Luid)) return false; + + if (!OpenProcessToken(GetCurrentProcess(), 0x20, out IntPtr token)) return false; + + if (!AdjustTokenPrivileges(token, 0, ref token_privileges, 0, 0, 0)) return false; + if (Marshal.GetLastWin32Error() != 0) return false; + CloseHandle(token); + return true; + } + + private static uint CTL_CODE(uint DeviceType, uint Function, uint Method, uint Access) + { + return (((DeviceType) << 16) | ((Access) << 14) | ((Function) << 2) | (Method)); + } + + internal static bool EnableVolumePrivileges(string filename, SafeFileHandle handle) + { +#if DOTNETCORE + if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + return false; +#endif + + string volume_string = "\\\\.\\" + filename.Substring(0, 2); + + uint fileCreation = unchecked((uint)FileMode.Open); + + SafeFileHandle volume_handle = CreateFile(volume_string, 0, 0, IntPtr.Zero, fileCreation, + 0x80, IntPtr.Zero); + if (volume_handle == null) + { + return false; + } + + 
MARK_HANDLE_INFO mhi; + mhi.UsnSourceInfo = 0x1; + mhi.VolumeHandle = volume_handle.DangerousGetHandle(); + mhi.HandleInfo = 0x1; + + uint bytes_returned = 0; + bool result = DeviceIoControl(handle, CTL_CODE(0x9, 63, 0, 0), + (void*)&mhi, sizeof(MARK_HANDLE_INFO), IntPtr.Zero, + 0, ref bytes_returned, IntPtr.Zero); + + if (!result) + { + return false; + } + + volume_handle.Close(); + return true; + } + + /// + /// Set file size + /// + /// + /// + /// + public static bool SetFileSize(SafeFileHandle file_handle, long file_size) + { +#if DOTNETCORE + if (!RuntimeInformation.IsOSPlatform(OSPlatform.Windows)) + return false; +#endif + + if (!SetFilePointerEx(file_handle, file_size, out long newFilePtr, 0)) + { + return false; + } + + // Set a fixed file length + if (!SetEndOfFile(file_handle)) + { + return false; + } + + if (!SetFileValidData(file_handle, file_size)) + { + return false; + } + + return true; + } + + internal static int MakeHRFromErrorCode(int errorCode) + { + return unchecked(((int)0x80070000) | errorCode); + } + #endregion + } +} diff --git a/ZeroLevel/Services/FASTER/Utilities/PageAsyncResultTypes.cs b/ZeroLevel/Services/FASTER/Utilities/PageAsyncResultTypes.cs new file mode 100644 index 0000000..eb349ad --- /dev/null +++ b/ZeroLevel/Services/FASTER/Utilities/PageAsyncResultTypes.cs @@ -0,0 +1,140 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +#define CALLOC + +using System; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Result of async page read + /// + /// + public class PageAsyncReadResult : IAsyncResult + { + internal long page; + internal TContext context; + internal CountdownEvent handle; + internal SectorAlignedMemory freeBuffer1; + internal SectorAlignedMemory freeBuffer2; + internal IOCompletionCallback callback; + internal IDevice objlogDevice; + internal object frame; + internal CancellationTokenSource cts; + + /* Used for iteration */ + internal long resumePtr; + internal long untilPtr; + internal long maxPtr; + + /// + /// + /// + public bool IsCompleted => throw new NotImplementedException(); + + /// + /// + /// + public WaitHandle AsyncWaitHandle => throw new NotImplementedException(); + + /// + /// + /// + public object AsyncState => throw new NotImplementedException(); + + /// + /// + /// + public bool CompletedSynchronously => throw new NotImplementedException(); + + /// + /// Free + /// + public void Free() + { + if (freeBuffer1 != null) + { + freeBuffer1.Return(); + freeBuffer1 = null; + } + + if (freeBuffer2 != null) + { + freeBuffer2.Return(); + freeBuffer2 = null; + } + } + } + + /// + /// Page async flush result + /// + /// + public class PageAsyncFlushResult : IAsyncResult + { + /// + /// Page + /// + public long page; + /// + /// Context + /// + public TContext context; + /// + /// Count + /// + public int count; + + internal bool partial; + internal long fromAddress; + internal long untilAddress; + internal CountdownEvent handle; + internal IDevice objlogDevice; + internal SectorAlignedMemory freeBuffer1; + internal SectorAlignedMemory freeBuffer2; + internal AutoResetEvent done; + + /// + /// + /// + public bool IsCompleted => throw new NotImplementedException(); + + /// + /// + /// + public WaitHandle AsyncWaitHandle => throw new NotImplementedException(); + + /// + /// + /// + public object AsyncState => throw new NotImplementedException(); + + /// + /// + /// + public bool CompletedSynchronously => throw new NotImplementedException(); + + /// + /// Free + /// + public 
void Free() + { + if (freeBuffer1 != null) + { + freeBuffer1.Return(); + freeBuffer1 = null; + } + if (freeBuffer2 != null) + { + freeBuffer2.Return(); + freeBuffer2 = null; + } + if (handle != null) + { + handle.Signal(); + } + } + } +} diff --git a/ZeroLevel/Services/FASTER/Utilities/SafeConcurrentDictionary.cs b/ZeroLevel/Services/FASTER/Utilities/SafeConcurrentDictionary.cs new file mode 100644 index 0000000..e9513af --- /dev/null +++ b/ZeroLevel/Services/FASTER/Utilities/SafeConcurrentDictionary.cs @@ -0,0 +1,232 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +using System; +using System.Collections; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Runtime.CompilerServices; + +namespace FASTER.core +{ + /// + /// A dictionary that supports concurrency with similar interface to .NET's ConcurrentDictionary. + /// However, this dictionary changes the implementation of AddOrUpdate and GetOrAdd functions to + /// guarantee atomicity per-key for factory lambdas. + /// + /// Type of keys in the dictionary + /// Type of values in the dictionary + internal sealed class SafeConcurrentDictionary : IEnumerable> + { + private readonly ConcurrentDictionary dictionary = new ConcurrentDictionary(); + + private readonly ConcurrentDictionary keyLocks = new ConcurrentDictionary(); + + /// + /// Returns the count of the dictionary. + /// + public int Count + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get + { + return dictionary.Count; + } + } + + /// + /// Returns whether or not the dictionary is empty. + /// + public bool IsEmpty + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get + { + return dictionary.IsEmpty; + } + } + + /// + /// Gets or sets the value associated with a key. + /// + public TValue this[TKey key] + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get + { + return dictionary[key]; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + set + { + dictionary[key] = value; + } + } + + /// + /// Returns a collection of the keys in the dictionary. + /// + public ICollection Keys + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get + { + return dictionary.Keys; + } + } + + /// + /// Returns a collection of the values in the dictionary. + /// + public ICollection Values + { + [MethodImpl(MethodImplOptions.AggressiveInlining)] + get + { + return dictionary.Values; + } + } + + /// + /// Adds or updates a key/value pair to the dictionary. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public TValue AddOrUpdate(TKey key, Func addValueFactory, Func updateValueFactory) + { + lock (GetLock(key)) + { + return dictionary.AddOrUpdate(key, addValueFactory, updateValueFactory); + } + } + + /// + /// Adds or updates a key/value pair to the dictionary. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public TValue AddOrUpdate(TKey key, TValue addValue, Func updateValueFactory) + { + lock (GetLock(key)) + { + return dictionary.AddOrUpdate(key, addValue, updateValueFactory); + } + } + + /// + /// Adds a key/value pair to the dictionary if it does not exist. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public TValue GetOrAdd(TKey key, Func valueFactory) + { + if (dictionary.TryGetValue(key, out TValue value)) + { + return value; + } + lock (GetLock(key)) + { + return dictionary.GetOrAdd(key, valueFactory); + } + } + + /// + /// Adds a key/value pair to the dictionary if it does not exist. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public TValue GetOrAdd(TKey key, TValue value) + { + return dictionary.GetOrAdd(key, value); + } + + /// + /// Clears the dictionary. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Clear() + { + dictionary.Clear(); + keyLocks.Clear(); + } + + /// + /// Returns whether or not the dictionary contains the specified key. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ContainsKey(TKey key) + { + return dictionary.ContainsKey(key); + } + + /// + /// Returns an enumerator of the elements in the dictionary. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public IEnumerator> GetEnumerator() + { + return dictionary.GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + /// + /// Copies the key/value pairs to a new array. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public KeyValuePair[] ToArray() + { + return dictionary.ToArray(); + } + + /// + /// Attempts to add the specified key/value to the dictionary if it does not exist. + /// Returns true or false depending on if the value was added or not, respectively. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryAdd(TKey key, TValue value) + { + return dictionary.TryAdd(key, value); + } + + /// + /// Attempts to get the value for the specified key. + /// Returns true if the key was in the dictionary or false otherwise. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryGetValue(TKey key, out TValue value) + { + return dictionary.TryGetValue(key, out value); + } + + /// + /// Attempts to remove the value for the specified key. + /// Returns true if the key was in the dictionary or false otherwise. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryRemove(TKey key, out TValue value) + { + return dictionary.TryRemove(key, out value); + } + + /// + /// Compares the existing value for the specified key with a specified value, + /// and updates it if and only if it is a match. Returns true is updated or + /// false otherwise. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool TryUpdate(TKey key, TValue newValue, TValue comparisonValue) + { + return dictionary.TryUpdate(key, newValue, comparisonValue); + } + + /// + /// Retrieves lock associated with a key (creating it if it does not exist). + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private object GetLock(TKey key) + { + return keyLocks.GetOrAdd(key, v => new object()); + } + } +} diff --git a/ZeroLevel/Services/FASTER/Utilities/StateTransitions.cs b/ZeroLevel/Services/FASTER/Utilities/StateTransitions.cs new file mode 100644 index 0000000..60ba27a --- /dev/null +++ b/ZeroLevel/Services/FASTER/Utilities/StateTransitions.cs @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
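(Editorial sketch, not part of the patch.) The point of SafeConcurrentDictionary over a plain ConcurrentDictionary is the per-key lock taken in AddOrUpdate/GetOrAdd: the value factory for a given key runs at most once even under contention, which matters when the factory has side effects. The generic parameters appear stripped by this diff (the class is presumably SafeConcurrentDictionary<TKey, TValue>), and the type is internal to FASTER.core, so a snippet like the following would have to live inside that assembly; the names below are illustrative assumptions.

using FASTER.core;

internal static class SafeDictionaryUsage
{
    private static readonly SafeConcurrentDictionary<string, byte[]> cache =
        new SafeConcurrentDictionary<string, byte[]>();

    internal static byte[] GetSegment(string key)
    {
        // GetOrAdd takes the per-key lock before delegating to the inner dictionary,
        // so even if two threads race on the same key this allocation runs only once.
        return cache.GetOrAdd(key, k => new byte[4096]);
    }
}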
+ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Text; +using System.Threading; +using System.Threading.Tasks; + +namespace FASTER.core +{ + internal enum ResizeOperationStatus : int { IN_PROGRESS, DONE }; + + [StructLayout(LayoutKind.Explicit, Size = 8)] + internal unsafe struct ResizeInfo + { + [FieldOffset(0)] + public ResizeOperationStatus status; + + [FieldOffset(4)] + public int version; + + [FieldOffset(0)] + public long word; + } + + internal enum Phase : int { + PREP_INDEX_CHECKPOINT, INDEX_CHECKPOINT, + PREPARE, IN_PROGRESS, + WAIT_PENDING, WAIT_FLUSH, + REST, + PERSISTENCE_CALLBACK, + GC, + PREPARE_GROW, IN_PROGRESS_GROW, + INTERMEDIATE, + }; + + [StructLayout(LayoutKind.Explicit, Size = 8)] + internal unsafe struct SystemState + { + [FieldOffset(0)] + public Phase phase; + + [FieldOffset(4)] + public int version; + + [FieldOffset(0)] + public long word; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static SystemState Copy(ref SystemState other) + { + var info = default(SystemState); + info.word = other.word; + return info; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static SystemState Make(Phase status, int version) + { + var info = default(SystemState); + info.phase = status; + info.version = version; + return info; + } + + } + +} diff --git a/ZeroLevel/Services/FASTER/Utilities/Status.cs b/ZeroLevel/Services/FASTER/Utilities/Status.cs new file mode 100644 index 0000000..c48a024 --- /dev/null +++ b/ZeroLevel/Services/FASTER/Utilities/Status.cs @@ -0,0 +1,45 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. + +// ********************************************************************* +// Copyright (C) Microsoft. All rights reserved. +// +// @File: +// +// @Owner: +// @Test: +// +// Purpose: +// +// Notes: +// +// @EndHeader@ +// ********************************************************************* +namespace FASTER.core +{ + + /// + /// Status result of operation on FASTER + /// + public enum Status + { + /// + /// For Read and RMW, item being read was found, and + /// the operation completed successfully + /// For Upsert, item was upserted successfully + /// + OK, + /// + /// For Read and RMW, item being read was not found + /// + NOTFOUND, + /// + /// Operation went pending (async) + /// + PENDING, + /// + /// Operation resulted in some error + /// + ERROR + } +} diff --git a/ZeroLevel/Services/FASTER/Utilities/Utility.cs b/ZeroLevel/Services/FASTER/Utilities/Utility.cs new file mode 100644 index 0000000..c250d1b --- /dev/null +++ b/ZeroLevel/Services/FASTER/Utilities/Utility.cs @@ -0,0 +1,296 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT license. 
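(Editorial sketch, not part of the patch.) The [FieldOffset] layout in SystemState above is the load-bearing detail: phase and version overlap the single 64-bit word, so a whole state transition can be read or published with one atomic operation. A hedged sketch of that pattern follows; the _systemState field and TryStartPrepare method are stand-ins, and the types involved are internal to FASTER.core.

using System.Threading;
using FASTER.core;

internal sealed class StateMachineSketch
{
    private SystemState _systemState = SystemState.Make(Phase.REST, 1);

    internal bool TryStartPrepare()
    {
        var expected = SystemState.Copy(ref _systemState);
        if (expected.phase != Phase.REST) return false;

        var next = SystemState.Make(Phase.PREPARE, expected.version);

        // Because phase and version alias 'word', a single CAS publishes the whole transition.
        return Interlocked.CompareExchange(ref _systemState.word, next.word, expected.word)
               == expected.word;
    }
}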
+ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using System.Runtime.InteropServices; +using System.Security; +using System.IO; +using System.Runtime.CompilerServices; +using Microsoft.Win32.SafeHandles; +using System.Diagnostics; +using System.Threading; + +namespace FASTER.core +{ + /// + /// Empty type + /// + public struct Empty + { + /// + /// Default + /// + public static readonly Empty Default = default(Empty); + } + + /// + /// FASTER utility functions + /// + public static class Utility + { + /// + /// Get size of type + /// + /// + /// + /// + internal static unsafe int GetSize(this T value) + { + T[] arr = new T[2]; + return (int)((long)Unsafe.AsPointer(ref arr[1]) - (long)Unsafe.AsPointer(ref arr[0])); + } + + /// + /// Is type blittable + /// + /// + /// + internal static bool IsBlittable() + { + if (default(T) == null) + return false; + + try + { + var tmp = new T[1]; + var h = GCHandle.Alloc(tmp, GCHandleType.Pinned); + h.Free(); + } + catch (Exception) + { + return false; + } + return true; + } + + /// + /// Check if two byte arrays of given length are equal + /// + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public unsafe static bool IsEqual(byte* src, byte* dst, int length) + { + for (int i = 0; i < length; i++) + { + if (*(src + i) != *(dst + i)) + { + return false; + } + } + return true; + } + + /// + /// Copy numBytes bytes from src to dest + /// + /// + /// + /// + public unsafe static void Copy(byte* src, byte* dest, int numBytes) + { + for(int i = 0; i < numBytes; i++) + { + *(dest + i) = *(src + i); + } + } + + /// + /// Get 64-bit hash code for a long value + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static long GetHashCode(long input) + { + long local_rand = input; + long local_rand_hash = 8; + + local_rand_hash = 40343 * local_rand_hash + ((local_rand) & 0xFFFF); + local_rand_hash = 40343 * local_rand_hash + ((local_rand >> 16) & 0xFFFF); + local_rand_hash = 40343 * local_rand_hash + ((local_rand >> 32) & 0xFFFF); + local_rand_hash = 40343 * local_rand_hash + (local_rand >> 48); + local_rand_hash = 40343 * local_rand_hash; + + return (long)Rotr64((ulong)local_rand_hash, 45); + } + + /// + /// Get 64-bit hash code for a byte array + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe long HashBytes(byte* pbString, int len) + { + const long magicno = 40343; + char* pwString = (char*)pbString; + int cbBuf = len / 2; + ulong hashState = (ulong)len; + + for (int i = 0; i < cbBuf; i++, pwString++) + hashState = magicno * hashState + *pwString; + + if ((len & 1) > 0) + { + byte* pC = (byte*)pwString; + hashState = magicno * hashState + *pC; + } + + return (long)Rotr64(magicno * hashState, 4); + } + + /// + /// Compute XOR of all provided bytes + /// + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe ulong XorBytes(byte* src, int length) + { + ulong result = 0; + byte* curr = src; + byte* end = src + length; + while (curr + 4 * sizeof(ulong) <= end) + { + result ^= *(ulong*)curr; + result ^= *(1 + (ulong*)curr); + result ^= *(2 + (ulong*)curr); + result ^= *(3 + (ulong*)curr); + curr += 4 * sizeof(ulong); + } + while (curr + sizeof(ulong) <= end) + { + result ^= *(ulong*)curr; + curr += sizeof(ulong); + } + while (curr + 1 <= end) + { + result ^= *curr; + curr++; + } + + return result; + } + + + 
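(Editorial sketch, not part of the patch.) GetHashCode and HashBytes above are the hash primitives FASTER feeds its hash index. A typical consumer is a custom key comparer implementing the IFasterEqualityComparer interface defined elsewhere in this patch, which avoids falling back to the low-performance FasterEqualityComparer wrapper described earlier. The comparer name below is an illustrative assumption.

using FASTER.core;

public sealed class LongKeyComparer : IFasterEqualityComparer<long>
{
    // Mix the raw key through FASTER's 64-bit hash so nearby keys spread across buckets.
    public long GetHashCode64(ref long key) => Utility.GetHashCode(key);

    public bool Equals(ref long left, ref long right) => left == right;
}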
[MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static ulong Rotr64(ulong x, int n) + { + return (((x) >> n) | ((x) << (64 - n))); + } + + /// + /// Is power of 2 + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool IsPowerOfTwo(long x) + { + return (x > 0) && ((x & (x - 1)) == 0); + } + + internal static readonly int[] MultiplyDeBruijnBitPosition2 = new int[32] + { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; + + /// + /// Get log base 2 + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int GetLogBase2(int x) + { + return MultiplyDeBruijnBitPosition2[(uint)(x * 0x077CB531U) >> 27]; + } + + /// + /// Get log base 2 + /// + /// + /// + public static int GetLogBase2(ulong value) + { + int i; + for (i = -1; value != 0; i++) + value >>= 1; + + return (i == -1) ? 0 : i; + } + + /// + /// Check if power of two + /// + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static bool Is32Bit(long x) + { + return ((ulong)x < 4294967295ul); + } + + + /// + /// A 32-bit murmur3 implementation. + /// + /// + /// + internal static int Murmur3(int h) + { + uint a = (uint)h; + a ^= a >> 16; + a *= 0x85ebca6b; + a ^= a >> 13; + a *= 0xc2b2ae35; + a ^= a >> 16; + return (int)a; + } + + /// + /// Updates the variable to newValue only if the current value is smaller than the new value. + /// + /// The variable to possibly replace + /// The value that replaces the variable if successful + /// The orignal value in the variable + /// if oldValue less than newValue + public static bool MonotonicUpdate(ref long variable, long newValue, out long oldValue) + { + do + { + oldValue = variable; + if (oldValue >= newValue) return false; + } while (Interlocked.CompareExchange(ref variable, newValue, oldValue) != oldValue); + return true; + } + + /// + /// Updates the variable to newValue only if the current value is smaller than the new value. 
+ /// + /// The variable to possibly replace + /// The value that replaces the variable if successful + /// The orignal value in the variable + /// if oldValue less than or equal to newValue + public static bool MonotonicUpdate(ref int variable, int newValue, out int oldValue) + { + do + { + oldValue = variable; + if (oldValue >= newValue) return false; + } while (Interlocked.CompareExchange(ref variable, newValue, oldValue) != oldValue); + return true; + } + + } +} diff --git a/ZeroLevel/Services/Microservices/Dump/DumpStorage.cs b/ZeroLevel/Services/Microservices/Dump/DumpStorage.cs new file mode 100644 index 0000000..7ea1421 --- /dev/null +++ b/ZeroLevel/Services/Microservices/Dump/DumpStorage.cs @@ -0,0 +1,54 @@ +using FASTER.core; +using System.Collections.Generic; +using System.IO; +using System.Threading.Tasks; +using ZeroLevel.Services.FileSystem; +using ZeroLevel.Services.Serialization; + +namespace ZeroLevel.Services.Microservices.Dump +{ + public class DumpStorage + { + IDevice device; + FasterLog log; + + public DumpStorage() + { + var folder = Path.Combine(Configuration.BaseDirectory, "dump"); + if (false == Directory.Exists(folder)) + { + Directory.CreateDirectory(folder); + } + device = Devices.CreateLogDevice(Path.Combine(folder, $"dump.log"), + true, true, -1, false); + + log = new FasterLog(new FasterLogSettings { LogDevice = device }); + } + + public void Dump(T value) + { + var packet = MessageSerializer.SerializeCompatible(value); + while (!log.TryEnqueue(packet, out _)) ; + log.Commit(); + } + + public async Task DumpAsync(T value) + { + var packet = MessageSerializer.SerializeCompatible(value); + await log.EnqueueAndWaitForCommitAsync(packet); + } + + public IEnumerable ReadAndTruncate() + { + byte[] result; + using (var iter = log.Scan(log.BeginAddress, log.TailAddress)) + { + while (iter.GetNext(out result, out int length)) + { + yield return MessageSerializer.DeserializeCompatible(result); + } + log.TruncateUntil(iter.NextAddress); + } + } + } +} diff --git a/ZeroLevel/ZeroLevel.csproj b/ZeroLevel/ZeroLevel.csproj index c3ce10e..4da07a8 100644 --- a/ZeroLevel/ZeroLevel.csproj +++ b/ZeroLevel/ZeroLevel.csproj @@ -36,7 +36,11 @@ + + + +
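(Editorial sketch, not part of the patch.) A hedged usage example for the new DumpStorage service: the generic parameters look stripped in this diff, so the sketch assumes the class is declared as DumpStorage<T>, and that MessageSerializer.SerializeCompatible accepts the chosen T (a plain string here). Dump blocks until the record is committed, DumpAsync awaits the FasterLog commit, and ReadAndTruncate drains committed records and then truncates the log up to the iterator's position.

using System;
using System.Threading.Tasks;
using ZeroLevel.Services.Microservices.Dump;

public static class DumpStorageUsage
{
    public static async Task Run()
    {
        var storage = new DumpStorage<string>();

        // Synchronous path: spins until the record is enqueued, then commits the log.
        storage.Dump("first record");

        // Asynchronous path: enqueues and awaits the commit without blocking the caller.
        await storage.DumpAsync("second record");

        // Drains every committed record and truncates the log behind the iterator.
        foreach (var record in storage.ReadAndTruncate())
        {
            Console.WriteLine(record);
        }
    }
}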