// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT license.
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Threading;
namespace FASTER.core
{
internal enum PMMFlushStatus : int { Flushed, InProgress };
internal enum PMMCloseStatus : int { Closed, Open };
[StructLayout(LayoutKind.Explicit)]
internal struct FullPageStatus
{
[FieldOffset(0)]
public long LastFlushedUntilAddress;
[FieldOffset(8)]
public long LastClosedUntilAddress;
}
[StructLayout(LayoutKind.Explicit)]
internal struct PageOffset
{
[FieldOffset(0)]
public int Offset;
[FieldOffset(4)]
public int Page;
[FieldOffset(0)]
public long PageAndOffset;
}
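// Note: PageOffset overlays Offset (low 32 bits) and Page (high 32 bits) on the
// single long PageAndOffset, so one Interlocked.Add on PageAndOffset advances the
// tail offset atomically; TryAllocate detects offset overflow past PageSize and
// repairs Page/Offset explicitly. Illustrative sketch (hypothetical values):
//   var po = new PageOffset { Page = 3, Offset = 100 };
//   Interlocked.Add(ref po.PageAndOffset, 24); // now po.Page == 3, po.Offset == 124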
/// <summary>
/// Base class for hybrid log memory allocator
/// </summary>
/// <typeparam name="Key"></typeparam>
/// <typeparam name="Value"></typeparam>
public unsafe abstract partial class AllocatorBase<Key, Value> : IDisposable
where Key : new()
where Value : new()
{
/// <summary>
/// Epoch information
/// </summary>
protected readonly LightEpoch epoch;
private readonly bool ownedEpoch;
/// <summary>
/// Comparer
/// </summary>
protected readonly IFasterEqualityComparer<Key> comparer;
#region Protected size definitions
/// <summary>
/// Buffer size
/// </summary>
internal readonly int BufferSize;
/// <summary>
/// Log page size (bits)
/// </summary>
internal readonly int LogPageSizeBits;
/// <summary>
/// Page size
/// </summary>
internal readonly int PageSize;
/// <summary>
/// Page size mask
/// </summary>
internal readonly int PageSizeMask;
/// <summary>
/// Buffer size mask
/// </summary>
protected readonly int BufferSizeMask;
/// <summary>
/// Aligned page size in bytes
/// </summary>
protected readonly int AlignedPageSizeBytes;
/// <summary>
/// Total hybrid log size (bits)
/// </summary>
protected readonly int LogTotalSizeBits;
/// <summary>
/// Total hybrid log size (bytes)
/// </summary>
protected readonly long LogTotalSizeBytes;
/// <summary>
/// Segment size in bits
/// </summary>
protected readonly int LogSegmentSizeBits;
/// <summary>
/// Segment size
/// </summary>
protected readonly long SegmentSize;
/// <summary>
/// Segment buffer size
/// </summary>
protected readonly int SegmentBufferSize;
/// <summary>
/// HeadOffset lag (from tail)
/// </summary>
protected readonly bool HeadOffsetExtraLag;
/// <summary>
/// HeadOffset lag address
/// </summary>
protected readonly long HeadOffsetLagAddress;
/// <summary>
/// Log mutable fraction
/// </summary>
protected readonly double LogMutableFraction;
/// <summary>
/// ReadOnlyOffset lag address (from tail)
/// </summary>
protected readonly long ReadOnlyLagAddress;
#endregion
#region Public addresses
/// <summary>
/// Read-only address
/// </summary>
public long ReadOnlyAddress;
/// <summary>
/// Safe read-only address
/// </summary>
public long SafeReadOnlyAddress;
/// <summary>
/// Head address
/// </summary>
public long HeadAddress;
/// <summary>
/// Safe head address
/// </summary>
public long SafeHeadAddress;
/// <summary>
/// Flushed until address
/// </summary>
public long FlushedUntilAddress;
/// <summary>
/// Closed until address
/// </summary>
public long ClosedUntilAddress;
/// <summary>
/// Begin address
/// </summary>
public long BeginAddress;
#endregion
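// In steady-state operation these addresses maintain the ordering
//   BeginAddress <= ClosedUntilAddress <= SafeHeadAddress <= HeadAddress
//     <= FlushedUntilAddress <= SafeReadOnlyAddress <= ReadOnlyAddress <= tail
// (recovery and ShiftBeginAddress may adjust several of them together). In
// particular, pages are flushed before they are closed/evicted: HeadAddress is
// only shifted up to FlushedUntilAddress, and OnPagesClosed checks that
// ClosedUntilAddress never passes FlushedUntilAddress.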
#region Protected device info
/// <summary>
/// Device
/// </summary>
protected readonly IDevice device;
/// <summary>
/// Sector size
/// </summary>
protected readonly int sectorSize;
#endregion
#region Private page metadata
// Array that indicates the status of each buffer page
internal readonly FullPageStatus[] PageStatusIndicator;
internal readonly PendingFlushList[] PendingFlush;
/// <summary>
/// Global address of the current tail (next element to be allocated from the circular buffer)
/// </summary>
private PageOffset TailPageOffset;
/// <summary>
/// Number of pending reads
/// </summary>
private int numPendingReads = 0;
#endregion
/// <summary>
/// Buffer pool
/// </summary>
protected SectorAlignedBufferPool bufferPool;
/// <summary>
/// Read cache
/// </summary>
protected readonly bool ReadCache = false;
/// <summary>
/// Read cache eviction callback
/// </summary>
protected readonly Action<long, long> EvictCallback = null;
/// <summary>
/// Flush callback
/// </summary>
protected readonly Action<CommitInfo> FlushCallback = null;
/// <summary>
/// Error handling
/// </summary>
private readonly ErrorList errorList = new ErrorList();
/// <summary>
/// Observer for records entering read-only region
/// </summary>
internal IObserver<IFasterScanIterator<Key, Value>> OnReadOnlyObserver;
#region Abstract methods
/// <summary>
/// Initialize
/// </summary>
public abstract void Initialize();
/// <summary>
/// Get start logical address
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public abstract long GetStartLogicalAddress(long page);
/// <summary>
/// Get first valid logical address
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public abstract long GetFirstValidLogicalAddress(long page);
/// <summary>
/// Get physical address
/// </summary>
/// <param name="newLogicalAddress"></param>
/// <returns></returns>
public abstract long GetPhysicalAddress(long newLogicalAddress);
/// <summary>
/// Get address info
/// </summary>
/// <param name="physicalAddress"></param>
/// <returns></returns>
public abstract ref RecordInfo GetInfo(long physicalAddress);
/// <summary>
/// Get info from byte pointer
/// </summary>
/// <param name="ptr"></param>
/// <returns></returns>
public abstract ref RecordInfo GetInfoFromBytePointer(byte* ptr);
/// <summary>
/// Get key
/// </summary>
/// <param name="physicalAddress"></param>
/// <returns></returns>
public abstract ref Key GetKey(long physicalAddress);
/// <summary>
/// Get value
/// </summary>
/// <param name="physicalAddress"></param>
/// <returns></returns>
public abstract ref Value GetValue(long physicalAddress);
/// <summary>
/// Get address info for key
/// </summary>
/// <param name="physicalAddress"></param>
/// <returns></returns>
public abstract AddressInfo* GetKeyAddressInfo(long physicalAddress);
/// <summary>
/// Get address info for value
/// </summary>
/// <param name="physicalAddress"></param>
/// <returns></returns>
public abstract AddressInfo* GetValueAddressInfo(long physicalAddress);
/// <summary>
/// Get record size
/// </summary>
/// <param name="physicalAddress"></param>
/// <returns></returns>
public abstract int GetRecordSize(long physicalAddress);
/// <summary>
/// Get number of bytes required
/// </summary>
/// <param name="physicalAddress"></param>
/// <param name="availableBytes"></param>
/// <returns></returns>
public virtual int GetRequiredRecordSize(long physicalAddress, int availableBytes) => GetAverageRecordSize();
/// <summary>
/// Get average record size
/// </summary>
/// <returns></returns>
public abstract int GetAverageRecordSize();
/// <summary>
/// Get initial record size
/// </summary>
/// <typeparam name="Input"></typeparam>
/// <param name="key"></param>
/// <param name="input"></param>
/// <returns></returns>
public abstract int GetInitialRecordSize<Input>(ref Key key, ref Input input);
/// <summary>
/// Get record size
/// </summary>
/// <param name="key"></param>
/// <param name="value"></param>
/// <returns></returns>
public abstract int GetRecordSize(ref Key key, ref Value value);
/// <summary>
/// Allocate page
/// </summary>
/// <param name="index"></param>
internal abstract void AllocatePage(int index);
/// <summary>
/// Whether page is allocated
/// </summary>
/// <param name="pageIndex"></param>
/// <returns></returns>
protected abstract bool IsAllocated(int pageIndex);
/// <summary>
/// Populate page
/// </summary>
/// <param name="src"></param>
/// <param name="required_bytes"></param>
/// <param name="destinationPage"></param>
internal abstract void PopulatePage(byte* src, int required_bytes, long destinationPage);
/// <summary>
/// Write async to device
/// </summary>
/// <typeparam name="TContext"></typeparam>
/// <param name="startPage"></param>
/// <param name="flushPage"></param>
/// <param name="pageSize"></param>
/// <param name="callback"></param>
/// <param name="result"></param>
/// <param name="device"></param>
/// <param name="objectLogDevice"></param>
protected abstract void WriteAsyncToDevice<TContext>(long startPage, long flushPage, int pageSize, IOCompletionCallback callback, PageAsyncFlushResult<TContext> result, IDevice device, IDevice objectLogDevice);
/// <summary>
/// Read objects to memory (async)
/// </summary>
/// <param name="fromLogical"></param>
/// <param name="numBytes"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="result"></param>
protected abstract void AsyncReadRecordObjectsToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext<Key, Value> context, SectorAlignedMemory result = default(SectorAlignedMemory));
/// <summary>
/// Read page (async)
/// </summary>
/// <typeparam name="TContext"></typeparam>
/// <param name="alignedSourceAddress"></param>
/// <param name="destinationPageIndex"></param>
/// <param name="aligned_read_length"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
/// <param name="device"></param>
/// <param name="objlogDevice"></param>
protected abstract void ReadAsync<TContext>(ulong alignedSourceAddress, int destinationPageIndex, uint aligned_read_length, IOCompletionCallback callback, PageAsyncReadResult<TContext> asyncResult, IDevice device, IDevice objlogDevice);
/// <summary>
/// Clear page
/// </summary>
/// <param name="page">Page number to be cleared</param>
/// <param name="offset">Offset to clear from (if partial clear)</param>
protected abstract void ClearPage(long page, int offset = 0);
/// <summary>
/// Write page (async)
/// </summary>
/// <typeparam name="TContext"></typeparam>
/// <param name="flushPage"></param>
/// <param name="callback"></param>
/// <param name="asyncResult"></param>
protected abstract void WriteAsync<TContext>(long flushPage, IOCompletionCallback callback, PageAsyncFlushResult<TContext> asyncResult);
/// <summary>
/// Retrieve full record
/// </summary>
/// <param name="record"></param>
/// <param name="ctx"></param>
/// <returns></returns>
protected abstract bool RetrievedFullRecord(byte* record, ref AsyncIOContext<Key, Value> ctx);
/// <summary>
/// Retrieve key from context
/// </summary>
/// <param name="ctx"></param>
/// <returns></returns>
public virtual ref Key GetContextRecordKey(ref AsyncIOContext<Key, Value> ctx) => ref ctx.key;
/// <summary>
/// Retrieve value from context
/// </summary>
/// <param name="ctx"></param>
/// <returns></returns>
public virtual ref Value GetContextRecordValue(ref AsyncIOContext<Key, Value> ctx) => ref ctx.value;
/// <summary>
/// Get heap container for pending key
/// </summary>
/// <param name="key"></param>
/// <returns></returns>
public abstract IHeapContainer<Key> GetKeyContainer(ref Key key);
/// <summary>
/// Get heap container for pending value
/// </summary>
/// <param name="value"></param>
/// <returns></returns>
public abstract IHeapContainer<Value> GetValueContainer(ref Value value);
/// <summary>
/// Copy value to context
/// </summary>
/// <param name="ctx"></param>
/// <param name="value"></param>
public virtual void PutContext(ref AsyncIOContext<Key, Value> ctx, ref Value value) => ctx.value = value;
/// <summary>
/// Whether key has objects
/// </summary>
/// <returns></returns>
public abstract bool KeyHasObjects();
/// <summary>
/// Whether value has objects
/// </summary>
/// <returns></returns>
public abstract bool ValueHasObjects();
/// <summary>
/// Get segment offsets
/// </summary>
/// <returns></returns>
public abstract long[] GetSegmentOffsets();
/// <summary>
/// Pull-based scan interface for HLOG
/// </summary>
/// <param name="beginAddress"></param>
/// <param name="endAddress"></param>
/// <param name="scanBufferingMode"></param>
/// <returns></returns>
public abstract IFasterScanIterator<Key, Value> Scan(long beginAddress, long endAddress, ScanBufferingMode scanBufferingMode = ScanBufferingMode.DoublePageBuffering);
#endregion
/// <summary>
/// Instantiate base allocator
/// </summary>
/// <param name="settings"></param>
/// <param name="comparer"></param>
/// <param name="evictCallback"></param>
/// <param name="epoch"></param>
/// <param name="flushCallback"></param>
public AllocatorBase(LogSettings settings, IFasterEqualityComparer<Key> comparer, Action<long, long> evictCallback, LightEpoch epoch, Action<CommitInfo> flushCallback)
{
if (evictCallback != null)
{
ReadCache = true;
EvictCallback = evictCallback;
}
FlushCallback = flushCallback;
this.comparer = comparer;
if (epoch == null)
{
this.epoch = new LightEpoch();
ownedEpoch = true;
}
else
this.epoch = epoch;
settings.LogDevice.Initialize(1L << settings.SegmentSizeBits, epoch);
settings.ObjectLogDevice?.Initialize(1L << settings.SegmentSizeBits, epoch);
// Page size
LogPageSizeBits = settings.PageSizeBits;
PageSize = 1 << LogPageSizeBits;
PageSizeMask = PageSize - 1;
// Total HLOG size
LogTotalSizeBits = settings.MemorySizeBits;
LogTotalSizeBytes = 1L << LogTotalSizeBits;
BufferSize = (int)(LogTotalSizeBytes / (1L << LogPageSizeBits));
BufferSizeMask = BufferSize - 1;
// HeadOffset lag (from tail).
var headOffsetLagSize = BufferSize - 1; // (ReadCache ? ReadCacheHeadOffsetLagNumPages : HeadOffsetLagNumPages);
if (BufferSize > 1 && HeadOffsetExtraLag) headOffsetLagSize--;
HeadOffsetLagAddress = (long)headOffsetLagSize << LogPageSizeBits;
// ReadOnlyOffset lag (from tail). This should not exceed HeadOffset lag.
LogMutableFraction = settings.MutableFraction;
ReadOnlyLagAddress = Math.Min((long)(LogMutableFraction * BufferSize) << LogPageSizeBits, HeadOffsetLagAddress);
// Segment size
LogSegmentSizeBits = settings.SegmentSizeBits;
SegmentSize = 1L << LogSegmentSizeBits;
SegmentBufferSize = 1 + (LogTotalSizeBytes / SegmentSize < 1 ? 1 : (int)(LogTotalSizeBytes / SegmentSize));
if (SegmentSize < PageSize)
throw new Exception("Segment must be at least of page size");
if (BufferSize < 1)
{
throw new Exception("Log buffer must be of size at least 1 page");
}
PageStatusIndicator = new FullPageStatus[BufferSize];
PendingFlush = new PendingFlushList[BufferSize];
for (int i = 0; i < BufferSize; i++)
PendingFlush[i] = new PendingFlushList();
device = settings.LogDevice;
sectorSize = (int)device.SectorSize;
AlignedPageSizeBytes = ((PageSize + (sectorSize - 1)) & ~(sectorSize - 1));
}
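// Worked example of the sizing arithmetic above (hypothetical settings, for
// illustration only): with PageSizeBits = 25 and MemorySizeBits = 34,
//   PageSize          = 1 << 25       = 32 MB
//   LogTotalSizeBytes = 1L << 34      = 16 GB
//   BufferSize        = 16 GB / 32 MB = 512 pages in the circular buffer
// With MutableFraction = 0.9, ReadOnlyLagAddress spans (long)(0.9 * 512) = 460
// pages behind the tail; addresses older than that lag are shifted to the
// read-only (immutable) region and become eligible for flushing.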
/// <summary>
/// Initialize allocator
/// </summary>
/// <param name="firstValidAddress"></param>
protected void Initialize(long firstValidAddress)
{
Debug.Assert(firstValidAddress <= PageSize);
bufferPool = new SectorAlignedBufferPool(1, sectorSize);
long tailPage = firstValidAddress >> LogPageSizeBits;
int tailPageIndex = (int)(tailPage % BufferSize);
AllocatePage(tailPageIndex);
// Allocate next page as well
int nextPageIndex = (int)(tailPage + 1) % BufferSize;
if ((!IsAllocated(nextPageIndex)))
{
AllocatePage(nextPageIndex);
}
SafeReadOnlyAddress = firstValidAddress;
ReadOnlyAddress = firstValidAddress;
SafeHeadAddress = firstValidAddress;
HeadAddress = firstValidAddress;
ClosedUntilAddress = firstValidAddress;
FlushedUntilAddress = firstValidAddress;
BeginAddress = firstValidAddress;
TailPageOffset.Page = (int)(firstValidAddress >> LogPageSizeBits);
TailPageOffset.Offset = (int)(firstValidAddress & PageSizeMask);
}
/// <summary>
/// Acquire thread
/// </summary>
public void Acquire()
{
if (ownedEpoch)
epoch.Acquire();
}
/// <summary>
/// Release thread
/// </summary>
public void Release()
{
if (ownedEpoch)
epoch.Release();
}
/// <summary>
/// Dispose allocator
/// </summary>
public virtual void Dispose()
{
TailPageOffset.Page = 0;
TailPageOffset.Offset = 0;
SafeReadOnlyAddress = 0;
ReadOnlyAddress = 0;
SafeHeadAddress = 0;
HeadAddress = 0;
BeginAddress = 1;
if (ownedEpoch)
epoch.Dispose();
bufferPool.Free();
OnReadOnlyObserver?.OnCompleted();
}
/// <summary>
/// Delete in-memory portion of the log
/// </summary>
internal abstract void DeleteFromMemory();
/// <summary>
/// Segment size
/// </summary>
/// <returns></returns>
public long GetSegmentSize()
{
return SegmentSize;
}
/// <summary>
/// Get tail address
/// </summary>
/// <returns></returns>
public long GetTailAddress()
{
var local = TailPageOffset;
if (local.Offset >= PageSize)
{
local.Page++;
local.Offset = 0;
}
return ((long)local.Page << LogPageSizeBits) | (uint)local.Offset;
}
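// Logical addresses used throughout this class compose as
//   logicalAddress = (page << LogPageSizeBits) | offsetInPage
// For example (illustrative), with LogPageSizeBits = 25, page 3 and offset 100
// give (3L << 25) | 100 = 100663396; GetPage and GetOffsetInPage below invert
// this decomposition.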
/// <summary>
/// Get page
/// </summary>
/// <param name="logicalAddress"></param>
/// <returns></returns>
public long GetPage(long logicalAddress)
{
return (logicalAddress >> LogPageSizeBits);
}
/// <summary>
/// Get page index for page
/// </summary>
/// <param name="page"></param>
/// <returns></returns>
public int GetPageIndexForPage(long page)
{
return (int)(page % BufferSize);
}
/// <summary>
/// Get page index for address
/// </summary>
/// <param name="address"></param>
/// <returns></returns>
public int GetPageIndexForAddress(long address)
{
return (int)((address >> LogPageSizeBits) % BufferSize);
}
/// <summary>
/// Get capacity (number of pages)
/// </summary>
/// <returns></returns>
public int GetCapacityNumPages()
{
return BufferSize;
}
/// <summary>
/// Get page size
/// </summary>
/// <returns></returns>
public long GetPageSize()
{
return PageSize;
}
/// <summary>
/// Get offset in page
/// </summary>
/// <param name="address"></param>
/// <returns></returns>
public long GetOffsetInPage(long address)
{
return address & PageSizeMask;
}
/// <summary>
/// Get sector size for main hlog device
/// </summary>
/// <returns></returns>
public int GetDeviceSectorSize()
{
return sectorSize;
}
/// <summary>
/// Try allocate, no thread spinning allowed
/// May return 0 in case of inability to allocate
/// </summary>
/// <param name="numSlots">Number of slots to allocate</param>
/// <returns>Logical address of allocated block, or 0 if allocation failed</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public long TryAllocate(int numSlots = 1)
{
if (numSlots > PageSize)
throw new Exception("Entry does not fit on page");
PageOffset localTailPageOffset = default(PageOffset);
// Necessary to check because threads keep retrying and we do not
// want to overflow offset more than once per thread
if (TailPageOffset.Offset > PageSize)
return 0;
// Determine insertion index.
// ReSharper disable once CSharpWarnings::CS0420
#pragma warning disable 420
localTailPageOffset.PageAndOffset = Interlocked.Add(ref TailPageOffset.PageAndOffset, numSlots);
#pragma warning restore 420
int page = localTailPageOffset.Page;
int offset = localTailPageOffset.Offset - numSlots;
#region HANDLE PAGE OVERFLOW
if (localTailPageOffset.Offset > PageSize)
{
if (offset > PageSize)
{
return 0;
}
// The thread that "makes" the offset incorrect
// is the one that is elected to fix it and
// shift read-only/head.
long shiftAddress = ((long)(localTailPageOffset.Page + 1)) << LogPageSizeBits;
PageAlignedShiftReadOnlyAddress(shiftAddress);
PageAlignedShiftHeadAddress(shiftAddress);
if (CannotAllocate(localTailPageOffset.Page + 1))
{
// We should not allocate the next page; reset to end of page
// so that next attempt can retry
localTailPageOffset.Offset = PageSize;
Interlocked.Exchange(ref TailPageOffset.PageAndOffset, localTailPageOffset.PageAndOffset);
return 0;
}
// Allocate next page in advance, if needed
int nextPageIndex = (localTailPageOffset.Page + 2) % BufferSize;
if ((!IsAllocated(nextPageIndex)))
{
AllocatePage(nextPageIndex);
}
localTailPageOffset.Page++;
localTailPageOffset.Offset = 0;
TailPageOffset = localTailPageOffset;
return 0;
}
#endregion
return (((long)page) << LogPageSizeBits) | ((long)offset);
}
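// Minimal caller sketch for TryAllocate (illustrative only; FasterKV wraps this
// in its own retry logic). A zero return means the tail spilled onto a new page
// or the next page is not yet safe to allocate, so the caller refreshes its
// epoch and retries instead of spinning blindly:
//   long logicalAddress;
//   while ((logicalAddress = TryAllocate(recordSize)) == 0)
//   {
//       epoch.ProtectAndDrain();
//       Thread.Yield();
//   }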
private bool CannotAllocate(int page)
{
return
(page >= BufferSize + (ClosedUntilAddress >> LogPageSizeBits));
}
/// <summary>
/// Used by applications to make the current state of the database immutable quickly
/// </summary>
/// <param name="tailAddress"></param>
public bool ShiftReadOnlyToTail(out long tailAddress)
{
tailAddress = GetTailAddress();
long localTailAddress = tailAddress;
long currentReadOnlyOffset = ReadOnlyAddress;
if (Utility.MonotonicUpdate(ref ReadOnlyAddress, tailAddress, out long oldReadOnlyOffset))
{
epoch.BumpCurrentEpoch(() => OnPagesMarkedReadOnly(localTailAddress));
return true;
}
return false;
}
/// <summary>
/// Used by applications to move read-only forward
/// </summary>
/// <param name="newReadOnlyAddress"></param>
public bool ShiftReadOnlyAddress(long newReadOnlyAddress)
{
if (Utility.MonotonicUpdate(ref ReadOnlyAddress, newReadOnlyAddress, out long oldReadOnlyOffset))
{
epoch.BumpCurrentEpoch(() => OnPagesMarkedReadOnly(newReadOnlyAddress));
return true;
}
return false;
}
/// <summary>
/// Shift begin address
/// </summary>
/// <param name="newBeginAddress"></param>
public void ShiftBeginAddress(long newBeginAddress)
{
// First update the begin address
var b = Utility.MonotonicUpdate(ref BeginAddress, newBeginAddress, out long oldBeginAddress);
b = b && (oldBeginAddress >> LogSegmentSizeBits != newBeginAddress >> LogSegmentSizeBits);
// Then the head address
var h = Utility.MonotonicUpdate(ref HeadAddress, newBeginAddress, out long old);
// Finally the read-only address
var r = Utility.MonotonicUpdate(ref ReadOnlyAddress, newBeginAddress, out old);
if (h || r || b)
{
epoch.Resume();
// Clean up until begin address
epoch.BumpCurrentEpoch(() =>
{
if (r)
{
Utility.MonotonicUpdate(ref SafeReadOnlyAddress, newBeginAddress, out long _old);
Utility.MonotonicUpdate(ref FlushedUntilAddress, newBeginAddress, out _old);
}
if (h) OnPagesClosed(newBeginAddress);
if (b) TruncateUntilAddress(newBeginAddress);
});
epoch.Suspend();
}
}
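// Illustrative use (hypothetical): truncate the entire log up to the current
// tail, e.g. after its contents have been safely copied elsewhere:
//   allocator.ShiftBeginAddress(allocator.GetTailAddress());
// Head and read-only addresses are dragged forward along with the begin
// address, and device truncation runs as an epoch action only after all
// threads have moved past the old region.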
/// <summary>
/// Wraps device truncation (IDevice.TruncateUntilAddress) for allocators that potentially have to interact with multiple devices
/// </summary>
/// <param name="toAddress"></param>
protected virtual void TruncateUntilAddress(long toAddress)
{
device.TruncateUntilAddress(toAddress);
}
/// <summary>
/// Seal: make sure there are no longer any threads writing to the page
/// Flush: send page to secondary store
/// </summary>
/// <param name="newSafeReadOnlyAddress"></param>
public void OnPagesMarkedReadOnly(long newSafeReadOnlyAddress)
{
if (Utility.MonotonicUpdate(ref SafeReadOnlyAddress, newSafeReadOnlyAddress, out long oldSafeReadOnlyAddress))
{
Debug.WriteLine("SafeReadOnly shifted from {0:X} to {1:X}", oldSafeReadOnlyAddress, newSafeReadOnlyAddress);
OnReadOnlyObserver?.OnNext(Scan(oldSafeReadOnlyAddress, newSafeReadOnlyAddress, ScanBufferingMode.NoBuffering));
AsyncFlushPages(oldSafeReadOnlyAddress, newSafeReadOnlyAddress);
}
}
/// <summary>
/// Action to be performed for when all threads have
/// agreed that a page range is closed.
/// </summary>
/// <param name="newSafeHeadAddress"></param>
public void OnPagesClosed(long newSafeHeadAddress)
{
if (Utility.MonotonicUpdate(ref SafeHeadAddress, newSafeHeadAddress, out long oldSafeHeadAddress))
{
Debug.WriteLine("SafeHeadOffset shifted from {0:X} to {1:X}", oldSafeHeadAddress, newSafeHeadAddress);
for (long closePageAddress = oldSafeHeadAddress & ~PageSizeMask; closePageAddress < newSafeHeadAddress; closePageAddress += PageSize)
{
if (newSafeHeadAddress < closePageAddress + PageSize)
{
// Partial page - do not close
return;
}
int closePage = (int)(closePageAddress >> LogPageSizeBits);
int closePageIndex = closePage % BufferSize;
if (!IsAllocated(closePageIndex))
AllocatePage(closePageIndex);
else
ClearPage(closePage);
Utility.MonotonicUpdate(ref PageStatusIndicator[closePageIndex].LastClosedUntilAddress, closePageAddress + PageSize, out _);
ShiftClosedUntilAddress();
if (ClosedUntilAddress > FlushedUntilAddress)
{
throw new Exception($"Closed address {ClosedUntilAddress} exceeds flushed address {FlushedUntilAddress}");
}
}
}
}
private void DebugPrintAddresses(long closePageAddress)
{
var _flush = FlushedUntilAddress;
var _readonly = ReadOnlyAddress;
var _safereadonly = SafeReadOnlyAddress;
var _tail = GetTailAddress();
var _head = HeadAddress;
var _safehead = SafeHeadAddress;
Console.WriteLine("ClosePageAddress: {0}.{1}", GetPage(closePageAddress), GetOffsetInPage(closePageAddress));
Console.WriteLine("FlushedUntil: {0}.{1}", GetPage(_flush), GetOffsetInPage(_flush));
Console.WriteLine("Tail: {0}.{1}", GetPage(_tail), GetOffsetInPage(_tail));
Console.WriteLine("Head: {0}.{1}", GetPage(_head), GetOffsetInPage(_head));
Console.WriteLine("SafeHead: {0}.{1}", GetPage(_safehead), GetOffsetInPage(_safehead));
Console.WriteLine("ReadOnly: {0}.{1}", GetPage(_readonly), GetOffsetInPage(_readonly));
Console.WriteLine("SafeReadOnly: {0}.{1}", GetPage(_safereadonly), GetOffsetInPage(_safereadonly));
}
/// <summary>
/// Called every time a new tail page is allocated. Here the read-only is
/// shifted only to page boundaries unlike ShiftReadOnlyToTail where shifting
/// can happen to any fine-grained address.
/// </summary>
/// <param name="currentTailAddress"></param>
private void PageAlignedShiftReadOnlyAddress(long currentTailAddress)
{
long currentReadOnlyAddress = ReadOnlyAddress;
long pageAlignedTailAddress = currentTailAddress & ~PageSizeMask;
long desiredReadOnlyAddress = (pageAlignedTailAddress - ReadOnlyLagAddress);
if (Utility.MonotonicUpdate(ref ReadOnlyAddress, desiredReadOnlyAddress, out long oldReadOnlyAddress))
{
Debug.WriteLine("Allocate: Moving read-only offset from {0:X} to {1:X}", oldReadOnlyAddress, desiredReadOnlyAddress);
epoch.BumpCurrentEpoch(() => OnPagesMarkedReadOnly(desiredReadOnlyAddress));
}
}
/// <summary>
/// Called whenever a new tail page is allocated or when the user is checking for a failed memory allocation.
/// Tries to shift head address based on the head offset lag size.
/// </summary>
/// <param name="currentTailAddress"></param>
private void PageAlignedShiftHeadAddress(long currentTailAddress)
{
//obtain local values of variables that can change
long currentHeadAddress = HeadAddress;
long currentFlushedUntilAddress = FlushedUntilAddress;
long pageAlignedTailAddress = currentTailAddress & ~PageSizeMask;
long desiredHeadAddress = (pageAlignedTailAddress - HeadOffsetLagAddress);
long newHeadAddress = desiredHeadAddress;
if (currentFlushedUntilAddress < newHeadAddress)
{
newHeadAddress = currentFlushedUntilAddress;
}
newHeadAddress = newHeadAddress & ~PageSizeMask;
if (ReadCache && (newHeadAddress > HeadAddress))
EvictCallback(HeadAddress, newHeadAddress);
if (Utility.MonotonicUpdate(ref HeadAddress, newHeadAddress, out long oldHeadAddress))
{
Debug.WriteLine("Allocate: Moving head offset from {0:X} to {1:X}", oldHeadAddress, newHeadAddress);
epoch.BumpCurrentEpoch(() => OnPagesClosed(newHeadAddress));
}
}
/// <summary>
/// Tries to shift head address to specified value
/// </summary>
/// <param name="desiredHeadAddress"></param>
/// <returns></returns>
public long ShiftHeadAddress(long desiredHeadAddress)
{
//obtain local values of variables that can change
long currentFlushedUntilAddress = FlushedUntilAddress;
long newHeadAddress = desiredHeadAddress;
if (currentFlushedUntilAddress < newHeadAddress)
{
newHeadAddress = currentFlushedUntilAddress;
}
if (ReadCache && (newHeadAddress > HeadAddress))
EvictCallback(HeadAddress, newHeadAddress);
if (Utility.MonotonicUpdate(ref HeadAddress, newHeadAddress, out long oldHeadAddress))
{
Debug.WriteLine("Allocate: Moving head offset from {0:X} to {1:X}", oldHeadAddress, newHeadAddress);
epoch.BumpCurrentEpoch(() => OnPagesClosed(newHeadAddress));
}
return newHeadAddress;
}
/// <summary>
/// Every async flush callback tries to update the flushed until address to the latest value possible.
/// Is there a better way to do this with enabling fine-grained addresses (not necessarily at page boundaries)?
/// </summary>
protected void ShiftFlushedUntilAddress()
{
long currentFlushedUntilAddress = FlushedUntilAddress;
long page = GetPage(currentFlushedUntilAddress);
bool update = false;
long pageLastFlushedAddress = PageStatusIndicator[page % BufferSize].LastFlushedUntilAddress;
while (pageLastFlushedAddress >= currentFlushedUntilAddress && currentFlushedUntilAddress >= (page << LogPageSizeBits))
{
currentFlushedUntilAddress = pageLastFlushedAddress;
update = true;
page++;
pageLastFlushedAddress = PageStatusIndicator[page % BufferSize].LastFlushedUntilAddress;
}
if (update)
{
if (Utility.MonotonicUpdate(ref FlushedUntilAddress, currentFlushedUntilAddress, out long oldFlushedUntilAddress))
{
uint errorCode = 0;
if (errorList.Count > 0)
{
errorCode = errorList.CheckAndWait(oldFlushedUntilAddress, currentFlushedUntilAddress);
}
FlushCallback?.Invoke(
new CommitInfo
{
BeginAddress = BeginAddress,
FromAddress = oldFlushedUntilAddress,
UntilAddress = currentFlushedUntilAddress,
ErrorCode = errorCode
});
if (errorList.Count > 0)
{
errorList.RemoveUntil(currentFlushedUntilAddress);
}
}
}
}
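// Example of how the scan above advances FlushedUntilAddress (illustrative
// numbers): suppose FlushedUntilAddress sits at the start of page 4, page 4's
// LastFlushedUntilAddress records a flush to the end of that page, and page 5's
// records a flush to its midpoint. The loop walks page 4, then page 5, stops at
// page 6, and publishes a FlushedUntilAddress in the middle of page 5; flush
// completions need not arrive page-aligned or in order.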
/// <summary>
/// Shift ClosedUntil address
/// </summary>
protected void ShiftClosedUntilAddress()
{
long currentClosedUntilAddress = ClosedUntilAddress;
long page = GetPage(currentClosedUntilAddress);
bool update = false;
long pageLastClosedAddress = PageStatusIndicator[page % BufferSize].LastClosedUntilAddress;
while (pageLastClosedAddress >= currentClosedUntilAddress && currentClosedUntilAddress >= (page << LogPageSizeBits))
{
currentClosedUntilAddress = pageLastClosedAddress;
update = true;
page++;
pageLastClosedAddress = PageStatusIndicator[(int)(page % BufferSize)].LastClosedUntilAddress;
}
if (update)
{
Utility.MonotonicUpdate(ref ClosedUntilAddress, currentClosedUntilAddress, out long oldClosedUntilAddress);
}
}
/// <summary>
/// Reset for recovery
/// </summary>
/// <param name="tailAddress"></param>
/// <param name="headAddress"></param>
/// <param name="beginAddress"></param>
public void RecoveryReset(long tailAddress, long headAddress, long beginAddress)
{
long tailPage = GetPage(tailAddress);
long offsetInPage = GetOffsetInPage(tailAddress);
TailPageOffset.Page = (int)tailPage;
TailPageOffset.Offset = (int)offsetInPage;
// allocate next page as well - this is an invariant in the allocator!
var pageIndex = (TailPageOffset.Page % BufferSize);
var nextPageIndex = (pageIndex + 1) % BufferSize;
if (tailAddress > 0)
if (!IsAllocated(nextPageIndex))
AllocatePage(nextPageIndex);
BeginAddress = beginAddress;
HeadAddress = headAddress;
SafeHeadAddress = headAddress;
ClosedUntilAddress = headAddress;
FlushedUntilAddress = tailAddress;
ReadOnlyAddress = tailAddress;
SafeReadOnlyAddress = tailAddress;
// for the last page which contains tailoffset, it must be open
pageIndex = GetPageIndexForAddress(tailAddress);
// clear the last page starting from tail address
ClearPage(pageIndex, (int)GetOffsetInPage(tailAddress));
// Printing debug info
Debug.WriteLine("******* Recovered HybridLog Stats *******");
Debug.WriteLine("Head Address: {0}", HeadAddress);
Debug.WriteLine("Safe Head Address: {0}", SafeHeadAddress);
Debug.WriteLine("ReadOnly Address: {0}", ReadOnlyAddress);
Debug.WriteLine("Safe ReadOnly Address: {0}", SafeReadOnlyAddress);
Debug.WriteLine("Tail Address: {0}", tailAddress);
}
/// <summary>
/// Invoked by users to obtain a record from disk. It uses sector aligned memory to read
/// the record efficiently into memory.
/// </summary>
/// <param name="fromLogical"></param>
/// <param name="numBytes"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="result"></param>
internal void AsyncReadRecordToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, AsyncIOContext<Key, Value> context, SectorAlignedMemory result = default(SectorAlignedMemory))
{
ulong fileOffset = (ulong)(AlignedPageSizeBytes * (fromLogical >> LogPageSizeBits) + (fromLogical & PageSizeMask));
ulong alignedFileOffset = (ulong)(((long)fileOffset / sectorSize) * sectorSize);
uint alignedReadLength = (uint)((long)fileOffset + numBytes - (long)alignedFileOffset);
alignedReadLength = (uint)((alignedReadLength + (sectorSize - 1)) & ~(sectorSize - 1));
var record = bufferPool.Get((int)alignedReadLength);
record.valid_offset = (int)(fileOffset - alignedFileOffset);
record.available_bytes = (int)(alignedReadLength - (fileOffset - alignedFileOffset));
record.required_bytes = numBytes;
var asyncResult = default(AsyncGetFromDiskResult<AsyncIOContext<Key, Value>>);
asyncResult.context = context;
asyncResult.context.record = record;
device.ReadAsync(alignedFileOffset,
(IntPtr)asyncResult.context.record.aligned_pointer,
alignedReadLength,
callback,
asyncResult);
}
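// Worked example of the sector-alignment arithmetic above (hypothetical
// values): with sectorSize = 512, reading numBytes = 100 at fileOffset = 1000
// gives
//   alignedFileOffset = (1000 / 512) * 512     = 512
//   alignedReadLength = 1000 + 100 - 512 = 588 -> rounded up to 1024
// so the device reads two whole sectors, and valid_offset = 1000 - 512 = 488
// locates the record within the returned buffer.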
/// <summary>
/// Read record to memory - simple version
/// </summary>
/// <param name="fromLogical"></param>
/// <param name="numBytes"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
internal void AsyncReadRecordToMemory(long fromLogical, int numBytes, IOCompletionCallback callback, ref SimpleReadContext context)
{
ulong fileOffset = (ulong)(AlignedPageSizeBytes * (fromLogical >> LogPageSizeBits) + (fromLogical & PageSizeMask));
ulong alignedFileOffset = (ulong)(((long)fileOffset / sectorSize) * sectorSize);
uint alignedReadLength = (uint)((long)fileOffset + numBytes - (long)alignedFileOffset);
alignedReadLength = (uint)((alignedReadLength + (sectorSize - 1)) & ~(sectorSize - 1));
context.record = bufferPool.Get((int)alignedReadLength);
context.record.valid_offset = (int)(fileOffset - alignedFileOffset);
context.record.available_bytes = (int)(alignedReadLength - (fileOffset - alignedFileOffset));
context.record.required_bytes = numBytes;
device.ReadAsync(alignedFileOffset,
(IntPtr)context.record.aligned_pointer,
alignedReadLength,
callback,
context);
}
/// <summary>
/// Read pages from specified device
/// </summary>
/// <typeparam name="TContext"></typeparam>
/// <param name="readPageStart"></param>
/// <param name="numPages"></param>
/// <param name="untilAddress"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="devicePageOffset"></param>
/// <param name="logDevice"></param>
/// <param name="objectLogDevice"></param>
public void AsyncReadPagesFromDevice<TContext>(
long readPageStart,
int numPages,
long untilAddress,
IOCompletionCallback callback,
TContext context,
long devicePageOffset = 0,
IDevice logDevice = null, IDevice objectLogDevice = null)
{
AsyncReadPagesFromDevice(readPageStart, numPages, untilAddress, callback, context,
out _, devicePageOffset, logDevice, objectLogDevice);
}
/// <summary>
/// Read pages from specified device
/// </summary>
/// <typeparam name="TContext"></typeparam>
/// <param name="readPageStart"></param>
/// <param name="numPages"></param>
/// <param name="untilAddress"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
/// <param name="completed"></param>
/// <param name="devicePageOffset"></param>
/// <param name="device"></param>
/// <param name="objectLogDevice"></param>
private void AsyncReadPagesFromDevice<TContext>(
long readPageStart,
int numPages,
long untilAddress,
IOCompletionCallback callback,
TContext context,
out CountdownEvent completed,
long devicePageOffset = 0,
IDevice device = null, IDevice objectLogDevice = null)
{
var usedDevice = device;
IDevice usedObjlogDevice = objectLogDevice;
if (device == null)
{
usedDevice = this.device;
}
completed = new CountdownEvent(numPages);
for (long readPage = readPageStart; readPage < (readPageStart + numPages); readPage++)
{
int pageIndex = (int)(readPage % BufferSize);
if (!IsAllocated(pageIndex))
{
// Allocate a new page
AllocatePage(pageIndex);
}
else
{
ClearPage(readPage);
}
var asyncResult = new PageAsyncReadResult<TContext>()
{
page = readPage,
context = context,
handle = completed,
maxPtr = PageSize
};
ulong offsetInFile = (ulong)(AlignedPageSizeBytes * readPage);
uint readLength = (uint)AlignedPageSizeBytes;
long adjustedUntilAddress = (AlignedPageSizeBytes * (untilAddress >> LogPageSizeBits) + (untilAddress & PageSizeMask));
if (adjustedUntilAddress > 0 && ((adjustedUntilAddress - (long)offsetInFile) < PageSize))
{
readLength = (uint)(adjustedUntilAddress - (long)offsetInFile);
asyncResult.maxPtr = readLength;
readLength = (uint)((readLength + (sectorSize - 1)) & ~(sectorSize - 1));
}
if (device != null)
offsetInFile = (ulong)(AlignedPageSizeBytes * (readPage - devicePageOffset));
ReadAsync(offsetInFile, pageIndex, readLength, callback, asyncResult, usedDevice, usedObjlogDevice);
}
}
/// <summary>
/// Flush page range to disk
/// Called when all threads have agreed that a page range is sealed.
/// </summary>
/// <param name="fromAddress"></param>
/// <param name="untilAddress"></param>
public void AsyncFlushPages(long fromAddress, long untilAddress)
{
long startPage = fromAddress >> LogPageSizeBits;
long endPage = untilAddress >> LogPageSizeBits;
int numPages = (int)(endPage - startPage);
long offsetInStartPage = GetOffsetInPage(fromAddress);
long offsetInEndPage = GetOffsetInPage(untilAddress);
// Extra (partial) page being flushed
if (offsetInEndPage > 0)
numPages++;
/* Request asynchronous writes to the device. If waitForPendingFlushComplete
* is set, then a CountDownEvent is set in the callback handle.
*/
for (long flushPage = startPage; flushPage < (startPage + numPages); flushPage++)
{
long pageStartAddress = flushPage << LogPageSizeBits;
long pageEndAddress = (flushPage + 1) << LogPageSizeBits;
var asyncResult = new PageAsyncFlushResult<Empty>
{
page = flushPage,
count = 1,
partial = false,
fromAddress = pageStartAddress,
untilAddress = pageEndAddress
};
if (
((fromAddress > pageStartAddress) && (fromAddress < pageEndAddress)) ||
((untilAddress > pageStartAddress) && (untilAddress < pageEndAddress))
)
{
asyncResult.partial = true;
if (untilAddress < pageEndAddress)
asyncResult.untilAddress = untilAddress;
if (fromAddress > pageStartAddress)
asyncResult.fromAddress = fromAddress;
}
// Partial page starting point, need to wait until the
// ongoing adjacent flush is completed to ensure correctness
if (GetOffsetInPage(asyncResult.fromAddress) > 0)
{
// Enqueue work in shared queue
var index = GetPageIndexForAddress(asyncResult.fromAddress);
PendingFlush[index].Add(asyncResult);
if (PendingFlush[index].RemoveAdjacent(FlushedUntilAddress, out PageAsyncFlushResult<Empty> request))
{
WriteAsync(request.fromAddress >> LogPageSizeBits, AsyncFlushPageCallback, request);
}
}
else
WriteAsync(flushPage, AsyncFlushPageCallback, asyncResult);
}
}
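// Example of the partial-flush handling above (illustrative): flushing from the
// middle of page 10 to the middle of page 12 issues three writes - a partial
// write for the tail of page 10, a full-page write for page 11, and a partial
// write for the head of page 12. The page-10 request starts mid-page, so it is
// queued in PendingFlush and only issued once FlushedUntilAddress reaches its
// fromAddress, ensuring adjacent flushes of the same page do not interleave.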
/// <summary>
/// Flush pages asynchronously
/// </summary>
/// <typeparam name="TContext"></typeparam>
/// <param name="flushPageStart"></param>
/// <param name="numPages"></param>
/// <param name="callback"></param>
/// <param name="context"></param>
public void AsyncFlushPages<TContext>(
long flushPageStart,
int numPages,
IOCompletionCallback callback,
TContext context)
{
for (long flushPage = flushPageStart; flushPage < (flushPageStart + numPages); flushPage++)
{
int pageIndex = GetPageIndexForPage(flushPage);
var asyncResult = new PageAsyncFlushResult<TContext>()
{
page = flushPage,
context = context,
count = 1,
partial = false,
untilAddress = (flushPage + 1) << LogPageSizeBits
};
WriteAsync(flushPage, callback, asyncResult);
}
}
/// <summary>
/// Flush pages from startPage (inclusive) to endPage (exclusive)
/// to specified log device and obj device
/// </summary>
/// <param name="startPage"></param>
/// <param name="endPage"></param>
/// <param name="endLogicalAddress"></param>
/// <param name="device"></param>
/// <param name="objectLogDevice"></param>
/// <param name="completed"></param>
public void AsyncFlushPagesToDevice(long startPage, long endPage, long endLogicalAddress, IDevice device, IDevice objectLogDevice, out CountdownEvent completed)
{
int totalNumPages = (int)(endPage - startPage);
completed = new CountdownEvent(totalNumPages);
for (long flushPage = startPage; flushPage < endPage; flushPage++)
{
var asyncResult = new PageAsyncFlushResult<Empty>
{
handle = completed,
count = 1
};
var pageSize = PageSize;
if (flushPage == endPage - 1)
pageSize = (int)(endLogicalAddress - (flushPage << LogPageSizeBits));
// Intended destination is flushPage
WriteAsyncToDevice(startPage, flushPage, pageSize, AsyncFlushPageToDeviceCallback, asyncResult, device, objectLogDevice);
}
}
/// <summary>
/// Async get from disk
/// </summary>
/// <param name="fromLogical"></param>
/// <param name="numBytes"></param>
/// <param name="context"></param>
/// <param name="result"></param>
public void AsyncGetFromDisk(long fromLogical,
int numBytes,
AsyncIOContext<Key, Value> context,
SectorAlignedMemory result = default(SectorAlignedMemory))
{
if (epoch.IsProtected()) // Do not spin for unprotected IO threads
{
while (numPendingReads > 120)
{
Thread.Yield();
epoch.ProtectAndDrain();
}
}
Interlocked.Increment(ref numPendingReads);
if (result == null)
AsyncReadRecordToMemory(fromLogical, numBytes, AsyncGetFromDiskCallback, context, result);
else
AsyncReadRecordObjectsToMemory(fromLogical, numBytes, AsyncGetFromDiskCallback, context, result);
}
private void AsyncGetFromDiskCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
var result = (AsyncGetFromDiskResult<AsyncIOContext<Key, Value>>)Overlapped.Unpack(overlap).AsyncResult;
Interlocked.Decrement(ref numPendingReads);
var ctx = result.context;
var record = ctx.record.GetValidPointer();
int requiredBytes = GetRequiredRecordSize((long)record, ctx.record.available_bytes);
if (ctx.record.available_bytes >= requiredBytes)
{
// We have the complete record.
if (RetrievedFullRecord(record, ref ctx))
{
if (comparer.Equals(ref ctx.request_key.Get(), ref GetContextRecordKey(ref ctx)))
{
// The keys are same, so I/O is complete
// ctx.record = result.record;
ctx.callbackQueue.Add(ctx);
}
else
{
var oldAddress = ctx.logicalAddress;
// Keys are not same. I/O is not complete
ctx.logicalAddress = GetInfoFromBytePointer(record).PreviousAddress;
if (ctx.logicalAddress >= BeginAddress)
{
ctx.record.Return();
ctx.record = ctx.objBuffer = default(SectorAlignedMemory);
AsyncGetFromDisk(ctx.logicalAddress, requiredBytes, ctx);
}
else
{
ctx.callbackQueue.Add(ctx);
}
}
}
}
else
{
ctx.record.Return();
AsyncGetFromDisk(ctx.logicalAddress, requiredBytes, ctx);
}
Overlapped.Free(overlap);
}
// static DateTime last = DateTime.Now;
/// <summary>
/// IOCompletion callback for page flush
/// </summary>
/// <param name="errorCode"></param>
/// <param name="numBytes"></param>
/// <param name="overlap"></param>
private void AsyncFlushPageCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
/*
if (DateTime.Now - last > TimeSpan.FromSeconds(7))
{
last = DateTime.Now;
errorCode = 1;
Console.WriteLine("Disk error");
}*/
// Set the page status to flushed
PageAsyncFlushResult<Empty> result = (PageAsyncFlushResult<Empty>)Overlapped.Unpack(overlap).AsyncResult;
if (Interlocked.Decrement(ref result.count) == 0)
{
if (errorCode != 0)
{
errorList.Add(result.fromAddress);
}
Utility.MonotonicUpdate(ref PageStatusIndicator[result.page % BufferSize].LastFlushedUntilAddress, result.untilAddress, out _);
ShiftFlushedUntilAddress();
result.Free();
}
var _flush = FlushedUntilAddress;
if (GetOffsetInPage(_flush) > 0 && PendingFlush[GetPage(_flush) % BufferSize].RemoveAdjacent(_flush, out PageAsyncFlushResult<Empty> request))
{
WriteAsync(request.fromAddress >> LogPageSizeBits, AsyncFlushPageCallback, request);
}
Overlapped.Free(overlap);
}
/// <summary>
/// IOCompletion callback for page flush to device
/// </summary>
/// <param name="errorCode"></param>
/// <param name="numBytes"></param>
/// <param name="overlap"></param>
private void AsyncFlushPageToDeviceCallback(uint errorCode, uint numBytes, NativeOverlapped* overlap)
{
if (errorCode != 0)
{
Trace.TraceError("OverlappedStream GetQueuedCompletionStatus error: {0}", errorCode);
}
PageAsyncFlushResult<Empty> result = (PageAsyncFlushResult<Empty>)Overlapped.Unpack(overlap).AsyncResult;
if (Interlocked.Decrement(ref result.count) == 0)
{
result.Free();
}
Overlapped.Free(overlap);
}
/// <summary>
/// Shallow copy
/// </summary>
/// <param name="src"></param>
/// <param name="dst"></param>
public virtual void ShallowCopy(ref Key src, ref Key dst)
{
dst = src;
}
/// <summary>
/// Shallow copy
/// </summary>
/// <param name="src"></param>
/// <param name="dst"></param>
public virtual void ShallowCopy(ref Value src, ref Value dst)
{
dst = src;
}
private string PrettyPrint(long address)
{
return $"{GetPage(address)}:{GetOffsetInPage(address)}";
}
}
}