Dryad/DryadVertex/VertexHost/system/managedchannel/AzureReader.cs

353 lines
12 KiB
C#

/*
Copyright (c) Microsoft Corporation
All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License
at http://www.apache.org/licenses/LICENSE-2.0
THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF
TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
See the Apache Version 2.0 License for specific language governing permissions and
limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Collections.Specialized;
using System.IO;
using System.Linq;
using System.Net;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Research.Peloponnese.Storage;
namespace Microsoft.Research.Dryad.Channel
{
internal class AzureReader : StreamReader
{
/// <summary>Timeout handed to the DFS client when opening a blob stream: 30 seconds before retrying.</summary>
private static readonly TimeSpan ExecutionTimeout = TimeSpan.FromSeconds(30);

/// <summary>DFS client created in <c>Open</c> and used to open each blob stream.</summary>
private AzureDfsClient client;

/// <summary>Stream over the blob that is currently being read.</summary>
private Stream readStream;

/// <summary>
/// Bytes still to be read from the current blob. <c>Int64.MaxValue</c> means no explicit
/// length was supplied and the blob is read until the stream is exhausted.
/// </summary>
private long bytesToRead;

/// <summary>URI this reader was constructed with.</summary>
private Uri source;

/// <summary>
/// -1 when the URI carries no "blobs" query parameter; otherwise the index (into the
/// comma-separated "blobs" list) of the next blob to open.
/// </summary>
private int blobIndex;
/// <summary>
/// Creates a reader for the Azure blob(s) described by <paramref name="path"/>.
/// </summary>
/// <param name="path">Source URI; remembered and exposed through <c>Source</c>.</param>
/// <param name="numberOfReaders">Forwarded to <c>ComputeBufferSize</c> to size the base-class buffer.</param>
/// <param name="client">Reader client passed through to the base class.</param>
/// <param name="logger">Logger passed through to the base class.</param>
public AzureReader(Uri path, int numberOfReaders, IReaderClient client, IDrLogging logger)
    : base(ComputeBufferSize(numberOfReaders), client, logger)
{
    this.source = path;
}
/// <summary>
/// Chooses the read buffer size handed to the base class.
/// </summary>
/// <param name="numberOfReaders">Currently ignored; see the remark in the body.</param>
/// <returns><c>AzureConsts.MaxBlockSize4MB</c>.</returns>
private static int ComputeBufferSize(int numberOfReaders)
{
    // For now the buffer size is fixed at 4MB since that gives decent performance
    // with 4MB blocks. An earlier scheme (4KB blocks, 64 / numberOfReaders blocks per
    // buffer with a floor of 16 blocks) is no longer used.
    int bufferSize = AzureConsts.MaxBlockSize4MB;
    return bufferSize;
}
/// <summary>The URI this reader was constructed with.</summary>
public override Uri Source
{
    get
    {
        return source;
    }
}

/// <summary>Always <c>false</c> for this reader.</summary>
public override bool IsLocal
{
    get
    {
        return false;
    }
}
/// <summary>
/// This reader supplies no custom description for exceptions.
/// </summary>
/// <param name="e">The exception to describe; not inspected.</param>
/// <returns>Always <c>null</c>.</returns>
protected override string DescribeException(Exception e)
{
    string description = null;
    return description;
}
/// <summary>
/// Creates the DFS client from the credentials embedded in the source URI, decides
/// whether this is a single-blob or multi-blob read, and opens the first blob.
/// </summary>
protected override async Task Open()
{
    Log.LogInformation("Opening read for " + source.AbsoluteUri);

    string account, key, container, blobName;
    AzureUtils.FromAzureUri(source, out account, out key, out container, out blobName);

    client = new AzureDfsClient(account, key, container, false, new PeloponneseLogger(Log.Logger));
    client.SetParallelThreadCount(4);

    // A "blobs" query parameter marks a multi-blob read, which starts at index 0;
    // -1 flags a plain single-blob read.
    NameValueCollection query = System.Web.HttpUtility.ParseQueryString(source.Query);
    bool hasBlobList = query["blobs"] != null;
    blobIndex = hasBlobList ? 0 : -1;

    await OpenBlob();
}
/// <summary>
/// Opens the next blob to read and positions the stream according to the optional
/// "offset", "length" and "seekboundaries" query parameters of the source URI, then
/// adds the number of bytes that will be read from this blob to <c>TotalLength</c>.
/// </summary>
/// <remarks>
/// For a multi-blob read (<c>blobIndex &gt;= 0</c>) the blob name is the URI's blob name
/// with the current entry of the comma-separated "blobs" list appended, and
/// <c>blobIndex</c> is advanced. Any stream left over from a previously opened blob is
/// disposed before the new one is opened.
/// </remarks>
/// <exception cref="ApplicationException">
/// "seekboundaries" was requested without both "offset" and "length", or names a record
/// type other than <c>Microsoft.Research.DryadLinq.LineRecord</c>.
/// </exception>
private async Task OpenBlob()
{
    string account, key, container, blobName;
    AzureUtils.FromAzureUri(source, out account, out key, out container, out blobName);
    NameValueCollection query = System.Web.HttpUtility.ParseQueryString(source.Query);

    if (blobIndex >= 0)
    {
        string[] blobs = query["blobs"].Split(',');
        blobName += blobs[blobIndex];
        ++blobIndex;
    }

    // Release the stream of the blob we have just finished with (multi-blob reads);
    // otherwise every blob but the last would leak its stream.
    if (readStream != null)
    {
        readStream.Dispose();
        readStream = null;
    }

    Log.LogInformation("Opening read for blob " + blobName);
    readStream = (await client.GetDfsFileStreamAsync(blobName, ExecutionTimeout, new PeloponneseLogger(Log.Logger))).Stream;

    // The query values are machine-generated, so parse them independently of the
    // current thread culture.
    IFormatProvider invariant = System.Globalization.CultureInfo.InvariantCulture;

    long offset = -1;
    if (query["offset"] != null)
    {
        offset = Int64.Parse(query["offset"], invariant);
        readStream.Seek(offset, SeekOrigin.Begin);
    }

    bytesToRead = Int64.MaxValue;
    if (query["length"] != null)
    {
        bytesToRead = Int64.Parse(query["length"], invariant);
    }

    if (query["seekboundaries"] != null)
    {
        if (offset == -1 || bytesToRead == Int64.MaxValue)
        {
            throw new ApplicationException("Reading " + source.AbsoluteUri + ": Can't look for line endings without block start and end metadata");
        }
        if (query["seekboundaries"] != "Microsoft.Research.DryadLinq.LineRecord")
        {
            throw new ApplicationException("Reading " + source.AbsoluteUri + ": Don't know how to seek for record boundaries of type " + query["seekboundaries"]);
        }

        // SeekLineRecordBoundaries updates bytesToRead
        offset = await SeekLineRecordBoundaries(offset);
        readStream.Seek(offset, SeekOrigin.Begin);
    }

    // With no explicit length the whole stream is counted.
    long thisLength = (bytesToRead == Int64.MaxValue) ? readStream.Length : bytesToRead;

    long currentLength = TotalLength;
    if (currentLength == -1)
    {
        // -1 means no total has been recorded yet: this is the first blob
        currentLength = thisLength;
        Log.LogInformation("Setting Azure read total length to " + currentLength);
    }
    else
    {
        currentLength += thisLength;
        // log the new running total, not just this blob's contribution
        Log.LogInformation("Increasing Azure read total length to " + currentLength);
    }
    TotalLength = currentLength;
}
/// <summary>
/// Number of entries in the comma-separated "blobs" query parameter of the source URI.
/// Only meaningful for multi-blob reads; the parameter must be present.
/// </summary>
private int NumberOfBlobs
{
    get
    {
        string blobList = System.Web.HttpUtility.ParseQueryString(source.Query)["blobs"];
        string[] blobNames = blobList.Split(',');
        return blobNames.Length;
    }
}
/// <summary>
/// Reads the next chunk of data into <paramref name="managedBuffer"/>, moving on to the
/// next blob of a multi-blob read when the current one is exhausted.
/// </summary>
/// <param name="managedBuffer">Destination buffer for the data read.</param>
/// <returns>
/// The byte count read and an eof flag; eof is only reported once the last blob has
/// been exhausted.
/// </returns>
protected override async Task<ReadData> ReadBuffer(byte[] managedBuffer)
{
    for (;;)
    {
        ReadData chunk = await ReadBufferInternal(managedBuffer);

        bool anotherBlobFollows = chunk.eof && blobIndex >= 0 && blobIndex < NumberOfBlobs;
        if (!anotherBlobFollows)
        {
            return chunk;
        }

        // this is a multi-blob read; open the next one
        await OpenBlob();

        if (chunk.nRead > 0)
        {
            // we read some data from the blob that just ended; hand it back now,
            // but not as eof since there is more to come
            chunk.eof = false;
            return chunk;
        }

        // nothing was read; loop round and read from the blob we just opened
    }
}
/// <summary>
/// Reads up to one buffer of data from the currently open blob stream.
/// </summary>
/// <param name="managedBuffer">Destination buffer; at most its length is read.</param>
/// <returns>
/// The number of bytes read and whether the current blob is exhausted. For a bounded
/// read (explicit length) eof is set when the remaining byte count reaches zero; for an
/// unbounded read eof is set only when the stream returns zero bytes.
/// </returns>
/// <exception cref="ApplicationException">
/// A bounded read hit the end of the stream while bytes were still expected.
/// </exception>
private async Task<ReadData> ReadBufferInternal(byte[] managedBuffer)
{
    ReadData readData;

    if (bytesToRead == 0)
    {
        // we knew on open that there was nothing to read
        readData.nRead = 0;
        readData.eof = true;
        return readData;
    }

    int toRead = (int)Math.Min((long)managedBuffer.Length, bytesToRead);
    Log.LogInformation("About to read buffer length " + toRead);

    readData.nRead = await AzureUtils.WrapInRetry(new PeloponneseLogger(Log.Logger), async () =>
    {
        return await readStream.ReadAsync(managedBuffer, 0, toRead);
    });

    if (bytesToRead == Int64.MaxValue)
    {
        // No explicit length: read until the stream is exhausted. A stream is allowed
        // to return fewer bytes than requested before its end, so a short read must not
        // be mistaken for eof; only a zero-byte read marks the end of the stream.
        readData.eof = (readData.nRead == 0);
    }
    else
    {
        if (readData.nRead == 0)
        {
            throw new ApplicationException("Reader " + source.AbsoluteUri + " ran out of data with " + bytesToRead + " bytes remaining to read");
        }

        bytesToRead -= readData.nRead;
        readData.eof = (bytesToRead == 0);
    }

    Log.LogInformation("Read buffer length " + toRead + " got " + readData.nRead + " eof " + readData.eof);
    return readData;
}
/// <summary>
/// Logs that reading has finished and releases the stream of the last blob opened.
/// </summary>
/// <returns>An already-completed task.</returns>
protected override Task OnFinishedRead()
{
    Log.LogInformation("Finished reading " + source.AbsoluteUri);

    // Nothing else disposes the blob stream, so release it here.
    // NOTE(review): assumes the base class issues no further ReadBuffer calls after
    // OnFinishedRead - confirm against the base StreamReader.
    if (readStream != null)
    {
        readStream.Dispose();
        readStream = null;
    }

    return Task.FromResult(true);
}
/// <summary>
/// Adjusts a byte-range read so that it covers whole lines: works out where the first
/// line owned by this reader starts and where the first line owned by the next block's
/// reader starts, and sets <c>bytesToRead</c> to the distance between the two.
/// </summary>
/// <param name="offset">Start of this reader's block within the stream.</param>
/// <returns>
/// The stream offset to start reading from. When this reader owns no line at all,
/// <c>bytesToRead</c> is set to 0 and 0 is returned.
/// </returns>
/// <exception cref="ApplicationException">The computed end precedes the computed start.</exception>
private async Task<long> SeekLineRecordBoundaries(long offset)
{
    long blockEnd = offset + bytesToRead;
    long firstOwnedLine = offset;

    if (firstOwnedLine != 0)
    {
        // Not the first block: the reader of the preceding block takes care of the partial
        // line that begins at this block, so skip ahead to the start of the next line.
        firstOwnedLine = await SeekStartOfLine(firstOwnedLine, blockEnd);
        if (firstOwnedLine > blockEnd)
        {
            // We looked as far as the first character of the next block without finding the
            // start of a new line, so the preceding reader handles our entire block. With
            // bytesToRead set to 0 the very first read reports eof.
            bytesToRead = 0;
            return 0;
        }
    }

    // Find the first line that the next block's reader will take care of, scanning all the
    // way to the end of the stream if need be.
    long nextReadersFirstLine = await SeekStartOfLine(blockEnd, Int64.MaxValue);
    if (nextReadersFirstLine < firstOwnedLine)
    {
        throw new ApplicationException("Line boundary logic bug");
    }

    bytesToRead = nextReadersFirstLine - firstOwnedLine;
    return firstOwnedLine;
}
/// <summary>
/// Scans forward from <paramref name="currentOffset"/> for the start of the next line,
/// recognising "\n", "\r\n" and a lone "\r" as line terminators.
/// </summary>
/// <param name="currentOffset">Stream offset to start scanning from.</param>
/// <param name="startOfNextBlock">
/// Offset of the next block; scanning stops after its first character.
/// <c>Int64.MaxValue</c> means scan to the end of the stream.
/// </param>
/// <returns>
/// The offset of the first character of the next line, or the offset at which scanning
/// stopped (end of stream or scan limit) when no line start was found.
/// </returns>
private async Task<long> SeekStartOfLine(long currentOffset, long startOfNextBlock)
{
    const byte LineFeed = (byte)'\n';
    const byte CarriageReturn = (byte)'\r';

    // we may need to look as far as the first character in the next block
    long scanLimit = (startOfNextBlock == Int64.MaxValue) ? Int64.MaxValue : startOfNextBlock + 1;

    byte[] scratch = new byte[4 * 1024];
    readStream.Seek(currentOffset, SeekOrigin.Begin);

    long position = currentOffset;
    bool pendingReturn = false;

    while (position < scanLimit)
    {
        int wanted = (int)Math.Min(scratch.LongLength, scanLimit - position);
        int got = await AzureUtils.WrapInRetry(new PeloponneseLogger(Log.Logger), async () =>
        {
            return await readStream.ReadAsync(scratch, 0, wanted);
        });

        if (got == 0)
        {
            // we hit end of stream
            return position;
        }

        for (int index = 0; index < got; ++index)
        {
            byte current = scratch[index];

            if (current == LineFeed)
            {
                // the character after the '\n' is the first character of a line
                return position + index + 1;
            }

            if (pendingReturn)
            {
                // a '\r' was not followed by '\n', so this character starts a line
                return position + index;
            }

            if (current == CarriageReturn)
            {
                // just remember it: if '\n' comes next the line starts after that,
                // otherwise the line starts with the very next character
                pendingReturn = true;
            }
        }

        position += got;
    }

    // the next line starts at some character >= startOfNextBlock+1, so it's not our problem
    return position;
}
/// <summary>
/// Factory callback: builds an <c>AzureReader</c> for the given URI.
/// </summary>
/// <param name="uri">Source URI to read from.</param>
/// <param name="numberOfReaders">Passed through to the reader's constructor.</param>
/// <param name="client">Reader client passed through to the reader's constructor.</param>
/// <param name="logger">Logger passed through to the reader's constructor.</param>
/// <returns>The newly created reader.</returns>
private static IManagedReader OpenAzureReader(Uri uri, int numberOfReaders, IReaderClient client, IDrLogging logger)
{
    IManagedReader reader = new AzureReader(uri, numberOfReaders, client, logger);
    return reader;
}

/// <summary>
/// Registers this reader with the factory under the "azureblob" key.
/// </summary>
public static void Register()
{
    Factory.RegisterReader("azureblob", OpenAzureReader);
}
}
}