890 lines
37 KiB
C#
890 lines
37 KiB
C#
/*
|
||
Copyright (c) Microsoft Corporation
|
||
|
||
All rights reserved.
|
||
|
||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
|
||
compliance with the License. You may obtain a copy of the License
|
||
at http://www.apache.org/licenses/LICENSE-2.0
|
||
|
||
|
||
THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER
|
||
EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED WARRANTIES OR CONDITIONS OF
|
||
TITLE, FITNESS FOR A PARTICULAR PURPOSE, MERCHANTABLITY OR NON-INFRINGEMENT.
|
||
|
||
|
||
See the Apache Version 2.0 License for specific language governing permissions and
|
||
limitations under the License.
|
||
|
||
*/
|
||
|
||
//
|
||
// © Microsoft Corporation. All rights reserved.
|
||
//
|
||
using System;
|
||
using System.Collections;
|
||
using System.Collections.Generic;
|
||
using System.IO;
|
||
using System.Text;
|
||
using System.Linq;
|
||
using System.Diagnostics;
|
||
using System.Linq.Expressions;
|
||
using System.Reflection;
|
||
using Microsoft.Research.Dryad.Hdfs;
|
||
using Microsoft.Research.DryadLinq.Internal;
|
||
using System.Globalization;
|
||
|
||
namespace Microsoft.Research.DryadLinq
|
||
{
|
||
// Common base class for every DryadLinq query provider.
// Any IQueryable handled by DryadLinq should satisfy
//     ((queryable.Provider is DryadLinqProviderBase) == true)
//
// For example:
//   - all IQueryable extension methods ask for (queryable.Provider) and then
//     call provider.CreateQuery(expr)
internal abstract class DryadLinqProviderBase : IQueryProvider
{
    // The context supplied at construction time; it is never changed afterwards.
    internal HpcLinqContext Context { get; private set; }

    // Stores the context that the provider (and the queries it creates) run against.
    internal DryadLinqProviderBase(HpcLinqContext context)
    {
        this.Context = context;
    }

    // The four IQueryProvider entry points; each concrete provider decides how
    // (and whether) typed and untyped calls are supported.
    public abstract IQueryable<TElement> CreateQuery<TElement>(Expression expression);
    public abstract IQueryable CreateQuery(Expression expression);
    public abstract TResult Execute<TResult>(Expression expression);
    public abstract object Execute(Expression expression);
}
|
||
|
||
// The provider for DryadLinq queries that are executed by the LocalDebug infrastructure.
// Typed calls are forwarded to the wrapped LINQ-to-objects provider; untyped calls always throw.
internal sealed class DryadLinqLocalProvider : DryadLinqProviderBase
{
    // The LINQ-to-objects provider that actually builds and executes the queries.
    private IQueryProvider m_linqToObjectsProvider;

    public DryadLinqLocalProvider(IQueryProvider linqToObjectsProvider, HpcLinqContext context)
        : base(context)
    {
        m_linqToObjectsProvider = linqToObjectsProvider;
    }

    // Untyped call: never returns. Throws DryadLinqException with
    // ExpressionMustBeMethodCall when the expression is not a method call, and with
    // UntypedProviderMethodsNotSupported (naming the called method) otherwise.
    public override IQueryable CreateQuery(Expression expression)
    {
        MethodCallExpression call = expression as MethodCallExpression;
        if (call != null)
        {
            throw new DryadLinqException(HpcLinqErrorCode.UntypedProviderMethodsNotSupported,
                                         String.Format(SR.UntypedProviderMethodsNotSupported, call.Method.Name));
        }

        throw new DryadLinqException(HpcLinqErrorCode.ExpressionMustBeMethodCall, SR.ExpressionMustBeMethodCall);
    }

    // Untyped call: delegates to the untyped CreateQuery, which always throws.
    public override object Execute(Expression expression)
    {
        return this.CreateQuery(expression);
    }

    // Builds the query with the LINQ-to-objects provider and wraps it so that
    // the resulting IQueryable still reports this provider.
    public override IQueryable<TElement> CreateQuery<TElement>(Expression expression)
    {
        this.ThrowIfUnsupported(expression);
        IQueryable<TElement> inner = m_linqToObjectsProvider.CreateQuery<TElement>(expression);
        return new DryadLinqLocalQuery<TElement>(this, inner);
    }

    // Executes a single-value query with the LINQ-to-objects provider.
    public override TResult Execute<TResult>(Expression expression)
    {
        this.ThrowIfUnsupported(expression);
        return m_linqToObjectsProvider.Execute<TResult>(expression);
    }

    // Hook for rejecting operators that LocalDebug cannot run.
    // Currently a no-op: the only check that existed (throwing NotSupportedException
    // for "SequenceEqual") is disabled.
    internal void ThrowIfUnsupported(Expression expression)
    {
    }
}
|
||
|
||
// The IQueryable<T> used for LocalDebug queries.
// Much simpler than DryadLinqQuery<T>: it only wraps a LINQ-to-objects query and
// substitutes the provider that is reported to callers.
internal sealed class DryadLinqLocalQuery<T> : IOrderedQueryable<T>, IEnumerable<T>, IOrderedQueryable
{
    // Provider reported through IQueryable.Provider.
    private readonly IQueryProvider m_queryProvider;

    // The wrapped query that supplies the expression and the data.
    private readonly IQueryable<T> m_localQuery;

    public DryadLinqLocalQuery(IQueryProvider queryProvider, IQueryable<T> localQuery)
    {
        m_queryProvider = queryProvider;
        m_localQuery = localQuery;
    }

    // The expression tree of the wrapped query.
    public Expression Expression
    {
        get
        {
            return m_localQuery.Expression;
        }
    }

    Type IQueryable.ElementType
    {
        get
        {
            return typeof(T);
        }
    }

    // Reports the provider given at construction, not the wrapped query's provider.
    IQueryProvider IQueryable.Provider
    {
        get
        {
            return m_queryProvider;
        }
    }

    // Enumerates the wrapped query.
    public IEnumerator<T> GetEnumerator()
    {
        return m_localQuery.GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return m_localQuery.GetEnumerator();
    }
}
|
||
|
||
|
||
// The provider for DryadLinq queries that are executed by the cluster infrastructure.
internal sealed class DryadLinqProvider : DryadLinqProviderBase
{
    internal DryadLinqProvider(HpcLinqContext context)
        : base(context)
    {
    }

    // Untyped call: always throws.
    // Exercised by the unit test "Bug11782_LowLevelQueryableManipulation", which can
    // now expect to receive an exception.
    public override IQueryable CreateQuery(Expression expression)
    {
        MethodCallExpression call = expression as MethodCallExpression;
        if (call != null)
        {
            throw new DryadLinqException(HpcLinqErrorCode.UntypedProviderMethodsNotSupported,
                                         String.Format(SR.UntypedProviderMethodsNotSupported, call.Method.Name));
        }

        throw new DryadLinqException(HpcLinqErrorCode.ExpressionMustBeMethodCall,
                                     SR.ExpressionMustBeMethodCall);
    }

    // Wraps the expression in a new DryadLinqQuery<TElement> bound to this provider.
    public override IQueryable<TElement> CreateQuery<TElement>(Expression expression)
    {
        return new DryadLinqQuery<TElement>(this, expression);
    }

    // Untyped form of IQueryProvider.Execute(): always throws, by delegating to the
    // untyped CreateQuery.
    public override object Execute(Expression expression)
    {
        return this.CreateQuery(expression);
    }

    // IQueryProvider.Execute() for queries that produce a single value rather than an enumerable.
    // The expression must be a method call, otherwise ArgumentException is thrown.
    // A DryadLinqQuery<> is created for the expression via reflection and its single
    // element is the result.
    public override TResult Execute<TResult>(Expression expression)
    {
        MethodCallExpression call = expression as MethodCallExpression;
        if (call == null)
        {
            string message = String.Format(SR.ExpressionMustBeMethodCall,
                                           HpcLinqExpression.Summarize(expression));
            throw new ArgumentException(message, "expression");
        }

        switch (call.Method.Name)
        {
            case "FirstOrDefault":
            case "SingleOrDefault":
            case "LastOrDefault":
            {
                // The *OrDefault operators are read back as an AggregateValue<> wrapper;
                // a Count of zero selects the default value.
                Type wrappedType = typeof(AggregateValue<>).MakeGenericType(expression.Type);
                object query = this.InstantiateQuery(wrappedType, expression);
                AggregateValue<TResult> aggregate = ((IEnumerable<AggregateValue<TResult>>)query).Single();
                if (aggregate.Count == 0)
                {
                    return default(TResult);
                }
                return aggregate.Value;
            }
            default:
            {
                object query = this.InstantiateQuery(expression.Type, expression);
                return ((IEnumerable<TResult>)query).Single();
            }
        }
    }

    // Creates DryadLinqQuery<elementType>(this, expression) through its non-public constructor.
    private object InstantiateQuery(Type elementType, Expression expression)
    {
        Type queryType = typeof(DryadLinqQuery<>).MakeGenericType(elementType);
        return Activator.CreateInstance(queryType,
                                        BindingFlags.NonPublic | BindingFlags.Instance,
                                        null,
                                        new object[] { this, expression },
                                        CultureInfo.CurrentCulture);
    }
}
|
||
|
||
|
||
// Non-generic base of DryadLinqQuery<T>.
// Note: cannot be sub-classed by users, as they cannot provide overrides for the
// internal abstract properties.
internal abstract class DryadLinqQuery
{
    protected DryadLinqProviderBase m_queryProvider;
    private DataProvider m_dataProvider;
    private bool m_isTemp;
    private JobExecutor m_queryExecutor;

    // Records the providers; the query starts without an executor.
    internal DryadLinqQuery(DryadLinqProviderBase queryProvider, DataProvider dataProvider)
    {
        m_queryExecutor = null;
        m_dataProvider = dataProvider;
        m_queryProvider = queryProvider;
    }

    // If set to non-null, this provides a data-backed DLQ that should be used in place of (this).
    // Query execution causes the backing data to be set for the DLQ nodes that were
    // specifically "executed". (Used to be called _table/Table.)
    internal abstract DryadLinqQuery BackingDataDLQ { set; }
    internal abstract bool IsDataBacked { get; }

    internal abstract Type Type { get; }
    internal abstract string DataSourceUri { get; }
    internal abstract bool IsDynamic { get; }
    internal abstract int PartitionCount { get; }
    internal abstract DataSetInfo DataSetInfo { get; }

    internal DryadLinqProviderBase QueryProvider
    {
        get
        {
            return m_queryProvider;
        }
    }

    internal DataProvider DataProvider
    {
        get
        {
            return m_dataProvider;
        }
    }

    // Write-only flag; within this class it is only stored and copied by CopyToInternal.
    internal bool IsTemp
    {
        set
        {
            m_isTemp = value;
        }
    }

    // The executor associated with this query, or null when there is none.
    internal JobExecutor QueryExecutor
    {
        get
        {
            return m_queryExecutor;
        }
        set
        {
            m_queryExecutor = value;
        }
    }

    // Copies every field owned by this base class onto otherQuery.
    protected void CopyToInternal(DryadLinqQuery otherQuery)
    {
        otherQuery.m_queryExecutor = m_queryExecutor;
        otherQuery.m_isTemp = m_isTemp;
        otherQuery.m_dataProvider = m_dataProvider;
        otherQuery.m_queryProvider = m_queryProvider;
    }

    // Returns a new default VertexCodeGen; virtual so subclasses can substitute their own.
    internal virtual VertexCodeGen GetVertexCodeGen()
    {
        VertexCodeGen codeGen = new VertexCodeGen();
        return codeGen;
    }
}
|
||
|
||
// The IQueryable<T> that is used for cluster-execution queries.
|
||
internal class DryadLinqQuery<T> : DryadLinqQuery, IOrderedQueryable<T>, IEnumerable<T>, IOrderedQueryable
|
||
{
|
||
// If _backingDataDLQ is set, this is a normal query node that was executed and now has a
|
||
// "PlainData" DLQ available with the results.
|
||
private DryadLinqQuery<T> m_backingDataDLQ;
|
||
private Expression m_queryExpression;
|
||
private string m_dataSourceUri;
|
||
private DataSetInfo m_dataSetInfo;
|
||
private bool m_isDynamic;
|
||
private DryadLinqQueryEnumerable<T> m_tableEnumerable;
|
||
|
||
// Constructor for a query node described only by an expression (no data provider,
// no data source URI). Reached in two ways:
//   1. through IQueryProvider, e.g. IQueryable<>.Select()
//   2. by lazy queries, which come in via
//        DryadLinqIQueryable.ToPartitionedTableLazy()
//          -> DryadLinqProvider.CreateQuery()
internal DryadLinqQuery(DryadLinqProviderBase provider, Expression expression)
    : base(provider, null)
{
    m_tableEnumerable = null;
    m_isDynamic = false;
    m_queryExpression = expression;
}
|
||
|
||
// Constructor for a query that points directly at stored data.
// [ML]: combined from the MSR-DL ctors for PartitionedTable<> and DryadLinqQuery<>.
// Used by DryadLinqQuery.Get(uri).
// Throws DryadLinqException (UnrecognizedDataSource) unless dataUri is a DSC or HDFS URI.
internal DryadLinqQuery(Expression queryExpression,
                        DryadLinqProvider queryProvider,
                        DataProvider dataProvider,
                        string dataUri)
    : base(queryProvider, dataProvider)
{
    bool isRecognizedSource = DataPath.IsDsc(dataUri) || DataPath.IsHdfs(dataUri);
    if (!isRecognizedSource)
    {
        string message = String.Format(SR.UnrecognizedDataSource, dataUri);
        throw new DryadLinqException(HpcLinqErrorCode.UnrecognizedDataSource, message);
    }

    m_tableEnumerable = null;
    m_isDynamic = false;
    m_queryExpression = queryExpression;
    m_dataSourceUri = DataPath.GetDataPath(dataUri);
}
|
||
|
||
// Copies the complete state of this query (base-class fields first, then the
// fields declared on DryadLinqQuery<T>) onto otherQuery. References are shared, not cloned.
internal void CopyTo(DryadLinqQuery<T> otherQuery)
{
    CopyToInternal(otherQuery);

    otherQuery.m_queryExpression = m_queryExpression;
    otherQuery.m_backingDataDLQ = m_backingDataDLQ;
    otherQuery.m_tableEnumerable = m_tableEnumerable;
    otherQuery.m_dataSourceUri = m_dataSourceUri;
    otherQuery.m_dataSetInfo = m_dataSetInfo;
    otherQuery.m_isDynamic = m_isDynamic;
}
|
||
|
||
// True for a DLQ that points directly at plain data, i.e. one that has a data source URI.
// Note: a plain-data DLQ might also have an executor associated with it; the data won't
// be available unless the executor completes successfully.
internal bool IsPlainData
{
    get
    {
        return m_dataSourceUri != null;
    }
}
|
||
|
||
// True for a DLQ that does not itself point directly at plain data (e.g. a query
// operator), i.e. one that has no data source URI. Always the opposite of IsPlainData.
internal bool IsNormalQuery
{
    get
    {
        return m_dataSourceUri == null;
    }
}
|
||
|
||
// Sets the DLQ that holds the results of executing this query.
// The value is cast to DryadLinqQuery<T>, so a non-null value of any other type
// causes an InvalidCastException.
internal override DryadLinqQuery BackingDataDLQ
{
    set
    {
        DryadLinqQuery<T> typedBacking = (DryadLinqQuery<T>)value;
        m_backingDataDLQ = typedBacking;
    }
}
|
||
|
||
// True for a normal query that was executed and now has a backing-data DLQ available.
internal override bool IsDataBacked
{
    get
    {
        return m_backingDataDLQ != null;
    }
}
|
||
|
||
// True if an executor is associated with this DLQ.
// Throws DryadLinqException when an executor is attached to a DLQ that is not plain data.
internal bool HasExecutor
{
    get
    {
        if (this.QueryExecutor == null)
        {
            return false;
        }

        if (!this.IsPlainData)
        {
            throw new DryadLinqException("An executor should only be associated with a DLQ that is plain data");
        }

        return true;
    }
}
|
||
|
||
// The context of the provider this query is currently bound to.
public HpcLinqContext Context
{
    get
    {
        DryadLinqProviderBase provider = m_queryProvider;
        return provider.Context;
    }
}
|
||
|
||
#region IQueryable members
|
||
// The element type of the sequence this query produces.
Type IQueryable.ElementType
{
    get
    {
        Type elementType = typeof(T);
        return elementType;
    }
}
|
||
|
||
//@@Comment-required: (bit unclear what the intended behavior is for localDebug)
//ML: combined from PT<T> and DLQ<T>.
// Runs CheckAndInitialize() before returning the provider, because Initialize()
// may replace m_queryProvider for plain data.
IQueryProvider IQueryable.Provider
{
    get
    {
        CheckAndInitialize();
        return m_queryProvider;
    }
}
|
||
#endregion
|
||
|
||
// Executes this query, writing the output to the named DSC or HDFS URI.
// The query should _not_ be terminated with ToDsc().
// Throws ArgumentException when targetUri is neither a DSC nor an HDFS URI.
// On the normal path the first result table becomes the backing data of this query
// and (this) is returned.
internal DryadLinqQuery<T> ToTemporaryTable(HpcLinqContext context, string targetUri)
{
    bool isRecognizedTarget = DataPath.IsDsc(targetUri) || DataPath.IsHdfs(targetUri);
    if (!isRecognizedTarget)
    {
        throw new ArgumentException(String.Format(SR.UnrecognizedDataSource, targetUri));
    }

#if REMOVE_FOR_YARN
    if (IsPlainData) // was if (this.m_queryExpression is ConstantExpression)
    {
        //@@TODO: I think this is dead code. See if it can be exercised.

        // The input is a plain-data DLQ and the output target has been set.
        // Both are expected to be DSC, so the DSC API is used to perform a copy
        // rather than invoking dryad.
        string inputUri = DataSourceUri;

        Debug.Assert(DataPath.IsDsc(inputUri) && DataPath.IsDsc(targetUri), "both uris should be to Dsc");

        using (DscInstance inputService = new DscInstance(new Uri(inputUri)))
        using (DscInstance outputService = new DscInstance(new Uri(targetUri)))
        {
            DscStream inputStream = inputService.GetStream(new Uri(inputUri));
            try
            {
                // Best effort: remove any existing stream at the target.
                DscStream outputStream = outputService.GetStream(new Uri(targetUri));
                outputStream.Delete();
            }
            catch (DscException)
            {
            }
            inputStream.Copy(new Uri(targetUri));

            // [ML] part of deleting this.m_table: the new table is returned directly.
            DryadLinqQuery<T> databackedDLQ = DataProvider.GetPartitionedTable<T>(Context, targetUri);
            this.m_backingDataDLQ = databackedDLQ; //ML: we set the new table as backing data for the source. (not sure what this gains)
            return databackedDLQ;
        }
    }
    else if (IsDataBacked)
    {
        // @@TODO: I think this is dead code. See if it can be exercised.
        // If taken, we should be able to just recurse with the backing data
        // (after doing _backingDataDLQ.CheckAndInitialize()).
        throw new NotImplementedException();
    }
#endif

    // Invoke Dryad
    Debug.Assert(IsNormalQuery, "execution should only occur for a normal query");

    HpcLinqQueryGen queryGen = new HpcLinqQueryGen(context, this.GetVertexCodeGen(), this.Expression, targetUri, true);
    DryadLinqQuery[] resultTables = queryGen.InvokeDryad();
    m_backingDataDLQ = (DryadLinqQuery<T>)resultTables[0];

    return this;
}
|
||
|
||
// Generates the query plan as an XML file and returns the path of that file.
// Provided for test support; accessed via reflection.
// A placeholder DSC URI is passed as the output table.
internal string ToDryadLinqProgram()
{
    string placeholderUri = DataPath.DSC_URI_PREFIX + @"dummy/dummy";
    HpcLinqQueryGen queryGen = new HpcLinqQueryGen(Context,
                                                   this.GetVertexCodeGen(),
                                                   m_queryExpression,
                                                   placeholderUri,
                                                   true);
    string planPath = queryGen.GenerateDryadProgram();
    return planPath;
}
|
||
|
||
// Deliberately returns only the base (object) representation.
// ML: a complex ToString is problematic for the debugger -- e.g. Expression.ToString()
//     leads to infinite recursion for plain data, which has
//     Expression=ConstantExpression(this), and ConstantExpression(x).ToString() ==> x.ToString().
//     Also, a rich ToString risks leaking internal details.
// @@TODO[P2]: this override should not be necessary; however the debugger was acting up
//     without it, e.g. timing out when inspecting DryadLinqQuery objects in the watch window.
public override string ToString()
{
    string baseText = base.ToString();
    return baseText;
}
|
||
|
||
// Initializes a plain-data DLQ: reads the size and partition count of the data source,
// loads stored metadata (DSC only), builds the DataSetInfo and the table enumerable,
// and sets the query expression and provider for the current mode.
// Does nothing for DLQs that are not plain data, or when already initialized.
// Throws DryadLinqException (FailedToGetStreamProps) when the stream properties cannot be read.
internal void Initialize()
{
    // Detailed initialization is only for plain-old-data.
    // This was previously implicit (as only defined on the PartitionedData<> type).
    if (!this.IsPlainData)
    {
        return;
    }

    // Short-circuit if already initialized: the enumerable is only created below.
    if (this.m_tableEnumerable != null)
    {
        return;
    }

    Int32 partitionCount = 0;
    Int64 estimatedSize = -1;
    this.m_isDynamic = false;

    try
    {
        // YY: TODO: This could just be set to -1 if the xmlexechost will create the correct number of partitions
        // YY: We need the partition count here: it is used in plan optimization.
        if (DataPath.IsHdfs(this.m_dataSourceUri))
        {
            // hdfs: an earlier HdfsInstance/GetFileInfo based implementation was
            // replaced by this WebHdfs call.
            WebHdfsClient.GetContentSummary(this.m_dataSourceUri, ref estimatedSize, ref partitionCount);
        }
        else
        {
            // dsc
            using (DscInstance dataService = new DscInstance(new Uri(this.m_dataSourceUri)))
            {
                DscStream dataStream = dataService.GetStream(new Uri(this.m_dataSourceUri));
                estimatedSize = (long)dataStream.Length;
                partitionCount = (Int32)dataStream.PartitionCount;
            }
        }
    }
    catch (Exception e)
    {
        throw new DryadLinqException(HpcLinqErrorCode.FailedToGetStreamProps,
                                     String.Format(SR.FailedToGetStreamProps, this.m_dataSourceUri), e);
    }

    // --- start metadata processing --- //
    // Load any stored metadata. We converted to a uri earlier, so go back to the stream name.
    string streamName = DataPath.GetFilesetNameFromUri(this.m_dataSourceUri);
    if (DataPath.IsDsc(this.m_dataSourceUri))
    {
        // The result is currently not inspected. Two checks against it are disabled:
        //   - record type vs. typeof(T) (disabled until the final API is determined)
        //   - serialization flags (AllowNullFields / AllowNullArrayElements / AllowNullRecords),
        //     disabled as the flags are fixed; re-consider when they become user-settable again.
        DryadLinqMetaData.FromDscStream(Context, streamName);
    }

    // Initialize the DataInfo -- currently we always initialize to the "nothing" datainfo.
    PartitionInfo partitionInfo = new RandomPartition(partitionCount);
    OrderByInfo orderByInfo = DataSetInfo.NoOrderBy;
    DistinctInfo distinctInfo = DataSetInfo.NoDistinct;
    this.m_dataSetInfo = new DataSetInfo(partitionInfo, orderByInfo, distinctInfo);
    // --- end metadata processing --- //

    string fileSetName = DataPath.GetFilesetNameFromUri(this.m_dataSourceUri);
    this.m_tableEnumerable = new DryadLinqQueryEnumerable<T>(this.Context, fileSetName);

    // YY: query expression and provider are at least set consistently
    if (!Context.Configuration.LocalDebug)
    {
        this.m_queryExpression = Expression.Constant(this);
        this.m_queryProvider = new DryadLinqProvider(Context);
        return;
    }

    this.m_queryExpression = Expression.Constant(this.m_tableEnumerable.AsQueryable());
    // The provider obtained here should be an instance of "EnumerableQuery<T>".
    IQueryProvider linqToObjectProvider = this.m_tableEnumerable.AsQueryable().Provider;
    this.m_queryProvider = new DryadLinqLocalProvider(linqToObjectProvider, Context);
}
|
||
|
||
// The record type of this query, i.e. typeof(T).
internal override Type Type
{
    get
    {
        Type recordType = typeof(T);
        return recordType;
    }
}
|
||
|
||
// The URI of the data behind this DLQ. Only legal/valid for plain-data and data-backed DLQs;
// any other DLQ throws DryadLinqException (OnlyAvailableForPhysicalData).
//
// CheckAndInitialize() is deliberately not called here: m_dataSourceUri should already
// be set, and CheckAndInitialize() itself accesses DataSourceUri, which would cause
// infinite recursion.
internal override string DataSourceUri
{
    get
    {
        if (this.IsPlainData)
        {
            return m_dataSourceUri;
        }

        if (this.IsDataBacked)
        {
            DryadLinqQuery<T> backing = m_backingDataDLQ;
            return backing.m_dataSourceUri;
        }

        throw new DryadLinqException(HpcLinqErrorCode.OnlyAvailableForPhysicalData,
                                     SR.OnlyAvailableForPhysicalData);
    }
}
|
||
|
||
// Combination of old approaches: returns either the full expression, or just an
// expression for the table, if available.
//   * Fundamental part of the IQueryable system.
//     Most IQueryable operators access (source.Expression) and form a new IQueryable
//     which is a MethodCall('method',{src.Expression,params}).
//
// Plain data:        we create an expression to represent the plain data.
// Data-backed query: we behave as if the IQueryable were just the backing data
//                    (i.e. a simple expression to plain data).
// Normal query:      a normal query node will already have a m_queryExpression.
public Expression Expression
{
    get
    {
        if (!this.IsDataBacked)
        {
            CheckAndInitialize();
            return m_queryExpression;
        }

        // Data-backed: (recursively) return the expression of the backing DLQ.
        Debug.Assert(this.m_backingDataDLQ.IsPlainData, "backing data is expected to always be plain data");
        return m_backingDataDLQ.Expression;
    }
}
|
||
|
||
// The partition count of the data behind this DLQ.
// Plain data: taken from the DataSetInfo built during initialization.
// Data-backed: taken from the backing DLQ.
// Anything else throws DryadLinqException (OnlyAvailableForPhysicalData).
internal override int PartitionCount
{
    get
    {
        if (IsPlainData)
        {
            CheckAndInitialize();
            return m_dataSetInfo.partitionInfo.Count;
        }

        if (!IsDataBacked)
        {
            throw new DryadLinqException(HpcLinqErrorCode.OnlyAvailableForPhysicalData,
                                         SR.OnlyAvailableForPhysicalData);
        }

        DryadLinqQuery<T> backing = m_backingDataDLQ;
        backing.CheckAndInitialize();
        return backing.PartitionCount;
    }
}
|
||
|
||
|
||
|
||
// The dynamic flag of this DLQ, read after CheckAndInitialize().
// Possible issue: if (IsDataBacked), using the value from the backing data may be
// more appropriate.
internal override bool IsDynamic
{
    get
    {
        CheckAndInitialize();
        return m_isDynamic;
    }
}
|
||
|
||
// The DataSetInfo of this DLQ, read after CheckAndInitialize().
// Even if data-backed, the DataSetInfo for the normal query is the best available,
// hence this.m_dataSetInfo is always used.
internal override DataSetInfo DataSetInfo
{
    get
    {
        CheckAndInitialize();
        return m_dataSetInfo;
    }
}
|
||
|
||
// If an executor is attached, waits for it to complete and throws DryadLinqException
// when the job failed (JobToCreateTableFailed) or was cancelled (JobToCreateTableWasCanceled);
// a successful job is logged. Afterwards runs Initialize().
internal void CheckAndInitialize()
{
    if (HasExecutor)
    {
        Debug.Assert(IsPlainData, "We expect a DLQ with an executor to be a plain-data DLQ");

        JobStatus status = this.QueryExecutor.WaitForCompletion();
        if (status == JobStatus.Failure)
        {
            string message = String.Format(SR.JobToCreateTableFailed, this.QueryExecutor.ErrorMsg);
            throw new DryadLinqException(HpcLinqErrorCode.JobToCreateTableFailed, message);
        }
        else if (status == JobStatus.Cancelled)
        {
            throw new DryadLinqException(HpcLinqErrorCode.JobToCreateTableWasCanceled,
                                         SR.JobToCreateTableWasCanceled);
        }
        else if (status == JobStatus.Success)
        {
            HpcClientSideLog.Add("Table " + this.DataSourceUri + " was created successfully.");
        }
    }

    Initialize();
}
|
||
|
||
|
||
|
||
#region IEnumerable and IEnumerable<T> members
|
||
// Non-generic enumeration: forwards to the generic GetEnumerator().
IEnumerator IEnumerable.GetEnumerator()
{
    IEnumerator<T> typedEnumerator = this.GetEnumerator();
    return typedEnumerator;
}
|
||
|
||
// Combined GetEnumerator from PT and DLQ: use the table if present, else start a query
// to generate an anonymous output table.
//
// Process:
//   1. plain data: return an enumerator over the data.
//   2. data-backed query: return an enumerator over the backing data.
//   3. otherwise: execute the query into a temporary stream (which makes this a
//      data-backed query) and call GetEnumerator() again to hit path 2.
public IEnumerator<T> GetEnumerator()
{
    if (this.IsPlainData)
    {
        this.CheckAndInitialize();
        return this.m_tableEnumerable.GetEnumerator();
    }

    if (this.IsDataBacked)
    {
        DryadLinqQuery<T> backing = this.m_backingDataDLQ;
        backing.CheckAndInitialize();
        return backing.m_tableEnumerable.GetEnumerator();
    }

    Debug.Assert(IsNormalQuery);

    // If terminated in ToDsc, e.g. query.ToDsc("path").GetEnumerator():
    //   currently treated as an error. We throw in both cluster and LocalDebug modes.
    //   @@TODO[P2]: we could execute the query, producing data as specified by ToDsc().
    //   (see DryadQueryGen ctors)
    //
    // Otherwise, we create a temporary stream to hold the data and get an enumerator.
    //   - cluster mode will run a dryad query.
    //   - LocalDebug mode will write the data directly into DSC.
    string hdfsPath = DataPath.MakeUniqueTemporaryHdfsFileSetUri(Context);
    DryadLinqQuery<T> executed = this.ToTemporaryTable(Context, hdfsPath);
    return executed.GetEnumerator();
}
|
||
#endregion
|
||
}
|
||
|
||
|
||
// From PartitionedTableEnumerable.
// Enumerable over the records of a named DSC file set. Each call to GetEnumerator()
// looks up the file set's read paths and compression scheme and opens a new reader.
internal class DryadLinqQueryEnumerable<T> : IEnumerable<T>, IEnumerable
{
    internal string m_fileSetName;
    private HpcLinqContext m_context;

    public DryadLinqQueryEnumerable(HpcLinqContext context, string fileSetName)
    {
        m_context = context;
        m_fileSetName = fileSetName;
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        return this.GetEnumerator();
    }

    IEnumerator<T> IEnumerable<T>.GetEnumerator()
    {
        return this.GetEnumerator();
    }

    // Resolves the file set and returns an enumerator over its records.
    // Any failure while looking up the read paths or the metadata is wrapped in a
    // DryadLinqException (FailedToGetReadPathsForStream).
    public IEnumerator<T> GetEnumerator()
    {
        List<string[]> filePathList; // a list of dsc files, each of which is represented by an array holding the replica paths
        DscCompressionScheme compressionScheme;
        try
        {
            DscFileSet fileSet = m_context.DscService.GetFileSet(m_fileSetName);
            filePathList = fileSet.GetFiles().Select(file => file.ReadPaths).ToList();

            // NOTE(review): the metadata is dereferenced without a null check; if
            // FromDscStream can return null, that surfaces as the wrapped exception below.
            DryadLinqMetaData metaData = DryadLinqMetaData.FromDscStream(m_context, m_fileSetName);
            compressionScheme = metaData.CompressionScheme;
        }
        catch (Exception e)
        {
            throw new DryadLinqException(HpcLinqErrorCode.FailedToGetReadPathsForStream,
                                         String.Format(SR.FailedToGetReadPathsForStream, this.m_fileSetName), e);
        }

        return new TableEnumerator(m_context, filePathList, m_fileSetName, compressionScheme);
    }

    // Internal enumerator class: reads records one at a time from a MultiBlockStream
    // over the file set's files.
    private class TableEnumerator : IEnumerator<T>
    {
        private HpcLinqContext m_context;
        private T m_current;
        private List<string[]> m_filePathList;     // a list of dsc files, each of which is represented by an array holding the replica paths
        private string m_associatedDscStreamName;  // stored here only to provide a better exception message in case of IO errors
        private DscCompressionScheme m_compressionScheme;
        private HpcLinqFactory<T> m_factory;
        private HpcRecordReader<T> m_reader;
        private bool m_readerClosed;               // true once Close() has been called on the current m_reader

        internal TableEnumerator(HpcLinqContext context,
                                 List<string[]> filePathList,
                                 string associatedDscStreamName,
                                 DscCompressionScheme scheme)
        {
            this.m_context = context;
            this.m_current = default(T);
            this.m_filePathList = filePathList;
            this.m_associatedDscStreamName = associatedDscStreamName;
            this.m_compressionScheme = scheme;
            this.m_factory = (HpcLinqFactory<T>)HpcLinqCodeGen.GetFactory(context, typeof(T));
            this.OpenReader();
        }

        // Opens a fresh stream over the file list and makes it the current reader.
        // The reader's worker is started when concurrent user delegates are allowed.
        private void OpenReader()
        {
            bool appendNewLinesToFiles = (typeof(T) == typeof(LineRecord));
            NativeBlockStream nativeStream = new MultiBlockStream(this.m_filePathList, this.m_associatedDscStreamName,
                                                                  FileAccess.Read, this.m_compressionScheme,
                                                                  appendNewLinesToFiles);
            this.m_reader = this.m_factory.MakeReader(nativeStream);
            this.m_readerClosed = false;

            if (this.m_context.Configuration.AllowConcurrentUserDelegatesInSingleProcess)
            {
                this.m_reader.StartWorker();
            }
        }

        // Closes the current reader once; does nothing if there is no reader or it was
        // already closed.
        private void CloseReader()
        {
            if (this.m_reader != null && !this.m_readerClosed)
            {
                this.m_reader.Close();
                this.m_readerClosed = true;
            }
        }

        // Reads the next record into Current; returns false when no record was read.
        public bool MoveNext()
        {
            if (m_context.Configuration.AllowConcurrentUserDelegatesInSingleProcess)
            {
                return this.m_reader.ReadRecordAsync(ref this.m_current);
            }
            else
            {
                return this.m_reader.ReadRecordSync(ref this.m_current);
            }
        }

        object IEnumerator.Current
        {
            get { return this.m_current; }
        }

        public T Current
        {
            get { return this.m_current; }
        }

        // Restarts the enumeration from the beginning of the file set.
        // The previous reader is closed first; it used to be dropped while still open,
        // which leaked its stream (and worker, if one was started).
        public void Reset()
        {
            this.CloseReader();
            this.m_current = default(T);
            this.OpenReader();
        }

        // Closes the current reader. Safe to call more than once.
        void IDisposable.Dispose()
        {
            this.CloseReader();
        }
    }
}
|
||
}
|