fix memory bug, minor other changes
This commit is contained in:
parent
5ad103cba7
commit
86242db082
|
|
@ -77,7 +77,7 @@
|
||||||
<SpecificVersion>False</SpecificVersion>
|
<SpecificVersion>False</SpecificVersion>
|
||||||
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.HadoopBridge.dll</HintPath>
|
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.HadoopBridge.dll</HintPath>
|
||||||
</Reference>
|
</Reference>
|
||||||
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.0.0, Culture=neutral, processorArchitecture=AMD64">
|
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.2.0, Culture=neutral, processorArchitecture=AMD64">
|
||||||
<SpecificVersion>False</SpecificVersion>
|
<SpecificVersion>False</SpecificVersion>
|
||||||
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.Utils.dll</HintPath>
|
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.Utils.dll</HintPath>
|
||||||
</Reference>
|
</Reference>
|
||||||
|
|
|
||||||
|
|
@ -115,10 +115,12 @@
|
||||||
<Reference Include="Microsoft.Research.Peloponnese.HadoopBridge, Version=0.0.0.0, Culture=neutral, processorArchitecture=AMD64">
|
<Reference Include="Microsoft.Research.Peloponnese.HadoopBridge, Version=0.0.0.0, Culture=neutral, processorArchitecture=AMD64">
|
||||||
<SpecificVersion>False</SpecificVersion>
|
<SpecificVersion>False</SpecificVersion>
|
||||||
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.HadoopBridge.dll</HintPath>
|
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.HadoopBridge.dll</HintPath>
|
||||||
|
<Private>True</Private>
|
||||||
</Reference>
|
</Reference>
|
||||||
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.2.0, Culture=neutral, processorArchitecture=AMD64">
|
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.2.0, Culture=neutral, processorArchitecture=AMD64">
|
||||||
<SpecificVersion>False</SpecificVersion>
|
<SpecificVersion>False</SpecificVersion>
|
||||||
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.Utils.dll</HintPath>
|
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.Utils.dll</HintPath>
|
||||||
|
<Private>True</Private>
|
||||||
</Reference>
|
</Reference>
|
||||||
<Reference Include="Microsoft.Threading.Tasks">
|
<Reference Include="Microsoft.Threading.Tasks">
|
||||||
<HintPath>..\packages\Microsoft.Bcl.Async.1.0.166\lib\net40\Microsoft.Threading.Tasks.dll</HintPath>
|
<HintPath>..\packages\Microsoft.Bcl.Async.1.0.166\lib\net40\Microsoft.Threading.Tasks.dll</HintPath>
|
||||||
|
|
|
||||||
|
|
@ -284,7 +284,7 @@ namespace Microsoft.Research.DryadLinq
|
||||||
{
|
{
|
||||||
if (Context.PeloponneseHomeDirectory == null)
|
if (Context.PeloponneseHomeDirectory == null)
|
||||||
{
|
{
|
||||||
throw new ApplicationException("No Peloponnese home directory is set");
|
Context.PeloponneseHomeDirectory = Context.DryadHomeDirectory;
|
||||||
}
|
}
|
||||||
if (Context.DryadHomeDirectory == null)
|
if (Context.DryadHomeDirectory == null)
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -267,7 +267,7 @@ namespace Microsoft.Research.DryadLinq
|
||||||
{
|
{
|
||||||
if (Context.PeloponneseHomeDirectory == null)
|
if (Context.PeloponneseHomeDirectory == null)
|
||||||
{
|
{
|
||||||
throw new ApplicationException("No Peloponnese home directory is set");
|
Context.PeloponneseHomeDirectory = Context.DryadHomeDirectory;
|
||||||
}
|
}
|
||||||
if (Context.DryadHomeDirectory == null)
|
if (Context.DryadHomeDirectory == null)
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -94,6 +94,14 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private TaskCompletionSource<Process> finishWaiter;
|
private TaskCompletionSource<Process> finishWaiter;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// children that will be cancelled when finishWaiter is unblocked. These are materialized into a
|
||||||
|
/// separate set so that they can be discarded when they are no longer needed. Otherwise, when
|
||||||
|
/// used in WhenAny internally, they lead to the GC holding onto any other tasks in the WhenAny
|
||||||
|
/// clause until finishWaiter completes, which is a huge memory leak
|
||||||
|
/// </summary>
|
||||||
|
private HashSet<TaskCompletionSource<Process>> childFinishWaiters;
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// this blocks until the command loop exits
|
/// this blocks until the command loop exits
|
||||||
/// </summary>
|
/// </summary>
|
||||||
|
|
@ -140,6 +148,8 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
||||||
// make the Task that CommandLoop blocks on; when finishWaiter is started it returns null
|
// make the Task that CommandLoop blocks on; when finishWaiter is started it returns null
|
||||||
// causing CommandLoop to exit.
|
// causing CommandLoop to exit.
|
||||||
finishWaiter = new TaskCompletionSource<Process>();
|
finishWaiter = new TaskCompletionSource<Process>();
|
||||||
|
childFinishWaiters = new HashSet<TaskCompletionSource<Process>>();
|
||||||
|
finishWaiter.Task.ContinueWith((t) => Task.Run(() => SetChildFinishWaiters()));
|
||||||
|
|
||||||
// this is started when the Command Loop exits
|
// this is started when the Command Loop exits
|
||||||
exited = new TaskCompletionSource<bool>();
|
exited = new TaskCompletionSource<bool>();
|
||||||
|
|
@ -201,9 +211,54 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// a task that can be awaited and will asynchronously unblock when the finishWaiter result is set
|
/// set all the pending cancellations from the master finishWaiter
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private Task<Process> AsyncFinishWaiter { get { return finishWaiter.Task.ContinueWith((t) => t.Result); } }
|
private void SetChildFinishWaiters()
{
    // Detach the pending set while holding the lock and publish null in its place, then
    // complete the waiters only after the lock has been released. SetResult may run a
    // waiter's continuations inline on this thread; a continuation that calls
    // RemoveAsyncFinishWaiter would re-enter lock (this) and remove from the set while it
    // was still being enumerated, which invalidates the enumerator. With the field already
    // null, RemoveAsyncFinishWaiter becomes a no-op and GetAsyncFinishWaiter completes any
    // new waiter immediately instead of registering it.
    HashSet<TaskCompletionSource<Process>> pending;
    lock (this)
    {
        pending = childFinishWaiters;
        childFinishWaiters = null;
    }

    if (pending == null)
    {
        // the waiters were already released by an earlier call; nothing left to do
        return;
    }

    foreach (TaskCompletionSource<Process> waiter in pending)
    {
        // forward the master finishWaiter's result to every child that was still registered
        waiter.SetResult(finishWaiter.Task.Result);
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// create a completion source whose Task unblocks with the result of finishWaiter. While
/// childFinishWaiters is still non-null the new source is registered in that set; once
/// childFinishWaiters has been set to null the source is completed on the spot from
/// finishWaiter.Task.Result
/// </summary>
/// <returns>the completion source; its Task is what callers wait on</returns>
private TaskCompletionSource<Process> GetAsyncFinishWaiter()
{
    var completion = new TaskCompletionSource<Process>();

    lock (this)
    {
        if (childFinishWaiters != null)
        {
            // still pending: remember the waiter so it can be completed or removed later
            childFinishWaiters.Add(completion);
            return completion;
        }

        // the pending set has been discarded, so hand back an already-completed waiter
        completion.SetResult(finishWaiter.Task.Result);
    }

    return completion;
}
|
||||||
|
|
||||||
|
/// <summary>
/// drop a waiter from the set of pending child waiters once the caller no longer needs it.
/// Does nothing if the pending set has already been discarded (childFinishWaiters is null)
/// </summary>
/// <param name="waiter">waiter previously returned by GetAsyncFinishWaiter</param>
private void RemoveAsyncFinishWaiter(TaskCompletionSource<Process> waiter)
{
    lock (this)
    {
        HashSet<TaskCompletionSource<Process>> pending = childFinishWaiters;
        if (pending == null)
        {
            return;
        }

        pending.Remove(waiter);
    }
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// (asynchronously) block until there is a process available on the local queue, the rack queue
|
/// (asynchronously) block until there is a process available on the local queue, the rack queue
|
||||||
|
|
@ -248,7 +303,9 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
||||||
|
|
||||||
// we want to wait either for waiter to be matched with a Process in one of the three queues, or
|
// we want to wait either for waiter to be matched with a Process in one of the three queues, or
|
||||||
// for ShutDown to be called, so make an array of tasks and wait for the first one to be unblocked.
|
// for ShutDown to be called, so make an array of tasks and wait for the first one to be unblocked.
|
||||||
var unblocked = await Task.WhenAny(blocker, AsyncFinishWaiter);
|
TaskCompletionSource<Process> thisWaiter = GetAsyncFinishWaiter();
|
||||||
|
var unblocked = await Task.WhenAny(blocker, thisWaiter.Task);
|
||||||
|
RemoveAsyncFinishWaiter(thisWaiter);
|
||||||
|
|
||||||
if (unblocked.Result != null)
|
if (unblocked.Result != null)
|
||||||
{
|
{
|
||||||
|
|
@ -341,7 +398,9 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
||||||
{
|
{
|
||||||
logger.Log("Computer " + name + " reporting match with process " + process.Id);
|
logger.Log("Computer " + name + " reporting match with process " + process.Id);
|
||||||
|
|
||||||
await process.OnScheduled(this, nextTask, AsyncFinishWaiter, null);
|
TaskCompletionSource<Process> thisWaiter = GetAsyncFinishWaiter();
|
||||||
|
await process.OnScheduled(this, nextTask, thisWaiter.Task, null);
|
||||||
|
RemoveAsyncFinishWaiter(thisWaiter);
|
||||||
|
|
||||||
logger.Log("Computer " + name + " waiting for process " + process.Id + " to complete");
|
logger.Log("Computer " + name + " waiting for process " + process.Id + " to complete");
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -137,7 +137,7 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
||||||
public void DecrementQueueCount()
|
public void DecrementQueueCount()
|
||||||
{
|
{
|
||||||
Debug.Assert(queueCount > 0);
|
Debug.Assert(queueCount > 0);
|
||||||
++queueCount;
|
--queueCount;
|
||||||
if (owner == null && !scheduling && queueCount == 0)
|
if (owner == null && !scheduling && queueCount == 0)
|
||||||
{
|
{
|
||||||
// the queue count has dropped to zero without the process being matched
|
// the queue count has dropped to zero without the process being matched
|
||||||
|
|
|
||||||
|
|
@ -131,6 +131,11 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
||||||
{
|
{
|
||||||
processQueue = new Queue<Process>();
|
processQueue = new Queue<Process>();
|
||||||
waiterQueue = new Queue<ProcessWaiter>();
|
waiterQueue = new Queue<ProcessWaiter>();
|
||||||
|
|
||||||
|
// start background cleaning tasks
|
||||||
|
CleanProcessQueue();
|
||||||
|
CleanWaiterQueue();
|
||||||
|
|
||||||
active = true;
|
active = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -164,6 +169,76 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// background loop that, once a second, rebuilds processQueue so that it holds only the
/// processes that are still unclaimed, so we don't hang on to memory indefinitely. The
/// loop ends when processQueue is found to be null
/// </summary>
private async void CleanProcessQueue()
{
    for (;;)
    {
        lock (this)
        {
            if (processQueue == null)
            {
                // we have shut down, so exit this daemon
                return;
            }

            Queue<Process> survivors = new Queue<Process>();
            foreach (Process candidate in processQueue)
            {
                // read the claim state under the process's own lock
                bool stillUnclaimed;
                lock (candidate)
                {
                    stillUnclaimed = candidate.Unclaimed;
                }

                if (stillUnclaimed)
                {
                    survivors.Enqueue(candidate);
                }
            }

            // replace the queue wholesale; claimed entries are simply not carried over
            processQueue = survivors;
        }

        // clean again in a second
        await Task.Delay(1000);
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// background loop that, once a second, rebuilds waiterQueue so that it holds only the
/// waiters that are still unclaimed, so we don't hang on to memory indefinitely. The
/// loop ends when either processQueue or waiterQueue is found to be null
/// </summary>
private async void CleanWaiterQueue()
{
    while (true)
    {
        lock (this)
        {
            // processQueue == null is the original shutdown signal. waiterQueue is the
            // collection actually enumerated below, so guard it as well: enumerating a
            // null queue would throw out of this async void method.
            if (processQueue == null || waiterQueue == null)
            {
                // we have shut down, so exit this daemon
                return;
            }

            Queue<ProcessWaiter> cleanedQueue = new Queue<ProcessWaiter>();
            foreach (ProcessWaiter w in waiterQueue)
            {
                // read the claim state under the waiter's own lock
                lock (w)
                {
                    if (w.Unclaimed)
                    {
                        cleanedQueue.Enqueue(w);
                    }
                }
            }

            // replace the queue wholesale; claimed entries are simply not carried over
            waiterQueue = cleanedQueue;
        }

        // clean again in a second
        await Task.Delay(1000);
    }
}
|
||||||
|
|
||||||
/// <summary>
|
/// <summary>
|
||||||
/// add a schedulable process. If there is an unclaimed computer waiting, the
|
/// add a schedulable process. If there is an unclaimed computer waiting, the
|
||||||
/// process will be assigned to the computer and the computer's Task will be
|
/// process will be assigned to the computer and the computer's Task will be
|
||||||
|
|
@ -305,7 +380,7 @@ namespace Microsoft.Research.Dryad.LocalScheduler
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// even if there are processes, they may have been claimed by ther computers
|
// even if there are processes, they may have been claimed by other computers
|
||||||
// already, so use a loop here
|
// already, so use a loop here
|
||||||
while (active && processQueue.Count > 0 && !claimed)
|
while (active && processQueue.Count > 0 && !claimed)
|
||||||
{
|
{
|
||||||
|
|
|
||||||
|
|
@ -127,6 +127,11 @@
|
||||||
a completed application. We have tried to report errors in user application code back so that they are visible in the
|
a completed application. We have tried to report errors in user application code back so that they are visible in the
|
||||||
<link xlink:href="91822db3-8a00-4307-ad8a-595c94f449b0">DryadLINQ Job Browser</link> to avoid the need to consult
|
<link xlink:href="91822db3-8a00-4307-ad8a-595c94f449b0">DryadLINQ Job Browser</link> to avoid the need to consult
|
||||||
the logs.</para>
|
the logs.</para>
|
||||||
|
<para>
|
||||||
|
<mediaLinkInline>
|
||||||
|
<image xlink:href="Dryad on Azure Architecture"/>
|
||||||
|
</mediaLinkInline>
|
||||||
|
</para>
|
||||||
</content>
|
</content>
|
||||||
</conclusion>
|
</conclusion>
|
||||||
</procedure>
|
</procedure>
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue