fix memory bug, minor other changes

This commit is contained in:
Michael Isard 2014-04-20 12:49:30 -07:00
parent 5ad103cba7
commit 86242db082
8 changed files with 150 additions and 9 deletions

View File

@ -77,7 +77,7 @@
<SpecificVersion>False</SpecificVersion>
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.HadoopBridge.dll</HintPath>
</Reference>
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.0.0, Culture=neutral, processorArchitecture=AMD64">
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.2.0, Culture=neutral, processorArchitecture=AMD64">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.Utils.dll</HintPath>
</Reference>

View File

@ -115,10 +115,12 @@
<Reference Include="Microsoft.Research.Peloponnese.HadoopBridge, Version=0.0.0.0, Culture=neutral, processorArchitecture=AMD64">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.HadoopBridge.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="Microsoft.Research.Peloponnese.Utils, Version=0.7.2.0, Culture=neutral, processorArchitecture=AMD64">
<SpecificVersion>False</SpecificVersion>
<HintPath>..\packages\Microsoft.Research.Peloponnese.0.7.2-beta\lib\net45\Microsoft.Research.Peloponnese.Utils.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="Microsoft.Threading.Tasks">
<HintPath>..\packages\Microsoft.Bcl.Async.1.0.166\lib\net40\Microsoft.Threading.Tasks.dll</HintPath>

View File

@ -284,7 +284,7 @@ namespace Microsoft.Research.DryadLinq
{
if (Context.PeloponneseHomeDirectory == null)
{
throw new ApplicationException("No Peloponnese home directory is set");
Context.PeloponneseHomeDirectory = Context.DryadHomeDirectory;
}
if (Context.DryadHomeDirectory == null)
{

View File

@ -267,7 +267,7 @@ namespace Microsoft.Research.DryadLinq
{
if (Context.PeloponneseHomeDirectory == null)
{
throw new ApplicationException("No Peloponnese home directory is set");
Context.PeloponneseHomeDirectory = Context.DryadHomeDirectory;
}
if (Context.DryadHomeDirectory == null)
{

View File

@ -94,6 +94,14 @@ namespace Microsoft.Research.Dryad.LocalScheduler
/// </summary>
private TaskCompletionSource<Process> finishWaiter;
/// <summary>
/// children that will be cancelled when finishWaiter is unblocked. These are materialized into a
/// separate set so that they can be discarded when they are no longer needed. Otherwise, when
/// used in WhenAny internally, they lead to the GC holding onto any other tasks in the WhenAny
/// clause until finishWaiter completes, which is a huge memory leak
/// </summary>
private HashSet<TaskCompletionSource<Process>> childFinishWaiters;
/// <summary>
/// this blocks until the command loop exits
/// </summary>
@ -140,6 +148,8 @@ namespace Microsoft.Research.Dryad.LocalScheduler
// make the Task that CommandLoop blocks on; when finishWaiter is started it returns null
// causing CommandLoop to exit.
finishWaiter = new TaskCompletionSource<Process>();
childFinishWaiters = new HashSet<TaskCompletionSource<Process>>();
finishWaiter.Task.ContinueWith((t) => Task.Run(() => SetChildFinishWaiters()));
// this is started when the Command Loop exits
exited = new TaskCompletionSource<bool>();
@ -201,9 +211,54 @@ namespace Microsoft.Research.Dryad.LocalScheduler
}
/// <summary>
/// a task that can be awaited and will asynchronously unblock when the finishWaiter result is set
/// set all the pending cancellations from the master finishWaiter
/// </summary>
private Task<Process> AsyncFinishWaiter { get { return finishWaiter.Task.ContinueWith((t) => t.Result); } }
private void SetChildFinishWaiters()
{
    // Detach the pending set while holding the lock, and complete the waiters only after the
    // lock has been released. SetResult may run awaiting continuations inline on this thread;
    // those continuations call RemoveAsyncFinishWaiter, and because lock (this) is re-entrant
    // that call would otherwise remove from the set while it is still being enumerated here.
    HashSet<TaskCompletionSource<Process>> pending;
    lock (this)
    {
        pending = childFinishWaiters;
        // a null set tells GetAsyncFinishWaiter to complete new waiters immediately, and
        // makes RemoveAsyncFinishWaiter a no-op
        childFinishWaiters = null;
    }

    if (pending == null)
    {
        // the set was already drained by an earlier call, so there is nothing to release
        return;
    }

    foreach (TaskCompletionSource<Process> waiter in pending)
    {
        // forward the result of the master finishWaiter to every child still waiting
        waiter.SetResult(finishWaiter.Task.Result);
    }
}
/// <summary>
/// get a task that can be awaited and will asynchronously unblock when the finishWaiter result is set
/// </summary>
private TaskCompletionSource<Process> GetAsyncFinishWaiter()
{
    // every caller gets its own completion source, so it can be dropped independently
    // of the master finishWaiter
    var childWaiter = new TaskCompletionSource<Process>();

    lock (this)
    {
        if (childFinishWaiters != null)
        {
            // still pending: register the child so SetChildFinishWaiters can complete it later
            childFinishWaiters.Add(childWaiter);
        }
        else
        {
            // the pending set has already been cleared by SetChildFinishWaiters, so hand
            // back a waiter that already carries the finishWaiter result
            childWaiter.SetResult(finishWaiter.Task.Result);
        }
    }

    return childWaiter;
}
/// <summary>
/// take the finish waiter out of the list of pending waiters, since its target has completed
/// </summary>
/// <param name="waiter">waiter to remove</param>
private void RemoveAsyncFinishWaiter(TaskCompletionSource<Process> waiter)
{
    lock (this)
    {
        HashSet<TaskCompletionSource<Process>> pending = childFinishWaiters;
        if (pending == null)
        {
            // the set has already been cleared by SetChildFinishWaiters; nothing to forget
            return;
        }

        // drop the reference so this waiter is no longer held by the pending set
        pending.Remove(waiter);
    }
}
/// <summary>
/// (asynchronously) block until there is a process available on the local queue, the rack queue
@ -248,7 +303,9 @@ namespace Microsoft.Research.Dryad.LocalScheduler
// we want to wait either for waiter to be matched with a Process in one of the three queues, or
// for ShutDown to be called, so make an array of tasks and wait for the first one to be unblocked.
var unblocked = await Task.WhenAny(blocker, AsyncFinishWaiter);
TaskCompletionSource<Process> thisWaiter = GetAsyncFinishWaiter();
var unblocked = await Task.WhenAny(blocker, thisWaiter.Task);
RemoveAsyncFinishWaiter(thisWaiter);
if (unblocked.Result != null)
{
@ -341,7 +398,9 @@ namespace Microsoft.Research.Dryad.LocalScheduler
{
logger.Log("Computer " + name + " reporting match with process " + process.Id);
await process.OnScheduled(this, nextTask, AsyncFinishWaiter, null);
TaskCompletionSource<Process> thisWaiter = GetAsyncFinishWaiter();
await process.OnScheduled(this, nextTask, thisWaiter.Task, null);
RemoveAsyncFinishWaiter(thisWaiter);
logger.Log("Computer " + name + " waiting for process " + process.Id + " to complete");

View File

@ -137,7 +137,7 @@ namespace Microsoft.Research.Dryad.LocalScheduler
public void DecrementQueueCount()
{
Debug.Assert(queueCount > 0);
++queueCount;
--queueCount;
if (owner == null && !scheduling && queueCount == 0)
{
// the queue count has dropped to zero without the process being matched

View File

@ -131,6 +131,11 @@ namespace Microsoft.Research.Dryad.LocalScheduler
{
processQueue = new Queue<Process>();
waiterQueue = new Queue<ProcessWaiter>();
// start background cleaning tasks
CleanProcessQueue();
CleanWaiterQueue();
active = true;
}
@ -164,6 +169,76 @@ namespace Microsoft.Research.Dryad.LocalScheduler
}
}
/// <summary>
/// background loop that periodically removes any claimed processes from the queue,
/// so we don't hang on to their memory indefinitely
/// </summary>
private async void CleanProcessQueue()
{
    for (;;)
    {
        lock (this)
        {
            if (processQueue == null)
            {
                // a null queue means we have shut down, so stop this cleaning loop
                return;
            }

            // rebuild the queue, keeping (in their original order) only the processes
            // that still report themselves as unclaimed
            Queue<Process> survivors = new Queue<Process>();
            foreach (Process candidate in processQueue)
            {
                bool keep;
                lock (candidate)
                {
                    // read the flag under the process's own lock
                    keep = candidate.Unclaimed;
                }

                if (keep)
                {
                    survivors.Enqueue(candidate);
                }
            }

            processQueue = survivors;
        }

        // wait a second before the next sweep
        await Task.Delay(1000);
    }
}
/// <summary>
/// background loop that periodically removes any claimed waiters from the queue,
/// so we don't hang on to their memory indefinitely
/// </summary>
private async void CleanWaiterQueue()
{
    while (true)
    {
        lock (this)
        {
            // Exit once either queue has been torn down. The previous guard tested only
            // processQueue while the loop below dereferences waiterQueue; a null waiterQueue
            // would then throw inside this async void method, where the exception cannot be
            // observed by the caller.
            if (processQueue == null || waiterQueue == null)
            {
                // we have shut down, so exit this daemon
                return;
            }

            // rebuild the queue, keeping (in their original order) only the waiters
            // that still report themselves as unclaimed
            Queue<ProcessWaiter> cleanedQueue = new Queue<ProcessWaiter>();
            foreach (ProcessWaiter w in waiterQueue)
            {
                // read the flag under the waiter's own lock
                lock (w)
                {
                    if (w.Unclaimed)
                    {
                        cleanedQueue.Enqueue(w);
                    }
                }
            }
            waiterQueue = cleanedQueue;
        }

        // clean again in a second
        await Task.Delay(1000);
    }
}
/// <summary>
/// add a schedulable process. If there is an unclaimed computer waiting, the
/// process will be assigned to the computer and the computer's Task will be
@ -305,7 +380,7 @@ namespace Microsoft.Research.Dryad.LocalScheduler
}
}
// even if there are processes, they may have been claimed by ther computers
// even if there are processes, they may have been claimed by other computers
// already, so use a loop here
while (active && processQueue.Count > 0 && !claimed)
{

View File

@ -127,6 +127,11 @@
a completed application. We have tried to report errors in user application code back so that they are visible in the
<link xlink:href="91822db3-8a00-4307-ad8a-595c94f449b0">DryadLINQ Job Browser</link> to avoid the need to consult
the logs.</para>
<para>
<mediaLinkInline>
<image xlink:href="Dryad on Azure Architecture"/>
</mediaLinkInline>
</para>
</content>
</conclusion>
</procedure>