4-nd round RFR (XS) 6988950: JDWP exit error JVMTI_ERROR_WRONG_PHASE(112)
Daniel D. Daugherty
daniel.daugherty at oracle.com
Fri Nov 7 18:10:45 UTC 2014
On 11/6/14 3:27 PM, serguei.spitsyn at oracle.com wrote:
> Hi reviewers,
>
> I'm suggesting to review a modified fix:
> http://cr.openjdk.java.net/~sspitsyn/webrevs/2014/jdk/6988950-JDWP-wrong-phase.4/
>
src/jdk.jdwp.agent/share/native/libjdwp/debugLoop.c
line 130: * that a command after VM_DEATH will be allowed to complete
Perhaps "VM command" instead of "command" just to be clear.
Thumbs up! And we'll want to push this changeset after a push
to Main_Baseline so that it has (at least) a week to bake...
Dan
>
>
> The 3-rd round fix is not right as it caused deadlocks in several
> tests (in nsk.jdi.testlist and jtreg com/sun/jdi).
>
> Here is a deadlock example:
>
> ----------------- lwp# 2 / thread# 2 --------------------
> ffffffff7e8dc6a4 lwp_cond_wait (100138748, 100138730, 0, 0)
> ffffffff7dcad148 void os::PlatformEvent::park() (100138700, d4788,
> d4400, 0, ffffffff7e357440, 100138730) + 100
> ffffffff7dc3151c int Monitor::IWait(Thread*,long) (ffffffff7e3c5b98,
> 100137000, 0, 1004405d0, 6e750, 0) + a4
> ffffffff7dc324d0 bool Monitor::wait(bool,long,bool) (1004405d0,
> 100137000, 0, 0, 1, 20000000) + 358
> ffffffff7de6c530 int JavaThread::java_suspend_self() (1004405d0,
> 100137000, 1, deab, 60000000, 100137000) + c8
> ffffffff7da5f478 int JvmtiRawMonitor::raw_wait(long,bool,Thread*)
> (10034bdc0, ffffffffffffffff, ffffffff7e3e6bd0, 100137000, 1, 2) + 258
> ffffffff7da2284c jvmtiError
> JvmtiEnv::RawMonitorWait(JvmtiRawMonitor*,long) (92800, 10034bdc0,
> ffffffffffffffff, 4, 9aeb0, 100137000) + 8c
> ffffffff7aa2f47c debugMonitorWait (ffffffff7ab3ba10, c28, c00,
> ffffffff7ab3ad18, ffffffff7ab3ad18, 0) + 3c
> ffffffff7aa1c804 enqueueCommand (10034bb90, 102c00, ffffffffffefd118,
> ffffffff7ab3ad18, 102c00, ffffffff7ab3bd60) + 14c
> ffffffff7aa1e23c eventHelper_reportEvents (d8, 100135d70, 2, 1, 1, 2)
> + 10c
> ffffffff7aa181f8 reportEvents (1001371f8, 0, 0, 14, 100135d70, 0) + 138
> ffffffff7aa187b8 event_callback (1001371f8, ffffffff7b0ffa88,
> ffffffff7aa23150, ffffffff7aa376a0, ffffffff7ab3ad18, 100441ad0) + 360
> ffffffff7aa1b870 cbVMDeath (800, 1001371f8, ffffffff7aa37c48,
> ffffffff7ab3ad18, 1018, 1000) + 1d8
> ffffffff7da3635c void JvmtiExport::post_vm_death() (1ffc, 100137000,
> ffffffff7e3e8b30, ffffffff7e357440, 1, 10010cf30) + 534
> ffffffff7d7bb104 void before_exit(JavaThread*) (100137000,
> ffffffff7e392350, ffffffff7e3fb938, 6ed99, ffffffff7e357440,
> ffffffff7e3e6b70) + 30c
> ffffffff7de72128 bool Threads::destroy_vm() (100137000, 100110a40,
> ffffffff7e3f22f4, ffffffff7e3e6ab0, ffffffff7e357440, 30000000) + 100
> ffffffff7d8d0664 jni_DestroyJavaVM (100137000, 1ffc,
> ffffffff7e3e8b30, ffffffff7e357440, 0, 10013700) + 1bc
> ffffffff7ee08680 JavaMain (ffffffff7e3da790, 0, ffffffff7e3da790,
> 10035de68, 0, ffffffff7e4143b0) + 860
> ffffffff7e8d8558 _lwp_start (0, 0, 0, 0, 0, 0)
>
> ----------------- lwp# 12 / thread# 12 --------------------
> ffffffff7e8dc6a4 lwp_cond_wait (100349948, 100349930, 0, 0)
> ffffffff7dcad148 void os::PlatformEvent::park() (100349900, d4788,
> d4400, 0, ffffffff7e357440, 100349930) + 100
> ffffffff7da5f010 int JvmtiRawMonitor::raw_enter(Thread*) (10034a070,
> 100348800, a, ffffffff7e3de340, 1, ffffffff7e115ff4) + 258
> ffffffff7da22450 jvmtiError
> JvmtiEnv::RawMonitorEnter(JvmtiRawMonitor*) (ffffffff7ea05a00,
> 10034a070, 1c7, 100348800, ffffffff7e357440, 4) + a0
> ffffffff7aa2f288 debugMonitorEnter (10034a070, c18, c00,
> ffffffff7ab3ad28, ffffffff7ab3b940, 0) + 38
> ffffffff7aa14134 debugLoop_run (ffffffff7ab3b940, 1000,
> ffffffff7ab3ad28, ffffffff7aa360d0, ffffffff5b2ff718, c18) + 11c
> ffffffff7aa2a4f8 connectionInitiated (ffffffff5b504010, 1358, 1000,
> ffffffff7ab3ad28, 1, ffffffff7ab3c080) + e0
> ffffffff7aa2a7d4 attachThread (ffffffffffefee48, 101000,
> ffffffff5b504010, ffffffff7ab3ad28, 0, 10000000) + 54
> ffffffff7da56b18 void JvmtiAgentThread::call_start_function()
> (100348800, ffffffff7e3e8b38, 916f0, ffffffff7e357440, 10034880, 1) + 128
> ffffffff7de6a678 void JavaThread::thread_main_inner() (100348800,
> 3d8, 1003497f8, 100349420, ffffffff5b2ff9f8, 0) + 90
> ffffffff7de6a5b4 void JavaThread::run() (100348800, 100349442, c,
> fffffffea5f3e048, 3d8, 1003497f8) + 3ac
> ffffffff7dc9f2e4 java_start (ca800, 100348800, ca904,
> ffffffff7e16ff31, ffffffff7e357440, 4797) + 2e4
> ffffffff7e8d8558 _lwp_start (0, 0, 0, 0, 0, 0)
>
> ----------------- lwp# 13 / thread# 13 --------------------
> ffffffff7e8dc6a4 lwp_cond_wait (10034d348, 10034d330, 0, 0)
> ffffffff7dcad148 void os::PlatformEvent::park() (10034d300, d4788,
> d4400, 0, ffffffff7e357440, 10034d330) + 100
> ffffffff7da5eac8 int JvmtiRawMonitor::SimpleWait(Thread*,long)
> (10034bed0, 10034c000, ffffffffffffffff, 241000, 0, 10034c000) + 100
> ffffffff7da5f300 int JvmtiRawMonitor::raw_wait(long,bool,Thread*)
> (10034bed0, ffffffffffffffff, 1, 10034c000, ffffffff7e357440,
> 10034c000) + e0
> ffffffff7da2284c jvmtiError
> JvmtiEnv::RawMonitorWait(JvmtiRawMonitor*,long) (92800, 10034bed0,
> ffffffffffffffff, 4, 9aeb0, 10034c000) + 8c
> ffffffff7aa2f47c debugMonitorWait (ffffffff7ab3ba10, c28, c00,
> ffffffff7ab3ad18, ffffffff7ab3b940, 0) + 3c
> ffffffff7aa1d838 doBlockCommandLoop (800, 1038, ffffffff7ab3ad18,
> 1000, ffffffff7ab3ad18, ffffffff7ab3bd60) + 48
> ffffffff7aa1da3c commandLoop (c28, 10034c1f8, c00, ffffffff7ab3ad18,
> 0, 10000000) + ac
> ffffffff7da56b18 void JvmtiAgentThread::call_start_function()
> (10034c000, ffffffff7e3e8b38, 916f0, ffffffff7e357440, 10034c00, 1) + 128
> ffffffff7de6a678 void JavaThread::thread_main_inner() (10034c000,
> 3d8, 10034cfe8, 10034cc10, ffffffff5b0ffbf8, 0) + 90
> ffffffff7de6a5b4 void JavaThread::run() (10034c000, 10034cc28, d,
> fffffffea5f3e290, 3d8, 10034cfe8) + 3ac
> ffffffff7dc9f2e4 java_start (ca800, 10034c000, ca904,
> ffffffff7e16ff31, ffffffff7e357440, 181a) + 2e4
> ffffffff7e8d8558 _lwp_start (0, 0, 0, 0, 0, 0)
>
>
> The details:
> - Thread #2: The cbVMDeath() event handler is waiting on the
> commandCompleteLock in the enqueueCommand().
> The call chain is:
> cbVMDeath() -> event_callback() -> reportEvents() ->
> eventHelper_reportEvents() -> enqueueCommand().
> The enqueueCommand() depends on the commandLoop() that has to call
> completeCommand(command) for the command being enqueued.
> This has not been set yet: gdata->vmDead = JNI_TRUE
>
> - Thread #12: The debugLoop_run blocked on the vmDeathLock enter
>
> - Thread #13: The commandLoop is waiting on the blockCommandLoopLock
> in the doBlockCommandLoop().
> It is because blockCommandLoop == JNI_TRUE which is set in the
> needBlockCommandLoop()
> if the following condition is true:
> (cmd->commandKind == COMMAND_REPORT_EVENT_COMPOSITE &&
> cmd->u.reportEventComposite.suspendPolicy ==
> JDWP_SUSPEND_POLICY(ALL))
>
>
> It seems that the debugLoop_run() being blocked on the vmDeathLock causes the
> commandLoop() to wait indefinitely.
> The cbVMDeath() cannot proceed because the commandLoop() does not
> make any progress.
>
> The vmDeathLock critical section in the cbVMDeath() event callback
> seems to be overkill (unnecessary).
> A less intrusive synchronization is required here, which is to wait
> until the current command has completed
> before returning to the JvmtiExport::post_vm_death().
>
> The new approach (see new webrev) is to extend the resumeLock
> synchronization pattern
> to all VirtualMachine set of commands, not only the resume command.
> The resumeLock name is replaced with the vmDeathLock to reflect new
> semantics.
>
> In general, we could consider doing the same for the rest of the JDWP
> command sets.
> But it is better to be careful and see how this change goes first.
>
>
> Thanks,
> Serguei
>
>
> On 11/5/14 2:27 AM, serguei.spitsyn at oracle.com wrote:
>> Hi David,
>>
>> Thank you for the concerns!
>> Testing showed several tests failing with deadlocks.
>> Scenarios are similar to that you describe.
>>
>> Trying to understand the details.
>>
>> Thanks,
>> Serguei
>>
>> On 11/4/14 4:09 PM, David Holmes wrote:
>>> Hi Serguei,
>>>
>>> On 3/11/2014 5:07 PM, serguei.spitsyn at oracle.com wrote:
>>>> On 11/2/14 8:58 PM, David Holmes wrote:
>>>>> On 1/11/2014 8:13 PM, Dmitry Samersoff wrote:
>>>>>> Serguei,
>>>>>>
>>>>>> Thank you for good finding. This approach looks much better for me.
>>>>>>
>>>>>> The fix looks good.
>>>>>>
>>>>>> Is it necessary to release the vmDeathLock at
>>>>>> eventHandler.c:1244 before the call
>>>>>>
>>>>>> EXIT_ERROR(error,"Can't clear event callbacks on vm death"); ?
>>>>>
>>>>> I agree this looks necessary, or at least more clean (if things are
>>>>> failing we really don't know what is happening).
>>>>
>>>> Agreed (replied to Dmitry).
>>>>
>>>>>
>>>>> More generally I'm concerned about whether any of the code paths
>>>>> taken
>>>>> while holding the new lock can result in deadlock - in particular
>>>>> with
>>>>> regard to the resumeLock ?
>>>>
>>>> The cbVMDeath() function never holds both vmDeathLock and
>>>> resumeLock at
>>>> the same time,
>>>> so there is no chance for a deadlock that involves both these locks.
>>>>
>>>> Two more locks used in the cbVMDeath() are the callbackBlock and
>>>> callbackLock.
>>>> These two locks look completely unrelated to the debugLoop_run().
>>>>
>>>> The debugLoop_run() function also uses the cmdQueueLock.
>>>> The debugLoop_run() never holds both vmDeathLock and cmdQueueLock
>>>> at the
>>>> same time.
>>>>
>>>> So I do not see any potential to introduce a new deadlock with the
>>>> vmDeathLock.
>>>>
>>>> However, it is still easy to overlook something here.
>>>> Please, let me know if you see any danger.
>>>
>>> I was mainly concerned about what might happen in the call chain for
>>> threadControl_resumeAll() (it certainly sounds like it might need to
>>> use a resumeLock :) ). I see direct use of the threadLock and
>>> indirectly the eventHandler lock; but there are further call paths I
>>> did not explore. Wish there was an easy way to determine the
>>> transitive closure of all locks used from a given call.
>>>
>>> Thanks,
>>> David
>>>
>>>> Thanks,
>>>> Serguei
>>>>
>>>>>
>>>>> David
>>>>>
>>>>>> -Dmitry
>>>>>>
>>>>>>
>>>>>>
>>>>>> On 2014-11-01 00:07, serguei.spitsyn at oracle.com wrote:
>>>>>>>
>>>>>>> It is 3-rd round of review for:
>>>>>>> https://bugs.openjdk.java.net/browse/JDK-6988950
>>>>>>>
>>>>>>> New webrev:
>>>>>>>
>>>>>>> http://cr.openjdk.java.net/~sspitsyn/webrevs/2014/jdk/6988950-JDWP-wrong-phase.3/
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> Summary
>>>>>>>
>>>>>>> For failing scenario, please, refer to the 1-st round RFR below.
>>>>>>>
>>>>>>> I've found what is missing in the jdwp agent shutdown and
>>>>>>> decided to
>>>>>>> switch from a workaround to a real fix.
>>>>>>>
>>>>>>> The agent VM_DEATH callback sets the gdata field:
>>>>>>> gdata->vmDead = 1.
>>>>>>> The agent debugLoop_run() has a guard against the VM shutdown:
>>>>>>>
>>>>>>> 165 } else if (gdata->vmDead &&
>>>>>>> 166 ((cmd->cmdSet) !=
>>>>>>> JDWP_COMMAND_SET(VirtualMachine))) {
>>>>>>> 167 /* Protect the VM from calls while dead.
>>>>>>> 168 * VirtualMachine cmdSet quietly ignores some
>>>>>>> cmds
>>>>>>> 169 * after VM death, so, it sends it's own
>>>>>>> errors.
>>>>>>> 170 */
>>>>>>> 171 outStream_setError(&out,
>>>>>>> JDWP_ERROR(VM_DEAD));
>>>>>>>
>>>>>>>
>>>>>>> However, the guard above does not help much if the VM_DEATH
>>>>>>> event
>>>>>>> happens in the middle of a command execution.
>>>>>>> There is a lack of synchronization here.
>>>>>>>
>>>>>>> The fix introduces a new lock (vmDeathLock) which does not
>>>>>>> allow the commands
>>>>>>> and the VM_DEATH event callback to execute concurrently.
>>>>>>> It should work well for any function that is used in
>>>>>>> implementation of
>>>>>>> the JDWP_COMMAND_SET(VirtualMachine) .
>>>>>>>
>>>>>>>
>>>>>>> Testing:
>>>>>>> Run nsk.jdi.testlist, nsk.jdwp.testlist and JTREG com/sun/jdi
>>>>>>> tests
>>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Serguei
>>>>>>>
>>>>>>>
>>>>>>> On 10/29/14 6:05 PM, serguei.spitsyn at oracle.com wrote:
>>>>>>>> The updated webrev:
>>>>>>>> http://cr.openjdk.java.net/~sspitsyn/webrevs/2014/jdk/6988950-JDWP-wrong-phase.2/
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>>
>>>>>>>> The changes are:
>>>>>>>> - added a comment recommended by Staffan
>>>>>>>> - removed the ignore_wrong_phase() call from function
>>>>>>>> classSignature()
>>>>>>>>
>>>>>>>> The classSignature() function is called in 16 places.
>>>>>>>> Most of them do not tolerate a NULL in place of the returned
>>>>>>>> signature
>>>>>>>> and will crash.
>>>>>>>> I'm not comfortable fixing all the occurrences now and suggest
>>>>>>>> returning to this
>>>>>>>> issue after gaining experience with more failure cases that are
>>>>>>>> still
>>>>>>>> expected.
>>>>>>>> The failure with the classSignature() involved was observed
>>>>>>>> only once
>>>>>>>> in the nightly
>>>>>>>> and should be extremely rarely reproducible.
>>>>>>>> I'll file a placeholder bug if necessary.
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Serguei
>>>>>>>>
>>>>>>>> On 10/28/14 6:11 PM, serguei.spitsyn at oracle.com wrote:
>>>>>>>>> Please, review the fix for:
>>>>>>>>> https://bugs.openjdk.java.net/browse/JDK-6988950
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Open webrev:
>>>>>>>>> http://cr.openjdk.java.net/~sspitsyn/webrevs/2014/jdk/6988950-JDWP-wrong-phase.1/
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Summary:
>>>>>>>>>
>>>>>>>>> The failing scenario:
>>>>>>>>> The debugger and the debuggee are well aware a VM
>>>>>>>>> shutdown has
>>>>>>>>> been started in the target process.
>>>>>>>>> The debugger at this point is not expected to send any
>>>>>>>>> commands
>>>>>>>>> to the JDWP agent.
>>>>>>>>> However, the JDI layer (debugger side) and the jdwp agent
>>>>>>>>> (debuggee side)
>>>>>>>>> are not in sync with the consumer layers.
>>>>>>>>>
>>>>>>>>> One reason is because the test debugger does not invoke
>>>>>>>>> the JDI
>>>>>>>>> method VirtualMachine.dispose().
>>>>>>>>> Another reason is that the debugger and the debuggee
>>>>>>>>> processes
>>>>>>>>> are hard to synchronize in general.
>>>>>>>>>
>>>>>>>>> As a result the following steps are possible:
>>>>>>>>> - The test debugger sends a 'quit' command to the test
>>>>>>>>> debuggee
>>>>>>>>> - The debuggee is normally exiting
>>>>>>>>> - The jdwp backend reports (over the jdwp protocol) an
>>>>>>>>> anonymous class unload event
>>>>>>>>> - The JDI InternalEventHandler thread handles the
>>>>>>>>> ClassUnloadEvent event
>>>>>>>>> - The InternalEventHandler wants to uncache the matching
>>>>>>>>> reference type.
>>>>>>>>> If there is more than one class with the same host
>>>>>>>>> class
>>>>>>>>> signature, it can't distinguish them,
>>>>>>>>> and so, deletes all references and re-retrieves them
>>>>>>>>> again
>>>>>>>>> (see tracing below):
>>>>>>>>> MY_TRACE: JDI:
>>>>>>>>> VirtualMachineImpl.retrieveClassesBySignature:
>>>>>>>>> sig=Ljava/lang/invoke/LambdaForm$DMH;
>>>>>>>>> - The jdwp backend debugLoop_run() gets the command
>>>>>>>>> from JDI
>>>>>>>>> and calls the functions
>>>>>>>>> classesForSignature() and classStatus() recursively.
>>>>>>>>> - The classStatus() makes a call to the JVMTI
>>>>>>>>> GetClassStatus()
>>>>>>>>> and gets the JVMTI_ERROR_WRONG_PHASE
>>>>>>>>> - As a result the jdwp backend reports the JVMTI error
>>>>>>>>> to the
>>>>>>>>> JDI, and so, the test fails
>>>>>>>>>
>>>>>>>>> For details, see the analysis in bug report closed as a
>>>>>>>>> dup of
>>>>>>>>> the bug 6988950:
>>>>>>>>> https://bugs.openjdk.java.net/browse/JDK-8024865
>>>>>>>>>
>>>>>>>>> Some similar cases can be found in the two bug reports
>>>>>>>>> (6988950
>>>>>>>>> and 8024865) describing this issue.
>>>>>>>>>
>>>>>>>>> The fix is to skip reporting the JVMTI_ERROR_WRONG_PHASE
>>>>>>>>> error
>>>>>>>>> as it is normal at the VM shutdown.
>>>>>>>>> The original jdwp backend implementation had a similar
>>>>>>>>> approach
>>>>>>>>> for the raw monitor functions.
>>>>>>>>> They use the ignore_vm_death() to work around the
>>>>>>>>> JVMTI_ERROR_WRONG_PHASE errors.
>>>>>>>>> For reference, please, see the file: src/share/back/util.c
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Testing:
>>>>>>>>> Run nsk.jdi.testlist, nsk.jdwp.testlist and JTREG com/sun/jdi
>>>>>>>>> tests
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Thanks,
>>>>>>>>> Serguei
>>>>>>>>>
>>>>>>>>
>>>>>>>
>>>>>>
>>>>>>
>>>>
>>
>
More information about the serviceability-dev
mailing list