From fjiang at openjdk.org Sat Jun 1 01:47:11 2024 From: fjiang at openjdk.org (Feilong Jiang) Date: Sat, 1 Jun 2024 01:47:11 GMT Subject: RFR: 8333006: RISC-V: C2: Support vector-scalar and vector-immediate arithmetic instructions [v2] In-Reply-To: <-0PMAvAivEpLgl0qFHHn21m3MrU7Xa7QmO6g2qHgfRQ=.120a7ee2-5525-4bfb-887b-da860feca3d0@github.com> References: <-0PMAvAivEpLgl0qFHHn21m3MrU7Xa7QmO6g2qHgfRQ=.120a7ee2-5525-4bfb-887b-da860feca3d0@github.com> Message-ID: On Tue, 28 May 2024 10:49:26 GMT, Gui Cao wrote: >> Hi, We want to support vector-scalar and vector-immediate arithmetic instructions, It was implemented by referring to RVV v1.0 [1]. please take a look and have some reviews. Thanks a lot. >> We can use the Byte256VectorTests.java[2] to print the Opto JIT Code, verify and observe the generation of nodes. >> >> For example, we can use the following command to print the Opto JIT Code of a jtreg test case: >> >> >> /home/zifeihan/jtreg/bin/jtreg \ >> -v:default \ >> -concurrency:16 -timeout:50 \ >> -javaoption:-XX:+UnlockExperimentalVMOptions \ >> -javaoption:-XX:+UseRVV \ >> -javaoption:-XX:+PrintOptoAssembly \ >> -javaoption:-XX:LogFile=/home/zifeihan/jdk/Byte256VectorTests_PrintOptoAssembly.log \ >> -jdk:/home/zifeihan/jdk/build/linux-riscv64-server-fastdebug/jdk \ >> /home/zifeihan/jdk/test/jdk/jdk/incubator/vector/Byte256VectorTests.java >> >> >> >> we can observe the specified compilation log `Byte256VectorTests_PrintOptoAssembly.log`, which contains the vector-scalar and vector-immediate arithmetic instructions for the PR implementation. 
>> >> vadd_immI Node >> >> 16c addw R11, R10, zr #@convI2L_reg_reg >> 170 add R9, R31, R11 # ptr, #@addP_reg_reg >> 174 addi R9, R9, #16 # ptr, #@addP_reg_imm >> 176 loadV V1, [R9] # vector (rvv) >> 17e vadd_immI V1, V1, #7 >> 186 add R11, R15, R11 # ptr, #@addP_reg_reg >> 188 addi R11, R11, #16 # ptr, #@addP_reg_imm >> 18a storeV [R11], V1 # vector (rvv) >> >> >> vadd_immI_masked Node >> >> 1e8 B31: # out( B37 B32 ) <- in( B30 ) Freq: 76.2281 >> 1e8 loadV V2, [R31] # vector (rvv) >> 1f0 vloadmask V0, V1 >> 1f8 vadd_immI_masked V2, V2, #7 >> 200 addi R31, R10, #48 # ptr, #@addP_reg_imm >> 204 bgeu R30, R7, B37 #@cmpU_branch P=0.000001 C=-1.000000 >> >> >> vadd_regI Node >> >> 0c4 B4: # out( B9 B5 ) <- in( B8 B3 ) Freq: 1 >> 0c4 vloadcon V1 # generate iota indices >> 0cc spill [sp, #4] -> R30 # spill size = 32 >> 0ce vmul_regI V1, V1, R30 >> 0d6 spill [sp, #0] -> R29 # spill size = 32 >> 0d8 vadd_regI V1, V1, R29 >> >> >> vadd_regI_masked Node >> >> 244 B36: # out( B33 B37 ) <- in( B35 ) Freq: 7427.81 >> 244 # castII of R30, #@castII >> 244 addw R31, R30, zr #@convI2L_reg_reg >> 248 spill [sp, #32] -> R10 # spill size = 64 >> 24a add R10, R10, R31 # ptr, #@addP_reg_reg >> 24c addi R10, R10, #16 # ptr, #@addP_reg_imm >> 24e loadV V2, [R10] # vector (rvv) >> 256 vloadmask V0, V1 >> 25e vadd_regI_masked V2, V2, R29 >> >> >> ... > > Gui Cao has updated the pull request incrementally with one additional commit since the last revision: > > Code Format Looks good, with some minor comments. src/hotspot/cpu/riscv/riscv_v.ad line 411: > 409: format %{ "vadd_immL $dst, $src1, $con" %} > 410: ins_encode %{ > 411: BasicType bt = Matcher::vector_element_basic_type(this); Do we need to call `Matcher::vector_element_basic_type()` as we already know the `BasicType` is `T_LONG`? 
src/hotspot/cpu/riscv/riscv_v.ad line 441: > 439: format %{ "vadd_regL $dst, $src1, $src2" %} > 440: ins_encode %{ > 441: BasicType bt = Matcher::vector_element_basic_type(this); Ditto src/hotspot/cpu/riscv/riscv_v.ad line 471: > 469: format %{ "vadd_immL_masked $dst_src, $dst_src, $con" %} > 470: ins_encode %{ > 471: BasicType bt = Matcher::vector_element_basic_type(this); ditto src/hotspot/cpu/riscv/riscv_v.ad line 501: > 499: format %{ "vadd_regL_masked $dst_src, $dst_src, $src2" %} > 500: ins_encode %{ > 501: BasicType bt = Matcher::vector_element_basic_type(this); ditto src/hotspot/cpu/riscv/riscv_v.ad line 595: > 593: format %{ "vsub_regL $dst, $src1, $src2" %} > 594: ins_encode %{ > 595: BasicType bt = Matcher::vector_element_basic_type(this); ditto src/hotspot/cpu/riscv/riscv_v.ad line 625: > 623: format %{ "vsub_regL_masked $dst_src, $dst_src, $src2" %} > 624: ins_encode %{ > 625: BasicType bt = Matcher::vector_element_basic_type(this); ditto src/hotspot/cpu/riscv/riscv_v.ad line 1671: > 1669: format %{ "vmul_regL $dst, $src1, $src2" %} > 1670: ins_encode %{ > 1671: BasicType bt = Matcher::vector_element_basic_type(this); ditto src/hotspot/cpu/riscv/riscv_v.ad line 1701: > 1699: format %{ "vmul_regL_masked $dst_src, $dst_src, $src2" %} > 1700: ins_encode %{ > 1701: BasicType bt = Matcher::vector_element_basic_type(this); ditto ------------- Marked as reviewed by fjiang (Committer). 
PR Review: https://git.openjdk.org/jdk/pull/19415#pullrequestreview-2091915130 PR Review Comment: https://git.openjdk.org/jdk/pull/19415#discussion_r1623084296 PR Review Comment: https://git.openjdk.org/jdk/pull/19415#discussion_r1623084345 PR Review Comment: https://git.openjdk.org/jdk/pull/19415#discussion_r1623084474 PR Review Comment: https://git.openjdk.org/jdk/pull/19415#discussion_r1623084510 PR Review Comment: https://git.openjdk.org/jdk/pull/19415#discussion_r1623084552 PR Review Comment: https://git.openjdk.org/jdk/pull/19415#discussion_r1623084611 PR Review Comment: https://git.openjdk.org/jdk/pull/19415#discussion_r1623084640 PR Review Comment: https://git.openjdk.org/jdk/pull/19415#discussion_r1623084715 From gcao at openjdk.org Sat Jun 1 02:30:25 2024 From: gcao at openjdk.org (Gui Cao) Date: Sat, 1 Jun 2024 02:30:25 GMT Subject: RFR: 8333006: RISC-V: C2: Support vector-scalar and vector-immediate arithmetic instructions [v3] In-Reply-To: References: Message-ID: <5e2XTs2WT-M_WA7OmJUlrpGDhnOojhleFrUanNa5t-0=.73a0a929-27fd-4f53-8e7d-2c86897866e8@github.com> > Hi, We want to support vector-scalar and vector-immediate arithmetic instructions, It was implemented by referring to RVV v1.0 [1]. please take a look and have some reviews. Thanks a lot. > We can use the Byte256VectorTests.java[2] to print the Opto JIT Code, verify and observe the generation of nodes. 
> > For example, we can use the following command to print the Opto JIT Code of a jtreg test case: > > > /home/zifeihan/jtreg/bin/jtreg \ > -v:default \ > -concurrency:16 -timeout:50 \ > -javaoption:-XX:+UnlockExperimentalVMOptions \ > -javaoption:-XX:+UseRVV \ > -javaoption:-XX:+PrintOptoAssembly \ > -javaoption:-XX:LogFile=/home/zifeihan/jdk/Byte256VectorTests_PrintOptoAssembly.log \ > -jdk:/home/zifeihan/jdk/build/linux-riscv64-server-fastdebug/jdk \ > /home/zifeihan/jdk/test/jdk/jdk/incubator/vector/Byte256VectorTests.java > > > > we can observe the specified compilation log `Byte256VectorTests_PrintOptoAssembly.log`, which contains the vector-scalar and vector-immediate arithmetic instructions for the PR implementation. > > vadd_immI Node > > 16c addw R11, R10, zr #@convI2L_reg_reg > 170 add R9, R31, R11 # ptr, #@addP_reg_reg > 174 addi R9, R9, #16 # ptr, #@addP_reg_imm > 176 loadV V1, [R9] # vector (rvv) > 17e vadd_immI V1, V1, #7 > 186 add R11, R15, R11 # ptr, #@addP_reg_reg > 188 addi R11, R11, #16 # ptr, #@addP_reg_imm > 18a storeV [R11], V1 # vector (rvv) > > > vadd_immI_masked Node > > 1e8 B31: # out( B37 B32 ) <- in( B30 ) Freq: 76.2281 > 1e8 loadV V2, [R31] # vector (rvv) > 1f0 vloadmask V0, V1 > 1f8 vadd_immI_masked V2, V2, #7 > 200 addi R31, R10, #48 # ptr, #@addP_reg_imm > 204 bgeu R30, R7, B37 #@cmpU_branch P=0.000001 C=-1.000000 > > > vadd_regI Node > > 0c4 B4: # out( B9 B5 ) <- in( B8 B3 ) Freq: 1 > 0c4 vloadcon V1 # generate iota indices > 0cc spill [sp, #4] -> R30 # spill size = 32 > 0ce vmul_regI V1, V1, R30 > 0d6 spill [sp, #0] -> R29 # spill size = 32 > 0d8 vadd_regI V1, V1, R29 > > > vadd_regI_masked Node > > 244 B36: # out( B33 B37 ) <- in( B35 ) Freq: 7427.81 > 244 # castII of R30, #@castII > 244 addw R31, R30, zr #@convI2L_reg_reg > 248 spill [sp, #32] -> R10 # spill size = 64 > 24a add R10, R10, R31 # ptr, #@addP_reg_reg > 24c addi R10, R10, #16 # ptr, #@addP_reg_imm > 24e loadV V2, [R10] # vector (rvv) > 256 vloadmask V0, V1 > 25e 
vadd_regI_masked V2, V2, R29 > > > vsub_regI Node > > 112 B20: # out( B63 B21 ) <- in( B19 ) Freq: 77.0107 > 112 # castII of R20, #@castII > 112 addw R11, R2... Gui Cao has updated the pull request incrementally with one additional commit since the last revision: Simplify Code ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19415/files - new: https://git.openjdk.org/jdk/pull/19415/files/ac335baa..a2785f38 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19415&range=02 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19415&range=01-02 Stats: 16 lines in 1 file changed: 0 ins; 8 del; 8 mod Patch: https://git.openjdk.org/jdk/pull/19415.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19415/head:pull/19415 PR: https://git.openjdk.org/jdk/pull/19415 From fjiang at openjdk.org Sat Jun 1 02:49:00 2024 From: fjiang at openjdk.org (Feilong Jiang) Date: Sat, 1 Jun 2024 02:49:00 GMT Subject: RFR: 8333006: RISC-V: C2: Support vector-scalar and vector-immediate arithmetic instructions [v3] In-Reply-To: <5e2XTs2WT-M_WA7OmJUlrpGDhnOojhleFrUanNa5t-0=.73a0a929-27fd-4f53-8e7d-2c86897866e8@github.com> References: <5e2XTs2WT-M_WA7OmJUlrpGDhnOojhleFrUanNa5t-0=.73a0a929-27fd-4f53-8e7d-2c86897866e8@github.com> Message-ID: On Sat, 1 Jun 2024 02:30:25 GMT, Gui Cao wrote: >> Hi, We want to support vector-scalar and vector-immediate arithmetic instructions, It was implemented by referring to RVV v1.0 [1]. please take a look and have some reviews. Thanks a lot. >> We can use the Byte256VectorTests.java[2] to print the Opto JIT Code, verify and observe the generation of nodes. 
>> >> For example, we can use the following command to print the Opto JIT Code of a jtreg test case: >> >> >> /home/zifeihan/jtreg/bin/jtreg \ >> -v:default \ >> -concurrency:16 -timeout:50 \ >> -javaoption:-XX:+UnlockExperimentalVMOptions \ >> -javaoption:-XX:+UseRVV \ >> -javaoption:-XX:+PrintOptoAssembly \ >> -javaoption:-XX:LogFile=/home/zifeihan/jdk/Byte256VectorTests_PrintOptoAssembly.log \ >> -jdk:/home/zifeihan/jdk/build/linux-riscv64-server-fastdebug/jdk \ >> /home/zifeihan/jdk/test/jdk/jdk/incubator/vector/Byte256VectorTests.java >> >> >> >> we can observe the specified compilation log `Byte256VectorTests_PrintOptoAssembly.log`, which contains the vector-scalar and vector-immediate arithmetic instructions for the PR implementation. >> >> vadd_immI Node >> >> 16c addw R11, R10, zr #@convI2L_reg_reg >> 170 add R9, R31, R11 # ptr, #@addP_reg_reg >> 174 addi R9, R9, #16 # ptr, #@addP_reg_imm >> 176 loadV V1, [R9] # vector (rvv) >> 17e vadd_immI V1, V1, #7 >> 186 add R11, R15, R11 # ptr, #@addP_reg_reg >> 188 addi R11, R11, #16 # ptr, #@addP_reg_imm >> 18a storeV [R11], V1 # vector (rvv) >> >> >> vadd_immI_masked Node >> >> 1e8 B31: # out( B37 B32 ) <- in( B30 ) Freq: 76.2281 >> 1e8 loadV V2, [R31] # vector (rvv) >> 1f0 vloadmask V0, V1 >> 1f8 vadd_immI_masked V2, V2, #7 >> 200 addi R31, R10, #48 # ptr, #@addP_reg_imm >> 204 bgeu R30, R7, B37 #@cmpU_branch P=0.000001 C=-1.000000 >> >> >> vadd_regI Node >> >> 0c4 B4: # out( B9 B5 ) <- in( B8 B3 ) Freq: 1 >> 0c4 vloadcon V1 # generate iota indices >> 0cc spill [sp, #4] -> R30 # spill size = 32 >> 0ce vmul_regI V1, V1, R30 >> 0d6 spill [sp, #0] -> R29 # spill size = 32 >> 0d8 vadd_regI V1, V1, R29 >> >> >> vadd_regI_masked Node >> >> 244 B36: # out( B33 B37 ) <- in( B35 ) Freq: 7427.81 >> 244 # castII of R30, #@castII >> 244 addw R31, R30, zr #@convI2L_reg_reg >> 248 spill [sp, #32] -> R10 # spill size = 64 >> 24a add R10, R10, R31 # ptr, #@addP_reg_reg >> 24c addi R10, R10, #16 # ptr, #@addP_reg_imm >> 24e loadV 
V2, [R10] # vector (rvv) >> 256 vloadmask V0, V1 >> 25e vadd_regI_masked V2, V2, R29 >> >> >> ... > > Gui Cao has updated the pull request incrementally with one additional commit since the last revision: > > Simplify Code Marked as reviewed by fjiang (Committer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19415#pullrequestreview-2091931310 From gcao at openjdk.org Sat Jun 1 02:58:01 2024 From: gcao at openjdk.org (Gui Cao) Date: Sat, 1 Jun 2024 02:58:01 GMT Subject: RFR: 8333006: RISC-V: C2: Support vector-scalar and vector-immediate arithmetic instructions [v2] In-Reply-To: References: <-0PMAvAivEpLgl0qFHHn21m3MrU7Xa7QmO6g2qHgfRQ=.120a7ee2-5525-4bfb-887b-da860feca3d0@github.com> Message-ID: On Sat, 1 Jun 2024 01:44:15 GMT, Feilong Jiang wrote: > Looks good, with some minor comments. Thanks for your review. Fixed. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19415#issuecomment-2143242939 From stuefe at openjdk.org Sat Jun 1 06:04:03 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Sat, 1 Jun 2024 06:04:03 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v6] In-Reply-To: References: Message-ID: On Fri, 31 May 2024 16:59:02 GMT, Andrew Haley wrote: > > Just a code-style review. > > Question: could there be some sort of regression test for this, with different examples and edge cases? > > I have no idea, really. assert_different_registers is used all over the place, and I'm going for bootcycle and tier1. You could write a death test gtest. 
Like this: TEST_VM_ASSERT_MSG(AssemblerAArch64, assert_different_regs, ".*Multiple uses of register: c_rarg0.*") { Register reg1 = r0; Register reg2 = r0; assert_different_registers(reg1, reg2); } ------------- PR Comment: https://git.openjdk.org/jdk/pull/16617#issuecomment-2143312464 From mbaesken at openjdk.org Sat Jun 1 08:10:07 2024 From: mbaesken at openjdk.org (Matthias Baesken) Date: Sat, 1 Jun 2024 08:10:07 GMT Subject: RFR: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset 18446744073709551614 to null pointer [v3] In-Reply-To: References: Message-ID: <6Q2TKxyvLCqgYEk59Q0pzXw5H3jjRb8uDfX1XmtC1W4=.a30ed48d-9c1a-4039-9936-bc852fd370ed@github.com> On Fri, 31 May 2024 11:37:55 GMT, Matthias Baesken wrote: >> I guess Matthias only wanted to fix UB in hotspot ASAP and doesn't have the bandwidth to change the design everywhere. Sounds like you guys already have an alternative solution which already works. Maybe you would like to put it into a PR and we continue the discussion there? >> Nevertheless, having `sub / add_to_ptr_maybe_null` available in hotspot may be a good thing. There are some places where we really use additions with nullptr (e.g. `index_oop_from_field_offset_long` in unsafe.cpp). > >> I guess Matthias only wanted to fix UB in hotspot ASAP and doesn't have the bandwidth to change the design everywhere. > > Yes. > The first goal is to make the '--enable-ubsan' configure flag useful; currently we have the configure flag but still fail already in the OpenJDK build (because of a number of ubsan related issues in HS). > @MBaesken can close this PR and re-assign this bug to me if he doesn't have time to make the proposed changes to the code. Sounds good to me. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19424#issuecomment-2143352841 From mbaesken at openjdk.org Sat Jun 1 08:10:07 2024 From: mbaesken at openjdk.org (Matthias Baesken) Date: Sat, 1 Jun 2024 08:10:07 GMT Subject: Withdrawn: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset 18446744073709551614 to null pointer In-Reply-To: References: Message-ID: On Tue, 28 May 2024 12:36:40 GMT, Matthias Baesken wrote: > When running on macOS with ubsan enabled, we see some issues in relocInfo (hpp and cpp); those already occur in the build quite early. > > /jdk/src/hotspot/share/code/relocInfo.cpp:155:30: runtime error: applying non-zero offset 18446744073709551614 to null pointer > > Similar happens when we add to the _current pointer > _current++; > this gives : > relocInfo.hpp:606:13: runtime error: applying non-zero offset to non-null pointer 0xfffffffffffffffe produced null pointer > > Seems the pointer subtraction/addition worked so far, so it might be an option to disable ubsan for those 2 functions. This pull request has been closed without being integrated. ------------- PR: https://git.openjdk.org/jdk/pull/19424 From mdoerr at openjdk.org Sat Jun 1 08:41:07 2024 From: mdoerr at openjdk.org (Martin Doerr) Date: Sat, 1 Jun 2024 08:41:07 GMT Subject: RFR: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset 18446744073709551614 to null pointer [v3] In-Reply-To: References: Message-ID: On Fri, 31 May 2024 08:04:27 GMT, Matthias Baesken wrote: >> When running on macOS with ubsan enabled, we see some issues in relocInfo (hpp and cpp); those already occur in the build quite early. 
>> >> /jdk/src/hotspot/share/code/relocInfo.cpp:155:30: runtime error: applying non-zero offset 18446744073709551614 to null pointer >> >> Similar happens when we add to the _current pointer >> _current++; >> this gives : >> relocInfo.hpp:606:13: runtime error: applying non-zero offset to non-null pointer 0xfffffffffffffffe produced null pointer >> >> Seems the pointer subtraction/addition worked so far, so it might be an option to disable ubsan for those 2 functions. > > Matthias Baesken has updated the pull request incrementally with one additional commit since the last revision: > > rename templates Thank you, Vladimir! I appreciate it. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19424#issuecomment-2143363054 From duke at openjdk.org Sat Jun 1 16:44:01 2024 From: duke at openjdk.org (Abdelhak Zaaim) Date: Sat, 1 Jun 2024 16:44:01 GMT Subject: RFR: 8333366: C2: CmpU3Nodes are not pushed back to worklist in PhaseCCP leading to non-fixpoint assertion failure In-Reply-To: References: Message-ID: On Fri, 31 May 2024 17:34:41 GMT, Christian Hagedorn wrote: > The current code to push uses back to the worklist during CCP handles `CmpU` nodes but misses `CmpU3` nodes. This leads to an assertion failure that we have not reached a fixpoint. > > The fix is straight forward to add a case for `CmpU3` at the case where we already handle `CmpU` nodes such that they can be added back to the worklist like `CmpU` nodes during CCP. > > This was found during the analysis of [JDK-8332920](https://bugs.openjdk.org/browse/JDK-8332920) by trying to simplify the regression test (thanks to @TobiHartmann!). To properly add regression tests for JDK-8332920 and avoid hitting this bug here with some flag combination, we should fix this first. I will soon propose a PR for JDK-8332920 as well. > > Thanks, > Christian Marked as reviewed by abdelhak-zaaim at github.com (no known OpenJDK username). 
------------- PR Review: https://git.openjdk.org/jdk/pull/19504#pullrequestreview-2092139270 From duke at openjdk.org Sat Jun 1 19:41:00 2024 From: duke at openjdk.org (Abdelhak Zaaim) Date: Sat, 1 Jun 2024 19:41:00 GMT Subject: RFR: 8333276: RISC-V: client VM build failure after JDK-8241503 In-Reply-To: References: Message-ID: On Thu, 30 May 2024 14:05:42 GMT, Gui Cao wrote: > Hi, please review this patch that fix the client VM build failed for riscv. > > Error log for client VM build to see: [JDK-8333276](https://bugs.openjdk.org/browse/JDK-8333276) > > The root cause is that `src/hotspot/share/code/compiledIC.hpp` include `"opto/c2_MacroAssembler.hpp"`, after that `opto/c2_MacroAssembler.hpp` include `c2_MacroAssembler_riscv.hpp`. > > The fix is that we extracted the `spill_vmask, unspill_vmask` function definitions into `c2_MacroAssembler_riscv.cpp`. `c2_MacroAssembler_riscv.cpp` will only compile if the `COMPILER2` macro is present. > ### Testing > - [x] linux-riscv client VM fastdebug native build Marked as reviewed by abdelhak-zaaim at github.com (no known OpenJDK username). ------------- PR Review: https://git.openjdk.org/jdk/pull/19481#pullrequestreview-2092165282 From jbhateja at openjdk.org Sun Jun 2 15:43:39 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Sun, 2 Jun 2024 15:43:39 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v2] In-Reply-To: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: > Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. 
> > This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. > > Kindly review and share your feedback. > > Best Regards, > Jatin > > PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) > > [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 > [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: Review Comments Incorporated. ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19442/files - new: https://git.openjdk.org/jdk/pull/19442/files/12cc2b42..102b78ae Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19442&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19442&range=00-01 Stats: 9 lines in 3 files changed: 1 ins; 0 del; 8 mod Patch: https://git.openjdk.org/jdk/pull/19442.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19442/head:pull/19442 PR: https://git.openjdk.org/jdk/pull/19442 From gcao at openjdk.org Mon Jun 3 01:47:04 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 3 Jun 2024 01:47:04 GMT Subject: RFR: 8333276: RISC-V: client VM build failure after JDK-8241503 In-Reply-To: References: Message-ID: <5MLgAFchYuIIDBCSFPLsFC_jG5G5gnF4f3GiGg2w2KM=.86eb9c6d-bb59-4761-851a-412967302a68@github.com> On Thu, 30 May 2024 14:05:42 GMT, Gui Cao wrote: > Hi, please review this patch that fix the client VM build failed for riscv. 
> > Error log for client VM build to see: [JDK-8333276](https://bugs.openjdk.org/browse/JDK-8333276) > > The root cause is that `src/hotspot/share/code/compiledIC.hpp` include `"opto/c2_MacroAssembler.hpp"`, after that `opto/c2_MacroAssembler.hpp` include `c2_MacroAssembler_riscv.hpp`. > > The fix is that we extracted the `spill_vmask, unspill_vmask` function definitions into `c2_MacroAssembler_riscv.cpp`. `c2_MacroAssembler_riscv.cpp` will only compile if the `COMPILER2` macro is present. > ### Testing > - [x] linux-riscv client VM fastdebug native build Thanks all for the review. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19481#issuecomment-2144131913 From gcao at openjdk.org Mon Jun 3 01:47:04 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 3 Jun 2024 01:47:04 GMT Subject: Integrated: 8333276: RISC-V: client VM build failure after JDK-8241503 In-Reply-To: References: Message-ID: On Thu, 30 May 2024 14:05:42 GMT, Gui Cao wrote: > Hi, please review this patch that fix the client VM build failed for riscv. > > Error log for client VM build to see: [JDK-8333276](https://bugs.openjdk.org/browse/JDK-8333276) > > The root cause is that `src/hotspot/share/code/compiledIC.hpp` include `"opto/c2_MacroAssembler.hpp"`, after that `opto/c2_MacroAssembler.hpp` include `c2_MacroAssembler_riscv.hpp`. > > The fix is that we extracted the `spill_vmask, unspill_vmask` function definitions into `c2_MacroAssembler_riscv.cpp`. `c2_MacroAssembler_riscv.cpp` will only compile if the `COMPILER2` macro is present. > ### Testing > - [x] linux-riscv client VM fastdebug native build This pull request has now been integrated. 
Changeset: cfe91ed3 Author: Gui Cao Committer: Fei Yang URL: https://git.openjdk.org/jdk/commit/cfe91ed39c9a0c8e8b16e142ee8cf3a90a6c69c3 Stats: 28 lines in 2 files changed: 15 ins; 11 del; 2 mod 8333276: RISC-V: client VM build failure after JDK-8241503 Reviewed-by: fyang ------------- PR: https://git.openjdk.org/jdk/pull/19481 From gcao at openjdk.org Mon Jun 3 01:48:10 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 3 Jun 2024 01:48:10 GMT Subject: RFR: 8333006: RISC-V: C2: Support vector-scalar and vector-immediate arithmetic instructions [v2] In-Reply-To: <1UGjH0ulqBYXt8Jeq7CCGUdQ7-mxZSBIRzk5MY8Wyus=.17aee7cc-bc08-4cfc-8272-6c0f10249e5c@github.com> References: <-0PMAvAivEpLgl0qFHHn21m3MrU7Xa7QmO6g2qHgfRQ=.120a7ee2-5525-4bfb-887b-da860feca3d0@github.com> <1UGjH0ulqBYXt8Jeq7CCGUdQ7-mxZSBIRzk5MY8Wyus=.17aee7cc-bc08-4cfc-8272-6c0f10249e5c@github.com> Message-ID: On Wed, 29 May 2024 02:24:35 GMT, Fei Yang wrote: >> Gui Cao has updated the pull request incrementally with one additional commit since the last revision: >> >> Code Format > > Updated change looks good. Thanks. @RealFYang @feilongjiang : Thanks all for the review. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19415#issuecomment-2144133883 From gcao at openjdk.org Mon Jun 3 01:51:05 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 3 Jun 2024 01:51:05 GMT Subject: Integrated: 8333006: RISC-V: C2: Support vector-scalar and vector-immediate arithmetic instructions In-Reply-To: References: Message-ID: On Mon, 27 May 2024 16:33:30 GMT, Gui Cao wrote: > Hi, We want to support vector-scalar and vector-immediate arithmetic instructions, It was implemented by referring to RVV v1.0 [1]. please take a look and have some reviews. Thanks a lot. > We can use the Byte256VectorTests.java[2] to print the Opto JIT Code, verify and observe the generation of nodes. 
> > For example, we can use the following command to print the Opto JIT Code of a jtreg test case: > > > /home/zifeihan/jtreg/bin/jtreg \ > -v:default \ > -concurrency:16 -timeout:50 \ > -javaoption:-XX:+UnlockExperimentalVMOptions \ > -javaoption:-XX:+UseRVV \ > -javaoption:-XX:+PrintOptoAssembly \ > -javaoption:-XX:LogFile=/home/zifeihan/jdk/Byte256VectorTests_PrintOptoAssembly.log \ > -jdk:/home/zifeihan/jdk/build/linux-riscv64-server-fastdebug/jdk \ > /home/zifeihan/jdk/test/jdk/jdk/incubator/vector/Byte256VectorTests.java > > > > we can observe the specified compilation log `Byte256VectorTests_PrintOptoAssembly.log`, which contains the vector-scalar and vector-immediate arithmetic instructions for the PR implementation. > > vadd_immI Node > > 16c addw R11, R10, zr #@convI2L_reg_reg > 170 add R9, R31, R11 # ptr, #@addP_reg_reg > 174 addi R9, R9, #16 # ptr, #@addP_reg_imm > 176 loadV V1, [R9] # vector (rvv) > 17e vadd_immI V1, V1, #7 > 186 add R11, R15, R11 # ptr, #@addP_reg_reg > 188 addi R11, R11, #16 # ptr, #@addP_reg_imm > 18a storeV [R11], V1 # vector (rvv) > > > vadd_immI_masked Node > > 1e8 B31: # out( B37 B32 ) <- in( B30 ) Freq: 76.2281 > 1e8 loadV V2, [R31] # vector (rvv) > 1f0 vloadmask V0, V1 > 1f8 vadd_immI_masked V2, V2, #7 > 200 addi R31, R10, #48 # ptr, #@addP_reg_imm > 204 bgeu R30, R7, B37 #@cmpU_branch P=0.000001 C=-1.000000 > > > vadd_regI Node > > 0c4 B4: # out( B9 B5 ) <- in( B8 B3 ) Freq: 1 > 0c4 vloadcon V1 # generate iota indices > 0cc spill [sp, #4] -> R30 # spill size = 32 > 0ce vmul_regI V1, V1, R30 > 0d6 spill [sp, #0] -> R29 # spill size = 32 > 0d8 vadd_regI V1, V1, R29 > > > vadd_regI_masked Node > > 244 B36: # out( B33 B37 ) <- in( B35 ) Freq: 7427.81 > 244 # castII of R30, #@castII > 244 addw R31, R30, zr #@convI2L_reg_reg > 248 spill [sp, #32] -> R10 # spill size = 64 > 24a add R10, R10, R31 # ptr, #@addP_reg_reg > 24c addi R10, R10, #16 # ptr, #@addP_reg_imm > 24e loadV V2, [R10] # vector (rvv) > 256 vloadmask V0, V1 > 25e 
vadd_regI_masked V2, V2, R29 > > > vsub_regI Node > > 112 B20: # out( B63 B21 ) <- in( B19 ) Freq: 77.0107 > 112 # castII of R20, #@castII > 112 addw R11, R2... This pull request has now been integrated. Changeset: a4c7be86 Author: Gui Cao Committer: Fei Yang URL: https://git.openjdk.org/jdk/commit/a4c7be862cc6dc121efb6c1c283236a588259c8f Stats: 245 lines in 2 files changed: 244 ins; 0 del; 1 mod 8333006: RISC-V: C2: Support vector-scalar and vector-immediate arithmetic instructions Reviewed-by: fyang, fjiang ------------- PR: https://git.openjdk.org/jdk/pull/19415 From gcao at openjdk.org Mon Jun 3 02:13:14 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 3 Jun 2024 02:13:14 GMT Subject: RFR: 8333154: RISC-V: Add support for primitive array C1 clone intrinsic [v2] In-Reply-To: References: Message-ID: <3GdY_V0rXEDpKxDNeG8iqiSr7pwhao0o3izt3U_rFMY=.1cb3e409-e55b-4d0f-8746-b941594e5952@github.com> > Implementation of primitive array C1 clone intrinsic (https://bugs.openjdk.org/browse/JDK-8333154) for linux-riscv64. > > ### Correctness testing: > - [x] Run make test TEST="hotspot_compiler" JTREG="JAVA_OPTIONS=-XX:TieredStopAtLevel=1" (fastdebug) > - [x] Run tier1-3 tests on SOPHON SG2042 (release) > > ### Performance testing: > Without Patch: > > make test TEST="micro:java.lang.ArrayClone" MICRO="JAVA_OPTIONS=-XX:TieredStopAtLevel=1" > > Benchmark (size) Mode Cnt Score Error Units > ArrayClone.byteArraycopy 0 avgt 15 90.089 ± 7.122 ns/op > ArrayClone.byteArraycopy 10 avgt 15 146.000 ± 11.761 ns/op > ArrayClone.byteArraycopy 100 avgt 15 289.382 ± 23.903 ns/op > ArrayClone.byteArraycopy 1000 avgt 15 767.864 ± 56.721 ns/op > ArrayClone.byteClone 0 avgt 15 735.692 ± 26.641 ns/op > ArrayClone.byteClone 10 avgt 15 810.810 ± 34.563 ns/op > ArrayClone.byteClone 100 avgt 15 1055.917 ± 93.574 ns/op > ArrayClone.byteClone 1000 avgt 15 1564.465 ± 140.941 ns/op > ArrayClone.intArraycopy 0 avgt 15 93.732 ± 8.468 ns/op > ArrayClone.intArraycopy 10 avgt 15 214.168 ± 
34.526 ns/op > ArrayClone.intArraycopy 100 avgt 15 613.363 ± 45.415 ns/op > ArrayClone.intArraycopy 1000 avgt 15 1759.611 ± 59.010 ns/op > ArrayClone.intClone 0 avgt 15 680.100 ± 24.375 ns/op > ArrayClone.intClone 10 avgt 15 835.979 ± 75.154 ns/op > ArrayClone.intClone 100 avgt 15 1337.354 ± 86.182 ns/op > ArrayClone.intClone 1000 avgt 15 2696.280 ± 207.418 ns/op > Finished running test 'micro:java.lang.ArrayClone' > > > With Patch: > > make test TEST="micro:java.lang.ArrayClone" MICRO="JAVA_OPTIONS=-XX:TieredStopAtLevel=1" > > Benchmark (size) Mode Cnt Score Error Units > ArrayClone.byteArraycopy 0 avgt 15 89.410 ± 5.112 ns/op > ArrayClone.byteArraycopy 10 avgt 15 141.125 ± 8.711 ns/op > ArrayClone.byteArraycopy 100 avgt 15 277.098 ± 12.925 ns/op > ArrayClone.byteArraycopy 1000 avgt 15 770.188 ± 83.034 ns/op > ArrayClone.byteClone 0 avgt 15 94.367 ± 7.088 ns/op > ArrayClone.byteClone 10 avgt 15 151.804 ± 16.497 ns/op > ArrayClone.byteClone 100 avgt 15 296.284 ± 17.893 ns/op > ArrayClone.byteClone 1000 avgt 15 790.517 ± 28.765 ns/op > ArrayClone.intArraycopy 0 avgt 15 93.688 ± 7.050 ns... Gui Cao has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. 
The pull request contains two additional commits since the last revision: - Merge remote-tracking branch 'upstream/master' into JDK-8333154 - 8333154: RISC-V: Add support for primitive array C1 clone intrinsic ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19448/files - new: https://git.openjdk.org/jdk/pull/19448/files/714ac46f..6cc793f2 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19448&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19448&range=00-01 Stats: 13751 lines in 381 files changed: 8275 ins; 3832 del; 1644 mod Patch: https://git.openjdk.org/jdk/pull/19448.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19448/head:pull/19448 PR: https://git.openjdk.org/jdk/pull/19448 From gcao at openjdk.org Mon Jun 3 04:45:14 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 3 Jun 2024 04:45:14 GMT Subject: RFR: 8333154: RISC-V: Add support for primitive array C1 clone intrinsic [v2] In-Reply-To: References: Message-ID: On Thu, 30 May 2024 12:13:48 GMT, Fei Yang wrote: >> Gui Cao has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains two additional commits since the last revision: >> >> - Merge remote-tracking branch 'upstream/master' into JDK-8333154 >> - 8333154: RISC-V: Add support for primitive array C1 clone intrinsic > > LGTM. Thanks. @RealFYang : Thanks for the review. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19448#issuecomment-2144266821 From gcao at openjdk.org Mon Jun 3 04:45:14 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 3 Jun 2024 04:45:14 GMT Subject: Integrated: 8333154: RISC-V: Add support for primitive array C1 clone intrinsic In-Reply-To: References: Message-ID: <-p6dPxD3KUWcppKsTUB7j1pvkGl6K0tTuAprAhrrR2E=.7c0c6313-d03e-457e-beb4-2330ec529e87@github.com> On Wed, 29 May 2024 08:23:39 GMT, Gui Cao wrote: > Implementation of primitive array C1 clone intrinsic (https://bugs.openjdk.org/browse/JDK-8333154) for linux-riscv64. > > ### Correctness testing: > - [x] Run make test TEST="hotspot_compiler" JTREG="JAVA_OPTIONS=-XX:TieredStopAtLevel=1" (fastdebug) > - [x] Run tier1-3 tests on SOPHON SG2042 (release) > > ### Performance testing: > Without Patch: > > make test TEST="micro:java.lang.ArrayClone" MICRO="JAVA_OPTIONS=-XX:TieredStopAtLevel=1" > > Benchmark (size) Mode Cnt Score Error Units > ArrayClone.byteArraycopy 0 avgt 15 90.089 ± 7.122 ns/op > ArrayClone.byteArraycopy 10 avgt 15 146.000 ± 11.761 ns/op > ArrayClone.byteArraycopy 100 avgt 15 289.382 ± 23.903 ns/op > ArrayClone.byteArraycopy 1000 avgt 15 767.864 ± 56.721 ns/op > ArrayClone.byteClone 0 avgt 15 735.692 ± 26.641 ns/op > ArrayClone.byteClone 10 avgt 15 810.810 ± 34.563 ns/op > ArrayClone.byteClone 100 avgt 15 1055.917 ± 93.574 ns/op > ArrayClone.byteClone 1000 avgt 15 1564.465 ± 140.941 ns/op > ArrayClone.intArraycopy 0 avgt 15 93.732 ± 8.468 ns/op > ArrayClone.intArraycopy 10 avgt 15 214.168 ± 34.526 ns/op > ArrayClone.intArraycopy 100 avgt 15 613.363 ± 45.415 ns/op > ArrayClone.intArraycopy 1000 avgt 15 1759.611 ± 59.010 ns/op > ArrayClone.intClone 0 avgt 15 680.100 ± 24.375 ns/op > ArrayClone.intClone 10 avgt 15 835.979 ± 75.154 ns/op > ArrayClone.intClone 100 avgt 15 1337.354 ± 86.182 ns/op > ArrayClone.intClone 1000 avgt 15 2696.280 ± 
207.418 ns/op > Finished running test 'micro:java.lang.ArrayClone' > > > With Patch: > > make test TEST="micro:java.lang.ArrayClone" MICRO="JAVA_OPTIONS=-XX:TieredStopAtLevel=1" > > Benchmark (size) Mode Cnt Score Error Units > ArrayClone.byteArraycopy 0 avgt 15 89.410 ± 5.112 ns/op > ArrayClone.byteArraycopy 10 avgt 15 141.125 ± 8.711 ns/op > ArrayClone.byteArraycopy 100 avgt 15 277.098 ± 12.925 ns/op > ArrayClone.byteArraycopy 1000 avgt 15 770.188 ± 83.034 ns/op > ArrayClone.byteClone 0 avgt 15 94.367 ± 7.088 ns/op > ArrayClone.byteClone 10 avgt 15 151.804 ± 16.497 ns/op > ArrayClone.byteClone 100 avgt 15 296.284 ± 17.893 ns/op > ArrayClone.byteClone 1000 avgt 15 790.517 ± 28.765 ns/op > ArrayClone.intArraycopy 0 avgt 15 93.688 ± 7.050 ns... This pull request has now been integrated. Changeset: 75220da2 Author: Gui Cao Committer: Fei Yang URL: https://git.openjdk.org/jdk/commit/75220da26f647c6f3dabc05cea81cefaf3a1e195 Stats: 31 lines in 7 files changed: 21 ins; 0 del; 10 mod 8333154: RISC-V: Add support for primitive array C1 clone intrinsic Reviewed-by: fyang ------------- PR: https://git.openjdk.org/jdk/pull/19448 From thartmann at openjdk.org Mon Jun 3 05:34:01 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Mon, 3 Jun 2024 05:34:01 GMT Subject: RFR: 8333366: C2: CmpU3Nodes are not pushed back to worklist in PhaseCCP leading to non-fixpoint assertion failure In-Reply-To: References: Message-ID: On Fri, 31 May 2024 17:34:41 GMT, Christian Hagedorn wrote: > The current code to push uses back to the worklist during CCP handles `CmpU` nodes but misses `CmpU3` nodes. This leads to an assertion failure that we have not reached a fixpoint. > > The fix is straight forward to add a case for `CmpU3` at the case where we already handle `CmpU` nodes such that they can be added back to the worklist like `CmpU` nodes during CCP. 
> > This was found during the analysis of [JDK-8332920](https://bugs.openjdk.org/browse/JDK-8332920) by trying to simplify the regression test (thanks to @TobiHartmann!). To properly add regression tests for JDK-8332920 and avoid hitting this bug here with some flag combination, we should fix this first. I will soon propose a PR for JDK-8332920 as well. > > Thanks, > Christian Looks good to me too. I assume you verified that the test does not trigger JDK-8332920 with some VM flag combination in the CI? ------------- Marked as reviewed by thartmann (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19504#pullrequestreview-2092821848 From varadam at openjdk.org Mon Jun 3 05:36:12 2024 From: varadam at openjdk.org (Varada M) Date: Mon, 3 Jun 2024 05:36:12 GMT Subject: RFR: 8331935: Add support for primitive array C1 clone intrinsic in PPC [v5] In-Reply-To: References: Message-ID: > https://bugs.openjdk.org/browse/JDK-8302850 port for PPC64 > > JMH Benchmark Results > > > Before : > > Benchmark (size) Mode Cnt Score Error Units > ArrayClone.byteArraycopy 0 avgt 15 114.107 ± 1.337 ns/op > ArrayClone.byteArraycopy 10 avgt 15 130.492 ± 0.991 ns/op > ArrayClone.byteArraycopy 100 avgt 15 139.103 ± 1.913 ns/op > ArrayClone.byteArraycopy 1000 avgt 15 321.688 ± 6.033 ns/op > ArrayClone.byteClone 0 avgt 15 227.602 ± 3.393 ns/op > ArrayClone.byteClone 10 avgt 15 237.624 ± 2.996 ns/op > ArrayClone.byteClone 100 avgt 15 239.219 ± 2.835 ns/op > > ArrayClone.byteClone 1000 avgt 15 355.571 ± 2.946 ns/op > ArrayClone.intArraycopy 0 avgt 15 113.275 ± 1.099 ns/op > ArrayClone.intArraycopy 10 avgt 15 129.763 ± 1.458 ns/op > ArrayClone.intArraycopy 100 avgt 15 213.327 ± 2.524 ns/op > ArrayClone.intArraycopy 1000 avgt 15 449.650 ± 7.338 ns/op > ArrayClone.intClone 0 avgt 15 225.682 ± 3.048 ns/op > ArrayClone.intClone 10 avgt 15 234.532 ± 2.817 ns/op > ArrayClone.intClone 100 avgt 15 295.934 ± 4.925 ns/op > ArrayClone.intClone 1000 avgt 15 573.368 ± 
5.739 ns/op > Finished running test 'micro:java.lang.ArrayClone' > Test report is stored in build/aix-ppc64-server-release/test-results/micro_java_lang_ArrayClone > > ============================== > Test summary > ============================== > TEST TOTAL PASS FAIL ERROR > micro:java.lang.ArrayClone 1 1 0 0 > ============================== > TEST SUCCESS > > Finished building target 'test' in configuration 'aix-ppc64-server-release' > > > > > After: > > Benchmark (size) Mode Cnt Score Error Units > ArrayClone.byteArraycopy 0 avgt 15 113.894 ± 0.993 ns/op > ArrayClone.byteArraycopy 10 avgt 15 131.455 ± 0.956 ns/op > ArrayClone.byteArraycopy 100 avgt 15 139.145 ± 3.002 ns/op > ArrayClone.byteArraycopy 1000 avgt 15 315.957 ± 14.591 ns/op > ArrayClone.byteClone 0 avgt 15 43.753 ± 3.669 ns/op > ArrayClone.byteClone 10 avgt 15 52.329 ± 1.041 ns/op > ArrayClone.byteClone 100 avgt 15 127.711 ± 3.938 ns/op > > ArrayClone.byteClone 1000 avgt 15 225.937 ± 1.987 ns/op > ArrayClone.intArraycopy 0 avgt 15 113.788 ± 0.770 ns/op > ArrayClone.intArraycopy 10 avgt 1... Varada M has updated the pull request with a new target base due to a merge or a rebase. 
The pull request now contains six commits: - Merge branch 'master' into arryClone - Merge branch 'master' into arryClone - Add support for primitive array C1 clone intrinsic - Add support for primitive array C1 clone intrinsic - Add support for primitive array C1 clone intrinsic - Add support for primitive array C1 clone intrinsic ------------- Changes: https://git.openjdk.org/jdk/pull/19250/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19250&range=04 Stats: 64 lines in 6 files changed: 27 ins; 3 del; 34 mod Patch: https://git.openjdk.org/jdk/pull/19250.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19250/head:pull/19250 PR: https://git.openjdk.org/jdk/pull/19250 From thartmann at openjdk.org Mon Jun 3 05:41:01 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Mon, 3 Jun 2024 05:41:01 GMT Subject: RFR: 8332959: C2: ZGC fails with 'Incorrect load shift' when invoking Object.clone() reflectively on an array In-Reply-To: References: Message-ID: On Thu, 30 May 2024 16:50:22 GMT, Roberto Castañeda Lozano wrote: > This changeset enforces cloned arrays to be initialized at allocation time when their type is unknown, as expected by ZGC in this scenario (see the [JBS issue](https://bugs.openjdk.org/projects/JDK/issues/JDK-8332959) for further details). Array clones with unknown type may arise from compiling the array-guarded branch of a reflective `Object.clone()` invocation, as illustrated by the included test. > > #### Testing > - tier1-5, stress test (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode). > - tier6-7 (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode; ZGC tests only). Looks good to me. Great that you were able to come up with a test! ------------- Marked as reviewed by thartmann (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19486#pullrequestreview-2092828482 From chagedorn at openjdk.org Mon Jun 3 06:44:10 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Mon, 3 Jun 2024 06:44:10 GMT Subject: RFR: 8333366: C2: CmpU3Nodes are not pushed back to worklist in PhaseCCP leading to non-fixpoint assertion failure In-Reply-To: References: Message-ID: On Fri, 31 May 2024 18:03:42 GMT, Vladimir Kozlov wrote: >> The current code to push uses back to the worklist during CCP handles `CmpU` nodes but misses `CmpU3` nodes. This leads to an assertion failure that we have not reached a fixpoint. >> >> The fix is straight forward to add a case for `CmpU3` at the case where we already handle `CmpU` nodes such that they can be added back to the worklist like `CmpU` nodes during CCP. >> >> This was found during the analysis of [JDK-8332920](https://bugs.openjdk.org/browse/JDK-8332920) by trying to simplify the regression test (thanks to @TobiHartmann!). To properly add regression tests for JDK-8332920 and avoid hitting this bug here with some flag combination, we should fix this first. I will soon propose a PR for JDK-8332920 as well. >> >> Thanks, >> Christian > > Good. Thanks @vnkozlov, @abdelhak-zaaim, and @TobiHartmann for your reviews! > Looks good to me too. I assume you verified that the test does not trigger JDK-8332920 with some VM flag combination in the CI? Yes, I removed the checking code with the `RuntimeException` from the test. Then the test succeeds (the tests of JDK-8332920 only result in a wrong execution). Testing in the CI confirmed that there is no failure of this test. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19504#issuecomment-2144392850 From chagedorn at openjdk.org Mon Jun 3 06:44:11 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Mon, 3 Jun 2024 06:44:11 GMT Subject: Integrated: 8333366: C2: CmpU3Nodes are not pushed back to worklist in PhaseCCP leading to non-fixpoint assertion failure In-Reply-To: References: Message-ID: <0F6cZ8iUFUu2P4zwJxK9_snJi0OaHax_N2I7Xgl3Suc=.9fb6a57b-7c35-4086-ae32-88b45c2f7742@github.com> On Fri, 31 May 2024 17:34:41 GMT, Christian Hagedorn wrote: > The current code to push uses back to the worklist during CCP handles `CmpU` nodes but misses `CmpU3` nodes. This leads to an assertion failure that we have not reached a fixpoint. > > The fix is straight forward to add a case for `CmpU3` at the case where we already handle `CmpU` nodes such that they can be added back to the worklist like `CmpU` nodes during CCP. > > This was found during the analysis of [JDK-8332920](https://bugs.openjdk.org/browse/JDK-8332920) by trying to simplify the regression test (thanks to @TobiHartmann!). To properly add regression tests for JDK-8332920 and avoid hitting this bug here with some flag combination, we should fix this first. I will soon propose a PR for JDK-8332920 as well. > > Thanks, > Christian This pull request has now been integrated. 
Changeset: 7c83d7ab Author: Christian Hagedorn URL: https://git.openjdk.org/jdk/commit/7c83d7ab53f1f761a88a1d248b9a2f14980ef702 Stats: 53 lines in 2 files changed: 51 ins; 0 del; 2 mod 8333366: C2: CmpU3Nodes are not pushed back to worklist in PhaseCCP leading to non-fixpoint assertion failure Reviewed-by: kvn, thartmann ------------- PR: https://git.openjdk.org/jdk/pull/19504 From gcao at openjdk.org Mon Jun 3 06:54:01 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 3 Jun 2024 06:54:01 GMT Subject: RFR: 8333248: VectorGatherMaskFoldingTest.java failed when maximum vector bits is 64 In-Reply-To: References: Message-ID: On Thu, 30 May 2024 10:05:39 GMT, Gui Cao wrote: > Hi, VectorGatherMaskFoldingTest.java Test fails when max vector bits is 64, when max vector bits is 64, LongVector.SPECIES_MAX.length() and DoubleVector.SPECIES_MAX.length() is 1. > > We can reproduce this problem in two ways: > 1. We can use riscv without rvv1.0 board to reproduce this problem > 2. Run VectorGatherMaskFoldingTest.java on aarch64 client mode without `-XX:+IncrementalInlineForceCleanup` Option, the `-XX:+IncrementalInlineForceCleanup` is C2 Option, so we need to remove this Option from the VectorGatherMaskFoldingTest.main method. error message: > > Base Test: @Test testDoubleVectorStoreLoadMaskedVector: > compiler.lib.ir_framework.shared.TestRunException: There was an error while invoking @Test method public static void compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(). Target: null. 
Arguments: > at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:84) > at compiler.lib.ir_framework.test.BaseTest.invokeTest(BaseTest.java:71) > at compiler.lib.ir_framework.test.AbstractTest.run(AbstractTest.java:98) > at compiler.lib.ir_framework.test.TestVM.runTests(TestVM.java:861) > at compiler.lib.ir_framework.test.TestVM.start(TestVM.java:252) > at compiler.lib.ir_framework.test.TestVM.main(TestVM.java:165) > Caused by: java.lang.reflect.InvocationTargetException > at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:118) > at java.base/java.lang.reflect.Method.invoke(Method.java:580) > at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:80) > ... 5 more > Caused by: java.lang.RuntimeException: assertNotEquals: expected [1.0] to not equal [1.0] > at jdk.test.lib.Asserts.fail(Asserts.java:691) > at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:451) > at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:435) > at compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(VectorGatherMaskFoldingTest.java:1089) > at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103) > ... 7 more > > > For example, the following method will be failed: > > private static final VectorSpecies L_SPECIES = LongVector.SPECIES_MAX; > private static final VectorSpecies D_SPECIES = DoubleVector.SPECIES_MAX; > ... > @Test > @IR(counts = { IRNode.STORE_VECTOR_MASKED, ">= 1", IRNode.LOAD_VECTOR_MASKED, ">= 1" }, applyIfCPUFeatureOr = {"avx512", "true", "sve", "true"}) > public static ... @dafedafe Hi Damon, Maybe you can take a look at this small change? Thanks. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19473#issuecomment-2144406197 From rcastanedalo at openjdk.org Mon Jun 3 07:29:05 2024 From: rcastanedalo at openjdk.org (Roberto Castañeda Lozano) Date: Mon, 3 Jun 2024 07:29:05 GMT Subject: RFR: 8332959: C2: ZGC fails with 'Incorrect load shift' when invoking Object.clone() reflectively on an array In-Reply-To: References: Message-ID: On Thu, 30 May 2024 16:50:22 GMT, Roberto Castañeda Lozano wrote: > This changeset enforces cloned arrays to be initialized at allocation time when their type is unknown, as expected by ZGC in this scenario (see the [JBS issue](https://bugs.openjdk.org/projects/JDK/issues/JDK-8332959) for further details). Array clones with unknown type may arise from compiling the array-guarded branch of a reflective `Object.clone()` invocation, as illustrated by the included test. > > #### Testing > - tier1-5, stress test (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode). > - tier6-7 (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode; ZGC tests only). Thanks for reviewing, Vladimir and Tobias! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19486#issuecomment-2144465107 From aboldtch at openjdk.org Mon Jun 3 07:34:03 2024 From: aboldtch at openjdk.org (Axel Boldt-Christmas) Date: Mon, 3 Jun 2024 07:34:03 GMT Subject: RFR: 8332959: C2: ZGC fails with 'Incorrect load shift' when invoking Object.clone() reflectively on an array In-Reply-To: References: Message-ID: On Thu, 30 May 2024 16:50:22 GMT, Roberto Castañeda Lozano wrote: > This changeset enforces cloned arrays to be initialized at allocation time when their type is unknown, as expected by ZGC in this scenario (see the [JBS issue](https://bugs.openjdk.org/projects/JDK/issues/JDK-8332959) for further details). 
Array clones with unknown type may arise from compiling the array-guarded branch of a reflective `Object.clone()` invocation, as illustrated by the included test. > > #### Testing > - tier1-5, stress test (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode). > - tier6-7 (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode; ZGC tests only). Marked as reviewed by aboldtch (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19486#pullrequestreview-2092997392 From rcastanedalo at openjdk.org Mon Jun 3 07:40:07 2024 From: rcastanedalo at openjdk.org (Roberto Castañeda Lozano) Date: Mon, 3 Jun 2024 07:40:07 GMT Subject: RFR: 8332959: C2: ZGC fails with 'Incorrect load shift' when invoking Object.clone() reflectively on an array In-Reply-To: References: Message-ID: <72rYmOInNaxiTnskd10s5hVNw4Ws_NvaStWXOpPwRTc=.5811245e-5640-4007-8110-e220cb9eb5c4@github.com> On Thu, 30 May 2024 16:50:22 GMT, Roberto Castañeda Lozano wrote: > This changeset enforces cloned arrays to be initialized at allocation time when their type is unknown, as expected by ZGC in this scenario (see the [JBS issue](https://bugs.openjdk.org/projects/JDK/issues/JDK-8332959) for further details). Array clones with unknown type may arise from compiling the array-guarded branch of a reflective `Object.clone()` invocation, as illustrated by the included test. > > #### Testing > - tier1-5, stress test (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode). > - tier6-7 (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode; ZGC tests only). Thanks for reviewing (and proposing the fix), Axel! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19486#issuecomment-2144483058 From rcastanedalo at openjdk.org Mon Jun 3 07:40:07 2024 From: rcastanedalo at openjdk.org (Roberto Castañeda Lozano) Date: Mon, 3 Jun 2024 07:40:07 GMT Subject: Integrated: 8332959: C2: ZGC fails with 'Incorrect load shift' when invoking Object.clone() reflectively on an array In-Reply-To: References: Message-ID: On Thu, 30 May 2024 16:50:22 GMT, Roberto Castañeda Lozano wrote: > This changeset enforces cloned arrays to be initialized at allocation time when their type is unknown, as expected by ZGC in this scenario (see the [JBS issue](https://bugs.openjdk.org/projects/JDK/issues/JDK-8332959) for further details). Array clones with unknown type may arise from compiling the array-guarded branch of a reflective `Object.clone()` invocation, as illustrated by the included test. > > #### Testing > - tier1-5, stress test (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode). > - tier6-7 (windows-x64, linux-x64, linux-aarch64, macosx-x64, macosx-aarch64; release and debug mode; ZGC tests only). This pull request has now been integrated. 
Changeset: e0ac8249 Author: Roberto Castañeda Lozano URL: https://git.openjdk.org/jdk/commit/e0ac8249f54222cc5efe49d5ed1068fed3073ba0 Stats: 41 lines in 2 files changed: 37 ins; 0 del; 4 mod 8332959: C2: ZGC fails with 'Incorrect load shift' when invoking Object.clone() reflectively on an array Co-authored-by: Axel Boldt-Christmas Reviewed-by: kvn, thartmann, aboldtch ------------- PR: https://git.openjdk.org/jdk/pull/19486 From shade at openjdk.org Mon Jun 3 08:25:01 2024 From: shade at openjdk.org (Aleksey Shipilev) Date: Mon, 3 Jun 2024 08:25:01 GMT Subject: RFR: 8328107: Shenandoah/C2: TestVerifyLoopOptimizations test failure In-Reply-To: References: Message-ID: <9gEUG-pcuefQN9hQ7jLR5sjjvKn0PQU8f2QgFyuBOEQ=.3e038f49-f7eb-48cc-903a-86201bf07240@github.com> On Thu, 16 May 2024 07:37:46 GMT, Roland Westrelin wrote: > The failure occurs because a load barrier is expanded on the backedge > of the counted loop. That breaks the expected counted loop shape. The > fix I propose is to replace the `CountedLoop` with a `Loop` node when > that happens. We're basically done with optimizations related to > counted loop at this point so this shouldn't make a difference. This looks reasonable, thanks. Passes all the tests I throw at it. ------------- Marked as reviewed by shade (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19259#pullrequestreview-2093136958 From aph at openjdk.org Mon Jun 3 08:34:07 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 08:34:07 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v6] In-Reply-To: References: Message-ID: <8pQ9Hda7lo9Da7T1U1vcDCKobhhpUUOCO0CL-Aq_wQQ=.f0250a36-ec4b-4966-9dc6-0d3b31a3566e@github.com> On Sat, 1 Jun 2024 06:00:55 GMT, Thomas Stuefe wrote: > > > Just a code-style review. > > > Question: could there be some sort of regression test for this, with different examples and edge cases? > > > > > > I have no idea, really. 
assert_different_registers is used all over the place, and I'm going for bootcycle and tier1. > > You could write a death test gtest. Like this: > > ``` > TEST_VM_ASSERT_MSG(AssemblerAArch64, assert_different_regs, ".*Multiple uses of register: c_rarg0.*") { > Register reg1 = r0; > Register reg2 = r0; > assert_different_registers(reg1, reg2); > } > ``` I could, but I don't think there's much point. `assert_different_registers()` is used so much that it'll get thoroughly tested in the positive cases, at least. Do you think this is important? ------------- PR Comment: https://git.openjdk.org/jdk/pull/16617#issuecomment-2144597901 From chagedorn at openjdk.org Mon Jun 3 08:34:40 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Mon, 3 Jun 2024 08:34:40 GMT Subject: RFR: 8333394: C2: assert(bol->is_Opaque4() || bol->is_OpaqueInitializedAssertionPredicate()) failed: Opaque node of non-null-check or of Initialized Assertion Predicate Message-ID: [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and/or non-null-checks which both use `Opaque4` nodes. I've also improved the following bailout from `!= 2` to `< 2` since the original comment suggested that if there is a test without 2 inputs, it must be a dead test (i.e. I assumed a `ConNode`): https://github.com/openjdk/jdk/blob/91101f0d4fc8e06d0d74e06361db6ac87efeeb8e/src/hotspot/share/opto/loopTransform.cpp#L1204-L1206 But apparently, we could also have the following, also dead, `If` with a condition with 3 inputs (`505 Phi`) created by `split_thru_phi`: ![image](https://github.com/openjdk/jdk/assets/17833009/003ad32a-a675-49f2-a263-7ca28d1faf82) I therefore revert the check back to `!= 2` and improved the comment. 
Thanks, Christian ------------- Commit messages: - 8333394: C2: assert(bol->is_Opaque4() || bol->is_OpaqueInitializedAssertionPredicate()) failed: Opaque node of non-null-check or of Initialized Assertion Predicate Changes: https://git.openjdk.org/jdk/pull/19517/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19517&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333394 Stats: 61 lines in 2 files changed: 58 ins; 0 del; 3 mod Patch: https://git.openjdk.org/jdk/pull/19517.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19517/head:pull/19517 PR: https://git.openjdk.org/jdk/pull/19517 From bkilambi at openjdk.org Mon Jun 3 08:35:44 2024 From: bkilambi at openjdk.org (Bhavana Kilambi) Date: Mon, 3 Jun 2024 08:35:44 GMT Subject: RFR: 8320725: C2: Add "requires_strict_order" flag for floating-point add-reduction [v11] In-Reply-To: References: Message-ID: > Floating-point addition is non-associative, that is adding floating-point elements in arbitrary order may get different value. Specially, Vector API does not define the order of reduction intentionally, which allows platforms to generate more efficient codes [1]. So that needs a node to represent non strictly-ordered add-reduction for floating-point type in C2. > > To avoid introducing new nodes, this patch adds a bool field in `AddReductionVF/D` to distinguish whether they require strict order. It also removes `UnorderedReductionNode` and adds a virtual function `bool requires_strict_order()` in `ReductionNode`. Besides `AddReductionVF/D`, other reduction nodes' `requires_strict_order()` have a fixed value. > > With this patch, Vector API would always generate non strictly-ordered `AddReductionVF/D' on SVE machines with vector length <= 16B as it is more beneficial to generate non-strictly ordered instructions on such machines compared to strictly ordered ones. > > [AArch64] > On Neon, non strictly-ordered `AddReductionVF/D` cannot be generated. 
Auto-vectorization has already banned these nodes in JDK-8275275 [2]. > > This patch adds matching rules for non strictly-ordered `AddReductionVF/D`. > > No effects on other platforms. > > [Performance] > FloatMaxVector.ADDLanes [3] measures the performance of add reduction for floating-point type. With this patch, it improves ~3x on my SVE machine (128-bit). > > ADDLanes > > Benchmark Before After Unit > FloatMaxVector.ADDLanes 1789.513 5264.226 ops/ms > > > Final code is as below: > > Before: > ` fadda z17.s, p7/m, z17.s, z16.s > ` > After: > > faddp v17.4s, v21.4s, v21.4s > faddp s18, v17.2s > fadd s18, s18, s19 > > > > > [Test] > Full jtreg passed on AArch64 and x86. > > [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2529 > [2] https://bugs.openjdk.org/browse/JDK-8275275 > [3] https://github.com/openjdk/panama-vector/blob/vectorIntrinsics/test/micro/org/openjdk/bench/jdk/incubator/vector/operation/FloatMaxVector.java#L316 Bhavana Kilambi has updated the pull request incrementally with one additional commit since the last revision: Make changes in IR rules for JTREG tests ------------- Changes: - all: https://git.openjdk.org/jdk/pull/18034/files - new: https://git.openjdk.org/jdk/pull/18034/files/b8f6cfb5..db88e3c9 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=18034&range=10 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=18034&range=09-10 Stats: 265 lines in 3 files changed: 141 ins; 102 del; 22 mod Patch: https://git.openjdk.org/jdk/pull/18034.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/18034/head:pull/18034 PR: https://git.openjdk.org/jdk/pull/18034 From bkilambi at openjdk.org Mon Jun 3 08:38:13 2024 From: bkilambi at openjdk.org (Bhavana Kilambi) Date: Mon, 3 Jun 2024 08:38:13 GMT Subject: RFR: 8320725: C2: Add "requires_strict_order" flag for floating-point add-reduction [v8] In-Reply-To: References: 
<8-_t7nWbR9gZ2_QkfFNuf5M0Q4PMkKJKgwS3ZbHcCxI=.32dc4f11-dec5-468d-afc8-3b4dae285dcb@github.com> <2y-Ag6MxVDJfYl6kM0FYjQA-kzSCekUgAMWAZmkECyQ=.2a2a0a8e-fc67-42a4-bd67-b4ae3b60bcea@github.com> Message-ID: <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> On Mon, 13 May 2024 11:01:30 GMT, Emanuel Peter wrote: >> @eme64 Thanks for the clarification. I understand the usage of `counts` in the IR tests. Just that I got a bit confused by some of your earlier statements. We do actually have a test to make sure AddReductionVF/VD and MulReductionVF/VD are not generated on aarch64 NEON machines - `test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java`. I can modify this test to include UseSVE > 0 case as well and will also add a separate JTREG test for the VectorAPI tests. Hope that's ok.. > > @Bhavana-Kilambi > I know we have the tests in `test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java`, and some other reduction tests. But these do not do the specific thing I would like to see. > > I would like this: > - Add `no_strict_order` vs `requires_strict_order` or similar to `dump_spec`. > - IR match not just that there is the correct `ReductionNode`, but also that it has the `no_strict_order` or `requires_strict_order` in its dump. You can do that by using a custom regex string, rather than `IRNode.STORE_VECTOR` or similar. > - Then, create different tests, some where we expect ordered, some unordered vectors. Use Vector API and SuperWord examples. > > Does that make sense? Hi @eme64 , I have modified the tests as suggested. 
Please review :) ------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2144609426 From stuefe at openjdk.org Mon Jun 3 09:26:12 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Mon, 3 Jun 2024 09:26:12 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: Message-ID: On Fri, 31 May 2024 16:02:40 GMT, Andrew Haley wrote: >> At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. >> >> In addition, it would be useful to be able to static_assert different registers. >> >> Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. >> >> I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. > > Andrew Haley has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains 19 additional commits since the last revision: > > - Merge branch 'clean' into different-regs > - Review feedback > - Review feedback > - Update src/hotspot/share/asm/register.hpp > > Co-authored-by: Stefan Karlsson > - Review feedback > - Review feedback > - Review feedback > - Merge branch 'different-regs' of https://github.com/theRealAph/jdk into different-regs > - Update src/hotspot/share/asm/register.hpp > > Co-authored-by: Emanuel Peter > - Merge branch 'clean' into different-regs > - ... and 9 more: https://git.openjdk.org/jdk/compare/8fe7e1ce...c9fc63d7 > I could, but I don't think there's much point. assert_different_registers() is used so much that it'll get thoroughly tested in the positive cases, at least. Do you think this is important? See my remarks. 
I am mainly concerned about exceeding the range for the bit set. If you add the proposed static assert, I think we are good. src/hotspot/share/asm/register.hpp line 96: > 94: template > 95: class AbstractRegSet { > 96: size_t _bitset; Why couple the number of possible registers to the memory size? Why not uint64_t? src/hotspot/share/asm/register.hpp line 109: > 107: > 108: constexpr AbstractRegSet(RegImpl r1) > 109: : _bitset(r1->is_valid() ? size_t(1) << r1->encoding() : 0) { } If my assumption from above is correct, we never noticed ppc being broken since _bitset would have been 0 if encoding > 31. Could you add something like: STATIC_ASSERT(RegImpl::number_of_registers <= 64); ? ------------- PR Review: https://git.openjdk.org/jdk/pull/16617#pullrequestreview-2093240269 PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624068958 PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624084824 From stuefe at openjdk.org Mon Jun 3 09:26:12 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Mon, 3 Jun 2024 09:26:12 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 09:00:38 GMT, Thomas Stuefe wrote: >> Andrew Haley has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains 19 additional commits since the last revision: >> >> - Merge branch 'clean' into different-regs >> - Review feedback >> - Review feedback >> - Update src/hotspot/share/asm/register.hpp >> >> Co-authored-by: Stefan Karlsson >> - Review feedback >> - Review feedback >> - Review feedback >> - Merge branch 'different-regs' of https://github.com/theRealAph/jdk into different-regs >> - Update src/hotspot/share/asm/register.hpp >> >> Co-authored-by: Emanuel Peter >> - Merge branch 'clean' into different-regs >> - ... 
and 9 more: https://git.openjdk.org/jdk/compare/8fe7e1ce...c9fc63d7 > > src/hotspot/share/asm/register.hpp line 96: > >> 94: template >> 95: class AbstractRegSet { >> 96: size_t _bitset; > > Why couple the number of possible registers to the memory size? Why not uint64_t? 64-bit makes sense. I think this may have been broken for ppc where the number of vector registers can exceed 32 (https://github.com/openjdk/jdk/blob/91101f0d4fc8e06d0d74e06361db6ac87efeeb8e/src/hotspot/cpu/ppc/register_ppc.hpp#L378) Unless I am mistaken, `assert_different_registers`, if applied to VSR32 and up, would never have fired. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624077001 From duke at openjdk.org Mon Jun 3 10:47:29 2024 From: duke at openjdk.org (kuaiwei) Date: Mon, 3 Jun 2024 10:47:29 GMT Subject: RFR: 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp Message-ID: Some classes in nativeInst_aarch64.hpp are unused and can be removed. I checked with tier1 tests. ------------- Commit messages: - Remove GOT and PLT classes - 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp Changes: https://git.openjdk.org/jdk/pull/19518/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19518&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333410 Stats: 265 lines in 2 files changed: 1 ins; 258 del; 6 mod Patch: https://git.openjdk.org/jdk/pull/19518.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19518/head:pull/19518 PR: https://git.openjdk.org/jdk/pull/19518 From aph at openjdk.org Mon Jun 3 11:31:02 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 11:31:02 GMT Subject: RFR: 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 10:43:11 GMT, kuaiwei wrote: > Some classes in nativeInst_aarch64.hpp are unused and can be removed. > > I checked with tier1 tests. Mmm, interesting. Let me have a look at why these are there. 
------------- PR Review: https://git.openjdk.org/jdk/pull/19518#pullrequestreview-2093561686 From aph at openjdk.org Mon Jun 3 11:44:03 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 11:44:03 GMT Subject: RFR: 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp In-Reply-To: References: Message-ID: <3wjDkxySe8t8A8feJjH9SzuEK2a2BDt9xBFwJOW8iac=.8b2ba0b1-9457-41da-be9c-2b477a2fa655@github.com> On Mon, 3 Jun 2024 10:43:11 GMT, kuaiwei wrote: > Some classes in nativeInst_aarch64.hpp are unused and can be removed. > > I checked with tier1 tests. OK, it's some remnants of the Graal-based jaotc. Patch approved. ------------- Marked as reviewed by aph (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19518#pullrequestreview-2093584916 From thartmann at openjdk.org Mon Jun 3 12:03:29 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Mon, 3 Jun 2024 12:03:29 GMT Subject: RFR: 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state Message-ID: Ubsan detected undefined behavior in `ciTypeFlow::get_start_state` because an invalid value of `4294967295` is assigned to enum `Cell`: https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L150-L152 The problem is that if the C++ compiler decides to encode `Cell` with an unsigned int, casting a negative integer value will lead to an underflow and therefore a value > `Cell_max = INT_MAX`. Here, `state->tos()` returns a value < 0: https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.cpp#L407 which is cast to a `Cell`: https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L211 I simply re-wrote the code to not require a negative `Cell` value to iterate over the locals and set them to bottom type.
Thanks, Tobias ------------- Commit messages: - 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state Changes: https://git.openjdk.org/jdk/pull/19520/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19520&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333177 Stats: 5 lines in 1 file changed: 0 ins; 3 del; 2 mod Patch: https://git.openjdk.org/jdk/pull/19520.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19520/head:pull/19520 PR: https://git.openjdk.org/jdk/pull/19520 From roland at openjdk.org Mon Jun 3 12:56:05 2024 From: roland at openjdk.org (Roland Westrelin) Date: Mon, 3 Jun 2024 12:56:05 GMT Subject: RFR: 8327380: Add tests for Shenandoah barrier expansion optimization In-Reply-To: <6jnsp3eSXnS5H2F915sWIiSQvz5-CN7hNuciqoc1Lp4=.445c9bdc-9c5d-427a-89ba-5bc0a57d2425@github.com> References: <6jnsp3eSXnS5H2F915sWIiSQvz5-CN7hNuciqoc1Lp4=.445c9bdc-9c5d-427a-89ba-5bc0a57d2425@github.com> Message-ID: <7vWvORnFLnAiLg6DHh9zy2VOWWxFNnbXdvBddF3of5I=.a07850f8-77ab-4aa3-9046-97a3fa94f05f@github.com> On Wed, 17 Apr 2024 13:27:11 GMT, Kangcheng Xu wrote: > The Ideal graph for Shenandoah barrier expansion is optimized so that unnecessary checks are eliminated; however, currently there are no test cases to determine whether these optimizations are in effect. > > Adding unit tests with the IR test framework will support related code changes in the future. Looks good to me. ------------- Marked as reviewed by roland (Reviewer).
PR Review: https://git.openjdk.org/jdk/pull/18814#pullrequestreview-2093793474 From chagedorn at openjdk.org Mon Jun 3 12:59:10 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Mon, 3 Jun 2024 12:59:10 GMT Subject: RFR: 8333434: IGV: Print loop node for PHASE_BEFORE/AFTER_CLOOPS Message-ID: This simple patch adds loop nodes for `PHASE_BEFORE_CLOOPS` (the `LoopNode` to be converted) and `PHASE_AFTER_CLOOPS` (the newly created `CountedLoopNode`) in IGV: Before patch: ![image](https://github.com/openjdk/jdk/assets/17833009/6771160e-4d2e-423a-9578-40562c053a6f) With patch: ![image](https://github.com/openjdk/jdk/assets/17833009/b6870a67-8c5e-42be-87f0-1b8de7269298) Thanks, Christian ------------- Commit messages: - 8333434: IGV: Print loop node for PHASE_BEFORE/AFTER_CLOOPS Changes: https://git.openjdk.org/jdk/pull/19524/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19524&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333434 Stats: 2 lines in 1 file changed: 0 ins; 0 del; 2 mod Patch: https://git.openjdk.org/jdk/pull/19524.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19524/head:pull/19524 PR: https://git.openjdk.org/jdk/pull/19524 From thartmann at openjdk.org Mon Jun 3 13:04:05 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Mon, 3 Jun 2024 13:04:05 GMT Subject: RFR: 8333394: C2: assert(bol->is_Opaque4() || bol->is_OpaqueInitializedAssertionPredicate()) failed: Opaque node of non-null-check or of Initialized Assertion Predicate In-Reply-To: References: Message-ID: <8zTlKBXjP_6afLiMTmfK9VjqUefC5Z5PaIeaMqdfpw4=.0db350fc-0fd1-4578-90c4-8bab7016ccf3@github.com> On Mon, 3 Jun 2024 08:29:18 GMT, Christian Hagedorn wrote: > [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and/or non-null-checks which both use `Opaque4` nodes. 
I've also improved the following bailout from `!= 2` to `< 2` since the original comment suggested that if there is a test without 2 inputs, it must be a dead test (i.e. I assumed a `ConNode`): > > https://github.com/openjdk/jdk/blob/91101f0d4fc8e06d0d74e06361db6ac87efeeb8e/src/hotspot/share/opto/loopTransform.cpp#L1204-L1206 > > But apparently, we could also have the following, also dead, `If` with a condition with 3 inputs (`505 Phi`) created by `split_thru_phi`: > > ![image](https://github.com/openjdk/jdk/assets/17833009/003ad32a-a675-49f2-a263-7ca28d1faf82) > > I therefore revert the check back to `!= 2` and improved the comment. > > Thanks, > Christian Looks good to me. test/hotspot/jtreg/compiler/predicates/assertion/TestIfWithPhiInput.java line 29: > 27: * @bug 8333394 > 28: * @summary Test bailout of range check policy with an If with a Phi as condition. > 29: * @run main/othervm -XX:CompileCommand=compileonly,*TestIfWithPhiInput*::* -Xcomp Suggestion: * @run main/othervm -XX:CompileCommand=compileonly,*TestIfWithPhiInput*::* -Xcomp -XX:-TieredCompilation Just to make sure C2 compilation is triggered. ------------- Marked as reviewed by thartmann (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19517#pullrequestreview-2093803652 PR Review Comment: https://git.openjdk.org/jdk/pull/19517#discussion_r1624419531 From roland at openjdk.org Mon Jun 3 13:04:05 2024 From: roland at openjdk.org (Roland Westrelin) Date: Mon, 3 Jun 2024 13:04:05 GMT Subject: RFR: 8333394: C2: assert(bol->is_Opaque4() || bol->is_OpaqueInitializedAssertionPredicate()) failed: Opaque node of non-null-check or of Initialized Assertion Predicate In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 08:29:18 GMT, Christian Hagedorn wrote: > [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and/or non-null-checks which both use `Opaque4` nodes. 
I've also improved the following bailout from `!= 2` to `< 2` since the original comment suggested that if there is a test without 2 inputs, it must be a dead test (i.e. I assumed a `ConNode`): > > https://github.com/openjdk/jdk/blob/91101f0d4fc8e06d0d74e06361db6ac87efeeb8e/src/hotspot/share/opto/loopTransform.cpp#L1204-L1206 > > But apparently, we could also have the following, also dead, `If` with a condition with 3 inputs (`505 Phi`) created by `split_thru_phi`: > > ![image](https://github.com/openjdk/jdk/assets/17833009/003ad32a-a675-49f2-a263-7ca28d1faf82) > > I therefore revert the check back to `!= 2` and improved the comment. > > Thanks, > Christian Looks good to me. ------------- Marked as reviewed by roland (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19517#pullrequestreview-2093810765 From thartmann at openjdk.org Mon Jun 3 13:04:07 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Mon, 3 Jun 2024 13:04:07 GMT Subject: RFR: 8333434: IGV: Print loop node for PHASE_BEFORE/AFTER_CLOOPS In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 12:53:47 GMT, Christian Hagedorn wrote: > This simple patch adds loop nodes for `PHASE_BEFORE_CLOOPS` (the `LoopNode` to be converted) and `PHASE_AFTER_CLOOPS` (the newly created `CountedLoopNode`) in IGV: > > Before patch: > ![image](https://github.com/openjdk/jdk/assets/17833009/6771160e-4d2e-423a-9578-40562c053a6f) > > With patch: > ![image](https://github.com/openjdk/jdk/assets/17833009/b6870a67-8c5e-42be-87f0-1b8de7269298) > > Thanks, > Christian Looks good and trivial. ------------- Marked as reviewed by thartmann (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19524#pullrequestreview-2093811146 From rcastanedalo at openjdk.org Mon Jun 3 13:07:01 2024 From: rcastanedalo at openjdk.org (Roberto =?UTF-8?B?Q2FzdGHDsWVkYQ==?= Lozano) Date: Mon, 3 Jun 2024 13:07:01 GMT Subject: RFR: 8333434: IGV: Print loop node for PHASE_BEFORE/AFTER_CLOOPS In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 12:53:47 GMT, Christian Hagedorn wrote: > This simple patch adds loop nodes for `PHASE_BEFORE_CLOOPS` (the `LoopNode` to be converted) and `PHASE_AFTER_CLOOPS` (the newly created `CountedLoopNode`) in IGV: > > Before patch: > ![image](https://github.com/openjdk/jdk/assets/17833009/6771160e-4d2e-423a-9578-40562c053a6f) > > With patch: > ![image](https://github.com/openjdk/jdk/assets/17833009/b6870a67-8c5e-42be-87f0-1b8de7269298) > > Thanks, > Christian Marked as reviewed by rcastanedalo (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19524#pullrequestreview-2093818561 From aph at openjdk.org Mon Jun 3 13:07:09 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 13:07:09 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: Message-ID: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> On Mon, 3 Jun 2024 09:04:45 GMT, Thomas Stuefe wrote: >> src/hotspot/share/asm/register.hpp line 96: >> >>> 94: template >>> 95: class AbstractRegSet { >>> 96: size_t _bitset; >> >> Why couple the number of possible registers to the memory size? Why not uint64_t? > > 64-bit makes sense. > > I think this may have been broken for ppc where the number of vector registers can exceed 32 (https://github.com/openjdk/jdk/blob/91101f0d4fc8e06d0d74e06361db6ac87efeeb8e/src/hotspot/cpu/ppc/register_ppc.hpp#L378) > > Unless I am mistaken, `assert_different_registers`, if applied to VSR32 and up, would never have fired. OK, I get it. 
I was sort-of thinking that on the 32-bit platforms we support we don't ever have more than 32 registers in a set, but maybe that's not true. I certainly don't want to slow down 32-bit platforms by burdening them with double-word operations for something that can never happen. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624425123 From aph at openjdk.org Mon Jun 3 13:07:12 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 13:07:12 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 09:10:15 GMT, Thomas Stuefe wrote: >> Andrew Haley has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains 19 additional commits since the last revision: >> >> - Merge branch 'clean' into different-regs >> - Review feedback >> - Review feedback >> - Update src/hotspot/share/asm/register.hpp >> >> Co-authored-by: Stefan Karlsson >> - Review feedback >> - Review feedback >> - Review feedback >> - Merge branch 'different-regs' of https://github.com/theRealAph/jdk into different-regs >> - Update src/hotspot/share/asm/register.hpp >> >> Co-authored-by: Emanuel Peter >> - Merge branch 'clean' into different-regs >> - ... and 9 more: https://git.openjdk.org/jdk/compare/55c9e523...c9fc63d7 > > src/hotspot/share/asm/register.hpp line 109: > >> 107: >> 108: constexpr AbstractRegSet(RegImpl r1) >> 109: : _bitset(r1->is_valid() ? size_t(1) << r1->encoding() : 0) { } > > If my assumption from above is correct, we never noticed ppc being broken since _bitset would have been 0 if encoding > 31. > > Could you add something like: > > > STATIC_ASSERT(RegImpl::number_of_registers <= 64); > > > ? OK. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624426065 From chagedorn at openjdk.org Mon Jun 3 13:08:29 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Mon, 3 Jun 2024 13:08:29 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit Message-ID: A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. #### Idea of Partial Peeling Partial Peeling rotates a loop in the hope of being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). #### Partial Peeling with Unsigned Test However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 #### Requirements for Using an Unsigned Test The Signed and Unsigned Loop Exit Test do not need to have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute.
#### The Requirements Are Currently Broken This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 #### Why Are the Requirements Broken? The reason is that i >=u limit can only be converted into the two signed comparisons i < 0 || i >= limit if `limit` is non-negative (i.e. `limit >= 0`): https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. #### Fixing the Broken Requirements To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of `limit` contains a negative value: https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3062-L3066 Thanks, Christian ------------- Commit messages: - 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit Changes: https://git.openjdk.org/jdk/pull/19522/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19522&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8332920 Stats: 344 lines in 2 files changed: 337 ins; 0 del; 7 mod Patch: https://git.openjdk.org/jdk/pull/19522.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19522/head:pull/19522 PR: https://git.openjdk.org/jdk/pull/19522 From chagedorn at openjdk.org Mon Jun 3 13:12:16 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Mon, 3 Jun 2024 13:12:16 GMT Subject: RFR: 8333394: C2: assert(bol->is_Opaque4() || 
bol->is_OpaqueInitializedAssertionPredicate()) failed: Opaque node of non-null-check or of Initialized Assertion Predicate [v2] In-Reply-To: References: Message-ID: > [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and/or non-null-checks which both use `Opaque4` nodes. I've also improved the following bailout from `!= 2` to `< 2` since the original comment suggested that if there is a test without 2 inputs, it must be a dead test (i.e. I assumed a `ConNode`): > > https://github.com/openjdk/jdk/blob/91101f0d4fc8e06d0d74e06361db6ac87efeeb8e/src/hotspot/share/opto/loopTransform.cpp#L1204-L1206 > > But apparently, we could also have the following, also dead, `If` with a condition with 3 inputs (`505 Phi`) created by `split_thru_phi`: > > ![image](https://github.com/openjdk/jdk/assets/17833009/003ad32a-a675-49f2-a263-7ca28d1faf82) > > I therefore revert the check back to `!= 2` and improved the comment. 
> > Thanks, > Christian Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: Update test/hotspot/jtreg/compiler/predicates/assertion/TestIfWithPhiInput.java Co-authored-by: Tobias Hartmann ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19517/files - new: https://git.openjdk.org/jdk/pull/19517/files/5eb94738..b435157a Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19517&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19517&range=00-01 Stats: 1 line in 1 file changed: 0 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19517.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19517/head:pull/19517 PR: https://git.openjdk.org/jdk/pull/19517 From chagedorn at openjdk.org Mon Jun 3 13:12:16 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Mon, 3 Jun 2024 13:12:16 GMT Subject: RFR: 8333394: C2: assert(bol->is_Opaque4() || bol->is_OpaqueInitializedAssertionPredicate()) failed: Opaque node of non-null-check or of Initialized Assertion Predicate [v2] In-Reply-To: <8zTlKBXjP_6afLiMTmfK9VjqUefC5Z5PaIeaMqdfpw4=.0db350fc-0fd1-4578-90c4-8bab7016ccf3@github.com> References: <8zTlKBXjP_6afLiMTmfK9VjqUefC5Z5PaIeaMqdfpw4=.0db350fc-0fd1-4578-90c4-8bab7016ccf3@github.com> Message-ID: On Mon, 3 Jun 2024 12:57:57 GMT, Tobias Hartmann wrote: >> Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: >> >> Update test/hotspot/jtreg/compiler/predicates/assertion/TestIfWithPhiInput.java >> >> Co-authored-by: Tobias Hartmann > > Looks good to me. Thanks @TobiHartmann and @rwestrel for your reviews! > test/hotspot/jtreg/compiler/predicates/assertion/TestIfWithPhiInput.java line 29: > >> 27: * @bug 8333394 >> 28: * @summary Test bailout of range check policy with an If with a Phi as condition. 
>> 29: * @run main/othervm -XX:CompileCommand=compileonly,*TestIfWithPhiInput*::* -Xcomp > > Suggestion: > > * @run main/othervm -XX:CompileCommand=compileonly,*TestIfWithPhiInput*::* -Xcomp -XX:-TieredCompilation > > > Just to make sure C2 compilation is triggered. Good point, added. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19517#issuecomment-2145163713 PR Review Comment: https://git.openjdk.org/jdk/pull/19517#discussion_r1624432570 From chagedorn at openjdk.org Mon Jun 3 13:13:02 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Mon, 3 Jun 2024 13:13:02 GMT Subject: RFR: 8333434: IGV: Print loop node for PHASE_BEFORE/AFTER_CLOOPS In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 13:01:10 GMT, Tobias Hartmann wrote: >> This simple patch adds loop nodes for `PHASE_BEFORE_CLOOPS` (the `LoopNode` to be converted) and `PHASE_AFTER_CLOOPS` (the newly created `CountedLoopNode`) in IGV: >> >> Before patch: >> ![image](https://github.com/openjdk/jdk/assets/17833009/6771160e-4d2e-423a-9578-40562c053a6f) >> >> With patch: >> ![image](https://github.com/openjdk/jdk/assets/17833009/b6870a67-8c5e-42be-87f0-1b8de7269298) >> >> Thanks, >> Christian > > Looks good and trivial. Thanks @TobiHartmann and @robcasloz for your quick reviews! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19524#issuecomment-2145167181 From aph at openjdk.org Mon Jun 3 13:23:15 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 13:23:15 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v13] In-Reply-To: References: Message-ID: > At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. > > In addition, it would be useful to be able to static_assert different registers. 
> > Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. > > I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: Windows fix ------------- Changes: - all: https://git.openjdk.org/jdk/pull/16617/files - new: https://git.openjdk.org/jdk/pull/16617/files/c9fc63d7..c3465557 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=12 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=11-12 Stats: 4 lines in 1 file changed: 0 ins; 0 del; 4 mod Patch: https://git.openjdk.org/jdk/pull/16617.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/16617/head:pull/16617 PR: https://git.openjdk.org/jdk/pull/16617 From thartmann at openjdk.org Mon Jun 3 13:43:04 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Mon, 3 Jun 2024 13:43:04 GMT Subject: RFR: 8333394: C2: assert(bol->is_Opaque4() || bol->is_OpaqueInitializedAssertionPredicate()) failed: Opaque node of non-null-check or of Initialized Assertion Predicate [v2] In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 13:12:16 GMT, Christian Hagedorn wrote: >> [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and/or non-null-checks which both use `Opaque4` nodes. I've also improved the following bailout from `!= 2` to `< 2` since the original comment suggested that if there is a test without 2 inputs, it must be a dead test (i.e. 
I assumed a `ConNode`): >> >> https://github.com/openjdk/jdk/blob/91101f0d4fc8e06d0d74e06361db6ac87efeeb8e/src/hotspot/share/opto/loopTransform.cpp#L1204-L1206 >> >> But apparently, we could also have the following, also dead, `If` with a condition with 3 inputs (`505 Phi`) created by `split_thru_phi`: >> >> ![image](https://github.com/openjdk/jdk/assets/17833009/003ad32a-a675-49f2-a263-7ca28d1faf82) >> >> I therefore revert the check back to `!= 2` and improved the comment. >> >> Thanks, >> Christian > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Update test/hotspot/jtreg/compiler/predicates/assertion/TestIfWithPhiInput.java > > Co-authored-by: Tobias Hartmann Marked as reviewed by thartmann (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19517#pullrequestreview-2093913138 From stuefe at openjdk.org Mon Jun 3 14:01:04 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Mon, 3 Jun 2024 14:01:04 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> References: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> Message-ID: On Mon, 3 Jun 2024 13:03:22 GMT, Andrew Haley wrote: >> 64-bit makes sense. >> >> I think this may have been broken for ppc where the number of vector registers can exceed 32 (https://github.com/openjdk/jdk/blob/91101f0d4fc8e06d0d74e06361db6ac87efeeb8e/src/hotspot/cpu/ppc/register_ppc.hpp#L378) >> >> Unless I am mistaken, `assert_different_registers`, if applied to VSR32 and up, would never have fired. > > OK, I get it. I was sort-of thinking that on the 32-bit platforms we support we don't ever have more than 32 registers in a set, but maybe that's not true. 
I certainly don't want to slow down 32-bit platforms by burdening them with double-word operations for something that can never happen. Okay. Well, if we have a static assert, we will notice if we have more registers than fit the bitset. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624510457 From chagedorn at openjdk.org Mon Jun 3 14:06:07 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Mon, 3 Jun 2024 14:06:07 GMT Subject: Integrated: 8333434: IGV: Print loop node for PHASE_BEFORE/AFTER_CLOOPS In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 12:53:47 GMT, Christian Hagedorn wrote: > This simple patch adds loop nodes for `PHASE_BEFORE_CLOOPS` (the `LoopNode` to be converted) and `PHASE_AFTER_CLOOPS` (the newly created `CountedLoopNode`) in IGV: > > Before patch: > ![image](https://github.com/openjdk/jdk/assets/17833009/6771160e-4d2e-423a-9578-40562c053a6f) > > With patch: > ![image](https://github.com/openjdk/jdk/assets/17833009/b6870a67-8c5e-42be-87f0-1b8de7269298) > > Thanks, > Christian This pull request has now been integrated. Changeset: 1f9e6290 Author: Christian Hagedorn URL: https://git.openjdk.org/jdk/commit/1f9e62904c624b12bd344d2ef3021eb5d3377197 Stats: 2 lines in 1 file changed: 0 ins; 0 del; 2 mod 8333434: IGV: Print loop node for PHASE_BEFORE/AFTER_CLOOPS Reviewed-by: thartmann, rcastanedalo ------------- PR: https://git.openjdk.org/jdk/pull/19524 From aph at openjdk.org Mon Jun 3 14:25:17 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 14:25:17 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v14] In-Reply-To: References: Message-ID: > At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. 
> > In addition, it would be useful to be able to static_assert different registers. > > Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. > > I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: Review feedback ------------- Changes: - all: https://git.openjdk.org/jdk/pull/16617/files - new: https://git.openjdk.org/jdk/pull/16617/files/c3465557..80ca17c6 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=13 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=12-13 Stats: 2 lines in 1 file changed: 1 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/16617.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/16617/head:pull/16617 PR: https://git.openjdk.org/jdk/pull/16617 From kvn at openjdk.org Mon Jun 3 14:45:12 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Mon, 3 Jun 2024 14:45:12 GMT Subject: RFR: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset to null pointer Message-ID: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> Avoid iterations on empty relocation info. Found by running `ubsan`. Tested tier1-4, stress, xcomp. 
------------- Commit messages: - 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset to null pointer Changes: https://git.openjdk.org/jdk/pull/19525/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19525&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8331731 Stats: 8 lines in 2 files changed: 1 ins; 4 del; 3 mod Patch: https://git.openjdk.org/jdk/pull/19525.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19525/head:pull/19525 PR: https://git.openjdk.org/jdk/pull/19525 From kvn at openjdk.org Mon Jun 3 14:46:09 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Mon, 3 Jun 2024 14:46:09 GMT Subject: RFR: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset 18446744073709551614 to null pointer [v3] In-Reply-To: References: Message-ID: On Fri, 31 May 2024 08:04:27 GMT, Matthias Baesken wrote: >> When running on macOS with ubsan enabled, we see some issues in relocInfo (hpp and cpp); those already occur in the build quite early. >> >> /jdk/src/hotspot/share/code/relocInfo.cpp:155:30: runtime error: applying non-zero offset 18446744073709551614 to null pointer >> >> Similar happens when we add to the _current pointer >> _current++; >> this gives : >> relocInfo.hpp:606:13: runtime error: applying non-zero offset to non-null pointer 0xfffffffffffffffe produced null pointer >> >> Seems the pointer subtraction/addition worked so far, so it might be an option to disable ubsan for those 2 functions. 
> > Matthias Baesken has updated the pull request incrementally with one additional commit since the last revision: > > rename templates New PR: [#19525](https://github.com/openjdk/jdk/pull/19525) ------------- PR Comment: https://git.openjdk.org/jdk/pull/19424#issuecomment-2145385628 From mdoerr at openjdk.org Mon Jun 3 14:49:01 2024 From: mdoerr at openjdk.org (Martin Doerr) Date: Mon, 3 Jun 2024 14:49:01 GMT Subject: RFR: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset to null pointer In-Reply-To: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> References: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> Message-ID: On Mon, 3 Jun 2024 14:39:28 GMT, Vladimir Kozlov wrote: > Avoid iterations on empty relocation info. Found by running `ubsan`. > > Tested tier1-4, stress, xcomp. LGTM. Thanks! @MBaesken: Can you verify it, please? ------------- Marked as reviewed by mdoerr (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19525#pullrequestreview-2094097706 From aph at openjdk.org Mon Jun 3 15:18:03 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 15:18:03 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> Message-ID: On Mon, 3 Jun 2024 13:58:12 GMT, Thomas Stuefe wrote: >> OK, I get it. I was sort-of thinking that on the 32-bit platforms we support we don't ever have more than 32 registers in a set, but maybe that's not true. I certainly don't want to slow down 32-bit platforms by burdening them with double-word operations for something that can never happen. > > Okay. Well, if we have a static assert, we will notice if we have more registers than fit the bitset. So I'm getting this bizarre failure on arm32. 
I'm guessing it's actually a compiler bug, but I suppose it might be some dusty corner of C++ to do with template arg substitution. Any thoughts? In file included from /home/runner/work/jdk/jdk/src/hotspot/share/utilities/globalDefinitions.hpp:29, from /home/runner/work/jdk/jdk/src/hotspot/share/nmt/memflags.hpp:28, from /home/runner/work/jdk/jdk/src/hotspot/share/memory/allocation.hpp:29, from ad_arm.hpp:30, from ad_arm.cpp:28: /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp: In instantiation of 'class AbstractRegSet': /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp:272:30: required from 'constexpr bool different_registers(R, Rx ...) [with R = RegisterImpl*; Rx = {RegisterImpl*, RegisterImpl*}]' /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp:278:27: required from 'void assert_different_registers(R, Rx ...) [with R = RegisterImpl*; Rx = {RegisterImpl*, RegisterImpl*}]' /home/runner/work/jdk/jdk/src/hotspot/cpu/arm/arm.ad:8984:52: required from here /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp:96:26: error: 'number_of_registers' is not a member of 'RegisterImpl*' 96 | STATIC_ASSERT(RegImpl::number_of_registers <= 64); | ^~~~~~~~~~~~~~~~~~~ /home/runner/work/jdk/jdk/src/hotspot/share/utilities/debug.hpp:287:44: note: in definition of macro 'STATIC_ASSERT' 287 | #define STATIC_ASSERT(Cond) static_assert((Cond), #Cond) | ^~~~ ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624634893 From aph at openjdk.org Mon Jun 3 15:26:16 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 15:26:16 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v15] In-Reply-To: References: Message-ID: > At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. 
> > In addition, it would be useful to be able to static_assert different registers. > > Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. > > I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: Review feedback ------------- Changes: - all: https://git.openjdk.org/jdk/pull/16617/files - new: https://git.openjdk.org/jdk/pull/16617/files/80ca17c6..47681709 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=14 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=13-14 Stats: 8 lines in 2 files changed: 4 ins; 1 del; 3 mod Patch: https://git.openjdk.org/jdk/pull/16617.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/16617/head:pull/16617 PR: https://git.openjdk.org/jdk/pull/16617 From aph at openjdk.org Mon Jun 3 15:26:16 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 15:26:16 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> Message-ID: On Mon, 3 Jun 2024 15:15:09 GMT, Andrew Haley wrote: >> Okay. Well, if we have a static assert, we will notice if we have more registers than fit the bitset. > > So I'm getting this bizarre failure on arm32. I'm guessing it's actually a compiler bug, but I suppose it might be some dusty corner of C++ to do with template arg substitution. Any thoughts? 
> > > In file included from /home/runner/work/jdk/jdk/src/hotspot/share/utilities/globalDefinitions.hpp:29, > from /home/runner/work/jdk/jdk/src/hotspot/share/nmt/memflags.hpp:28, > from /home/runner/work/jdk/jdk/src/hotspot/share/memory/allocation.hpp:29, > from ad_arm.hpp:30, > from ad_arm.cpp:28: > /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp: In instantiation of ?class AbstractRegSet?: > /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp:272:30: required from ?constexpr bool different_registers(R, Rx ...) [with R = RegisterImpl*; Rx = {RegisterImpl*, RegisterImpl*}]? > /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp:278:27: required from ?void assert_different_registers(R, Rx ...) [with R = RegisterImpl*; Rx = {RegisterImpl*, RegisterImpl*}]? > /home/runner/work/jdk/jdk/src/hotspot/cpu/arm/arm.ad:8984:52: required from here > /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp:96:26: error: ?number_of_registers? is not a member of ?RegisterImpl*? > 96 | STATIC_ASSERT(RegImpl::number_of_registers <= 64); > | ^~~~~~~~~~~~~~~~~~~ > /home/runner/work/jdk/jdk/src/hotspot/share/utilities/debug.hpp:287:44: note: in definition of macro ?STATIC_ASSERT? > 287 | #define STATIC_ASSERT(Cond) static_assert((Cond), #Cond) > | ^~~~ I just changed the static assert to a regular assert, in the hope that it might work better on arm32. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624647422 From sviswanathan at openjdk.org Mon Jun 3 15:27:06 2024 From: sviswanathan at openjdk.org (Sandhya Viswanathan) Date: Mon, 3 Jun 2024 15:27:06 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v2] In-Reply-To: References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: On Sun, 2 Jun 2024 15:43:39 GMT, Jatin Bhateja wrote: >> Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. >> >> This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin >> >> PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 >> [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review Comments Incorporated. Looks good to me. ------------- Marked as reviewed by sviswanathan (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19442#pullrequestreview-2094194312 From aph at openjdk.org Mon Jun 3 15:55:16 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 15:55:16 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v16] In-Reply-To: References: Message-ID: > At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. > > In addition, it would be useful to be able to static_assert different registers. > > Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. > > I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: Review feedback ------------- Changes: - all: https://git.openjdk.org/jdk/pull/16617/files - new: https://git.openjdk.org/jdk/pull/16617/files/47681709..df953ad3 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=15 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=14-15 Stats: 4 lines in 1 file changed: 0 ins; 0 del; 4 mod Patch: https://git.openjdk.org/jdk/pull/16617.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/16617/head:pull/16617 PR: https://git.openjdk.org/jdk/pull/16617 From kvn at openjdk.org Mon Jun 3 16:08:01 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Mon, 3 Jun 2024 16:08:01 GMT Subject: RFR: 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 11:57:58 GMT, Tobias Hartmann wrote: > Ubsan detected undefined behavior in `ciTypeFlow::get_start_state` because an invalid value of `4294967295` is assigned to enum `Cell`: > 
https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L150-L152 > > The problem is that if the C++ compiler decides to encode `Cell` with an unsigned int, casting a negative integer value will lead to an underflow and therefore a value > `Cell_max = INT_MAX`. Here, `state->tos()` returns a value < 0: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.cpp#L407 > > which is cast to a `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L211 > > I simply re-wrote the code to not require a negative `Cell` value to iterate over the locals and set them to bottom type. > > Thanks, > Tobias This looks correct. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19520#pullrequestreview-2094310535 From kvn at openjdk.org Mon Jun 3 16:11:01 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Mon, 3 Jun 2024 16:11:01 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 12:39:05 GMT, Christian Hagedorn wrote: > A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. > > #### Idea of Partial Peeling > > Partial Peeling rotates a loop in the hope of being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). > > #### Partial Peeling with Unsigned Test > > However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. 
The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 > > #### Requirements for Using an Unsigned Test > > The Signed and Unsigned Loop Exit Tests do not need to have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: > - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. > - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. > > #### The Requirements Are Currently Broken > > This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 > > #### Why Are the Requirements Broken? > > The reason is that > > i >=u limit > > can only be converted into the two signed comparisons > > i < 0 || i >= limit > > if `limit` is non-negative (i.e. `limit >= 0`): > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 > > This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. > > #### Fixing the Broken Requirements > To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of ... Looks good. 
------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19522#pullrequestreview-2094315565 From stuefe at openjdk.org Mon Jun 3 16:52:05 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Mon, 3 Jun 2024 16:52:05 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> Message-ID: <0pCspZzjm34L1sogLLErvgq-7u395IVrRslWxeKHMh8=.80d821c1-a446-4157-b813-778ec8510b3c@github.com> On Mon, 3 Jun 2024 15:23:32 GMT, Andrew Haley wrote: >> So I'm getting this bizarre failure on arm32. I'm guessing it's actually a compiler bug, but I suppose it might be some dusty corner of C++ to do with template arg substitution. Any thoughts? >> >> >> In file included from /home/runner/work/jdk/jdk/src/hotspot/share/utilities/globalDefinitions.hpp:29, >> from /home/runner/work/jdk/jdk/src/hotspot/share/nmt/memflags.hpp:28, >> from /home/runner/work/jdk/jdk/src/hotspot/share/memory/allocation.hpp:29, >> from ad_arm.hpp:30, >> from ad_arm.cpp:28: >> /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp: In instantiation of ?class AbstractRegSet?: >> /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp:272:30: required from ?constexpr bool different_registers(R, Rx ...) [with R = RegisterImpl*; Rx = {RegisterImpl*, RegisterImpl*}]? >> /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp:278:27: required from ?void assert_different_registers(R, Rx ...) [with R = RegisterImpl*; Rx = {RegisterImpl*, RegisterImpl*}]? >> /home/runner/work/jdk/jdk/src/hotspot/cpu/arm/arm.ad:8984:52: required from here >> /home/runner/work/jdk/jdk/src/hotspot/share/asm/register.hpp:96:26: error: ?number_of_registers? is not a member of ?RegisterImpl*? 
>> 96 | STATIC_ASSERT(RegImpl::number_of_registers <= 64); >> | ^~~~~~~~~~~~~~~~~~~ >> /home/runner/work/jdk/jdk/src/hotspot/share/utilities/debug.hpp:287:44: note: in definition of macro ?STATIC_ASSERT? >> 287 | #define STATIC_ASSERT(Cond) static_assert((Cond), #Cond) >> | ^~~~ > > I just changed the static assert to a regular assert, in the hope that it might work better on arm32. Urgh. Seems on arm32 (and on zero, but I guess that does not matter) Register is a typedef to a pointer: https://github.com/openjdk/jdk/blob/9686e804a2b058955ff88149c54a0a7896c0a2eb/src/hotspot/cpu/arm/register_arm.hpp#L136 and RegImpl is a pointer type, so it cannot have members. On all other platforms, Register is a class (and that works since it overrides operator->() ) I don't see a quick fix other than to make the assert dependent on !arm !zero. Or, leave it for a followup fix. --- BTW, note that on Arm32, we can have 64 float registers: https://github.com/openjdk/jdk/blob/9686e804a2b058955ff88149c54a0a7896c0a2eb/src/hotspot/cpu/arm/register_arm.hpp#L178 So, I would make the underlying type 64-bit for 32-bit builds too. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624768687 From stuefe at openjdk.org Mon Jun 3 16:57:05 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Mon, 3 Jun 2024 16:57:05 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: <0pCspZzjm34L1sogLLErvgq-7u395IVrRslWxeKHMh8=.80d821c1-a446-4157-b813-778ec8510b3c@github.com> References: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> <0pCspZzjm34L1sogLLErvgq-7u395IVrRslWxeKHMh8=.80d821c1-a446-4157-b813-778ec8510b3c@github.com> Message-ID: On Mon, 3 Jun 2024 16:47:00 GMT, Thomas Stuefe wrote: >> I just changed the static assert to a regular assert, in the hope that it might work better on arm32. > > Urgh. 
> > Seems on arm32 (and on zero, but I guess that does not matter) Register is a typedef to a pointer: > > https://github.com/openjdk/jdk/blob/9686e804a2b058955ff88149c54a0a7896c0a2eb/src/hotspot/cpu/arm/register_arm.hpp#L136 > > and RegImpl is a pointer type, so it cannot have members. > > On all other platforms, Register is a class (and that works since it overrides operator->() ) > > I don't see a quick fix other than to make the assert dependent on !arm !zero. Or, leave it for a followup fix. > > --- > > BTW, note that on Arm32, we can have 64 float registers: > > https://github.com/openjdk/jdk/blob/9686e804a2b058955ff88149c54a0a7896c0a2eb/src/hotspot/cpu/arm/register_arm.hpp#L178 > > So, I would make the underlying type 64-bit for 32-bit builds too. BTW Would the runtime assert not prevent this from used as constexpr? ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624776615 From aph at openjdk.org Mon Jun 3 17:19:08 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 17:19:08 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> <0pCspZzjm34L1sogLLErvgq-7u395IVrRslWxeKHMh8=.80d821c1-a446-4157-b813-778ec8510b3c@github.com> Message-ID: On Mon, 3 Jun 2024 16:53:57 GMT, Thomas Stuefe wrote: >> Urgh. >> >> Seems on arm32 (and on zero, but I guess that does not matter) Register is a typedef to a pointer: >> >> https://github.com/openjdk/jdk/blob/9686e804a2b058955ff88149c54a0a7896c0a2eb/src/hotspot/cpu/arm/register_arm.hpp#L136 >> >> and RegImpl is a pointer type, so it cannot have members. >> >> On all other platforms, Register is a class (and that works since it overrides operator->() ) >> >> I don't see a quick fix other than to make the assert dependent on !arm !zero. Or, leave it for a followup fix. 
>> >> --- >> >> BTW, note that on Arm32, we can have 64 float registers: >> >> https://github.com/openjdk/jdk/blob/9686e804a2b058955ff88149c54a0a7896c0a2eb/src/hotspot/cpu/arm/register_arm.hpp#L178 >> >> So, I would make the underlying type 64-bit for 32-bit builds too. > > BTW Would the runtime assert not prevent this from used as constexpr? It doesn't, and I just saw the problem: on Arm, `Register` is a pointer type, whereas on other ports it's a class type. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624806190 From jbhateja at openjdk.org Mon Jun 3 17:24:02 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Mon, 3 Jun 2024 17:24:02 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v2] In-Reply-To: References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: On Sun, 2 Jun 2024 15:43:39 GMT, Jatin Bhateja wrote: >> Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. >> >> This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. >> >> Kindly review and share your feedback. 
>> >> Best Regards, >> Jatin >> >> PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 >> [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review Comments Incorporated. Hi @kvn / @TobiHartmann , May I please request you for second reviewer clearance. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19442#issuecomment-2145745358 From aph at openjdk.org Mon Jun 3 17:27:17 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 17:27:17 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v17] In-Reply-To: References: Message-ID: > At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. > > In addition, it would be useful to be able to static_assert different registers. > > Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. > > I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. 
Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: Kludge to fix build on Arm 32 ------------- Changes: - all: https://git.openjdk.org/jdk/pull/16617/files - new: https://git.openjdk.org/jdk/pull/16617/files/df953ad3..eac03cc1 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=16 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=15-16 Stats: 4 lines in 1 file changed: 3 ins; 1 del; 0 mod Patch: https://git.openjdk.org/jdk/pull/16617.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/16617/head:pull/16617 PR: https://git.openjdk.org/jdk/pull/16617 From stuefe at openjdk.org Mon Jun 3 17:38:09 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Mon, 3 Jun 2024 17:38:09 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v17] In-Reply-To: References: Message-ID: <5ylgxbLHK9_o0K7AjjOcYzdbr4TOyGj1xJ35RKxbFkA=.ba2cb3a1-eec1-43af-b0c8-d85314b320ec@github.com> On Mon, 3 Jun 2024 17:27:17 GMT, Andrew Haley wrote: >> At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. >> >> In addition, it would be useful to be able to static_assert different registers. >> >> Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. >> >> I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. > > Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: > > Kludge to fix build on Arm 32 Looks good to me. Thanks for working in my remarks. src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp line 71: > 69: #include > 70: > 71: static_assert(different_registers(zr, sp), "fucked"); Debugging remnant? 
------------- Marked as reviewed by stuefe (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/16617#pullrequestreview-2094492446 PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624826751 From aph at openjdk.org Mon Jun 3 17:38:10 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 17:38:10 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> <0pCspZzjm34L1sogLLErvgq-7u395IVrRslWxeKHMh8=.80d821c1-a446-4157-b813-778ec8510b3c@github.com> Message-ID: On Mon, 3 Jun 2024 17:16:29 GMT, Andrew Haley wrote: >> BTW Would the runtime assert not prevent this from used as constexpr? > > It doesn't, and I just saw the problem: on Arm, `Register` is a pointer type, whereas on other ports it's a class type. Looks like our comments crossed over. I'm building Zero and it's OK so far. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624823844 From aph at openjdk.org Mon Jun 3 17:38:10 2024 From: aph at openjdk.org (Andrew Haley) Date: Mon, 3 Jun 2024 17:38:10 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> <0pCspZzjm34L1sogLLErvgq-7u395IVrRslWxeKHMh8=.80d821c1-a446-4157-b813-778ec8510b3c@github.com> Message-ID: On Mon, 3 Jun 2024 17:31:55 GMT, Andrew Haley wrote: >> It doesn't, and I just saw the problem: on Arm, `Register` is a pointer type, whereas on other ports it's a class type. > > Looks like our comments crossed over. I'm building Zero and it's OK so far. I guess Arm never got converted. That port needs some love. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624824601 From stuefe at openjdk.org Mon Jun 3 17:38:10 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Mon, 3 Jun 2024 17:38:10 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v12] In-Reply-To: References: <5iRmAdR44HLp8pkZbGBZZ-URJIevAjbASptOhMqHzG0=.9e434754-ec5b-40b4-95d1-43c3d5142615@github.com> <0pCspZzjm34L1sogLLErvgq-7u395IVrRslWxeKHMh8=.80d821c1-a446-4157-b813-778ec8510b3c@github.com> Message-ID: On Mon, 3 Jun 2024 17:32:39 GMT, Andrew Haley wrote: >> Looks like our comments crossed over. I'm building Zero and it's OK so far. > > I guess Arm never got converted. That port needs some love. Or, deprecation. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1624825372 From duke at openjdk.org Mon Jun 3 19:17:07 2024 From: duke at openjdk.org (ArsenyBochkarev) Date: Mon, 3 Jun 2024 19:17:07 GMT Subject: RFR: 8317720: RISC-V: Implement Adler32 intrinsic [v9] In-Reply-To: References: Message-ID: > Hello everyone! Please review this ~non-vectorized~ implementation of `_updateBytesAdler32` intrinsic. Reference implementation for AArch64 can be found [here](https://github.com/openjdk/jdk9/blob/master/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp#L3281). > > ### Correctness checks > > Test `test/hotspot/jtreg/compiler/intrinsics/zip/TestAdler32.java` is ok. All tier1 also passed. 
> > ### Performance results on T-Head board > > Enabled intrinsic: > > | Benchmark | (count) | Mode | Cnt | Score | Error | Units | > | ------------------------------------- | ----------- | ------ | --------- | ------ | --------- | ---------- | > | Adler32.TestAdler32.testAdler32Update | 64 | thrpt | 25 | 5522.693 | 23.387 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 128 | thrpt | 25 | 3430.761 | 9.210 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 256 | thrpt | 25 | 1962.888 | 5.323 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 512 | thrpt | 25 | 1050.938 | 0.144 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 1024 | thrpt | 25 | 549.227 | 0.375 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 2048 | thrpt | 25 | 280.829 | 0.170 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 5012 | thrpt | 25 | 116.333 | 0.057 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 8192 | thrpt | 25 | 71.392 | 0.060 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 16384 | thrpt | 25 | 35.784 | 0.019 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 32768 | thrpt | 25 | 17.924 | 0.010 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 65536 | thrpt | 25 | 8.940 | 0.003 | ops/ms | > > Disabled intrinsic: > > | Benchmark | (count) | Mode | Cnt | Score | Error | Units | > | ------------------------------------- | ----------- | ------ | --------- | ------ | --------- | ---------- | > |Adler32.TestAdler32.testAdler32Update|64|thrpt|25|655.633|5.845|ops/ms| > |Adler32.TestAdler32.testAdler32Update|128|thrpt|25|587.418|10.062|ops/ms| > |Adler32.TestAdler32.testAdler32Update|256|thrpt|25|546.675|11.598|ops/ms| > |Adler32.TestAdler32.testAdler32Update|512|thrpt|25|432.328|11.517|ops/ms| > |Adler32.TestAdler32.testAdler32Update|1024|thrpt|25|311.771|4.238|ops/ms| > |Adler32.TestAdler32.testAdler32Update|2048|thrpt|25|202.648|2.486|ops/ms| > |Adler32.TestAdler32.testAdler32Update|5012|thrpt|25|100.246|1.119|ops/ms| > 
|Adler32.TestAdler32.testAdler32Update|8192|t... ArsenyBochkarev has updated the pull request incrementally with eight additional commits since the last revision: - Fix comment - Add count initialization to 64 for by64 loop - Clarify that s1_new sign-extension is safe - Fix comment about redsum constraints - Use vrsub instead of vmv + vsub - Remove comment about group for vtemp1 - Use v31 as vzero - Fix group comment ------------- Changes: - all: https://git.openjdk.org/jdk/pull/18382/files - new: https://git.openjdk.org/jdk/pull/18382/files/453c169b..eb14ec10 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=18382&range=08 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=18382&range=07-08 Stats: 24 lines in 1 file changed: 6 ins; 8 del; 10 mod Patch: https://git.openjdk.org/jdk/pull/18382.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/18382/head:pull/18382 PR: https://git.openjdk.org/jdk/pull/18382 From duke at openjdk.org Mon Jun 3 19:17:08 2024 From: duke at openjdk.org (ArsenyBochkarev) Date: Mon, 3 Jun 2024 19:17:08 GMT Subject: RFR: 8317720: RISC-V: Implement Adler32 intrinsic [v8] In-Reply-To: References: Message-ID: <8EfMWFHX07yLq5itbOO2mFjcJF8-VpJuub7DAEvRYX4=.c018b4f6-0352-4580-aabd-511336812dcf@github.com> On Mon, 27 May 2024 12:07:13 GMT, Fei Yang wrote: >> ArsenyBochkarev has updated the pull request incrementally with three additional commits since the last revision: >> >> - Partially unroll L_by16_loop >> - Fix by64 function for vlen > 128 >> - Fix by16 function for vlen > 128 > > src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5069: > >> 5067: >> 5068: // Load data >> 5069: __ vsetvli(temp0, count, Assembler::e8, Assembler::m4); > > Maybe add a simple assertion about `count` before this to make sure that it equals 64 on entry? Or let this function initialize `count` to 64, which I guess won't impact performance much. I see. 
There is no way to check this at compile time, so I added `count` initialization to 64 in `adler32_process_bytes_by64` > src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5083: > >> 5081: // Summing up calculated results for s2_new >> 5082: __ vsetvli(temp0, count, Assembler::e16, Assembler::m4); >> 5083: // 0xFF * 0x10 = 0xFF0 max per single vector element, > > I don't quite understand this line of code comment. What does `0x10` here stand for? These comment lines were about upper bounds for reduction sum. However, I saw that they were not correct, thanks for pointing it out! Actually the correct one is `0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0` per whole 4-register-size group > src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5096: > >> 5094: // Extracting results for: >> 5095: // s1_new >> 5096: __ vmv_x_s(temp0, vs1acc[0]); > > Note that `vmv_x_s` will sign-extend the `e16` reduction result in `vs1acc[0]`. Is that safe? It is safe, yes. Left additional comment to clarify it up the code > src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5207: > >> 5205: Register step = x28; // t3 >> 5206: >> 5207: VectorRegister vzero = v4; // group: v5, v6, v7 > > I see `vzero` is only used as the scalar source for vector integer reduction instructions, so it's not necessary for `vzero` to be a group of: v4, v5, v6, v7. Seems that we can assign the final v31 for `vzero` and thus free vector register group of v4, v5, v6, v7. > > And here is what the RVV spec says for reference: > > Vector reduction operations take a vector register group of elements and a scalar held in element 0 > of a vector register, and perform a reduction using some binary operator, to produce a scalar result > in element 0 of a vector register. The scalar input and output operands are held in element 0 of a > single vector register, not a vector register group, so any vector register can be the scalar source > or destination of a vector reduction regardless of LMUL setting. 
Done, thanks > src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5217: > >> 5215: v16, v18, v20, v22 >> 5216: }; >> 5217: VectorRegister vtable_64 = v24; // group: v25, v26, v27 > > Suggestion: `// group: v24, v25, v26, v27` Fixed > src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5220: > >> 5218: VectorRegister vtable_16 = (MaxVectorSize == 16) ? v27 : v30; >> 5219: VectorRegister vtemp1 = v28; // group: v29, v30, v31 >> 5220: VectorRegister vtemp2 = v29; > > Similar for `vtemp1` and `vtemp2` which are only used as the scalar destination for vector integer reduction instructions: it's not necessary for them to be a vector register group. So you might want to remove the code comment for `vtemp1`. Done > src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5234: > >> 5232: __ vid_v(vtemp1); >> 5233: __ vmv_v_x(vtable_64, temp1); >> 5234: __ vsub_vv(vtable_64, vtable_64, vtemp1); > > I think a more simpler `vrsub_vx vtable_64, vtemp1, temp1` will do? This will help save the `vmv_v_x` instruction. Thanks for pointing out, done! > src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5245: > >> 5243: __ vid_v(vtemp1); >> 5244: __ vmv_v_x(vtable_16, temp1); >> 5245: __ vsub_vv(vtable_16, vtable_16, vtemp1); > > Similar here: `vrsub_vx vtable_16, vtemp1, temp1`. 
Done ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1624927998 PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1624927850 PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1624927907 PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1624928180 PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1624928198 PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1624927931 PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1624928147 PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1624928135 From vlivanov at openjdk.org Mon Jun 3 19:36:58 2024 From: vlivanov at openjdk.org (Vladimir Ivanov) Date: Mon, 3 Jun 2024 19:36:58 GMT Subject: RFR: 8332547: Unloaded signature classes in DirectMethodHandles [v2] In-Reply-To: References: Message-ID: > JVM routinely installs loader constraints for unloaded signature classes when method resolution takes place. MethodHandle resolution took a different route and eagerly resolves signature classes instead (see `java.lang.invoke.MemberName$Factory::resolve` and `sun.invoke.util.VerifyAccess::isTypeVisible` for details). > > There's a micro-optimization which bypasses eager resolution for `java.*` classes. The downside is that `java.*` signature classes can show up as unloaded. It manifests as inlining failures during JIT-compilation and may cause severe performance issues. > > Proposed fix removes the aforementioned special case logic during `MethodHandle` resolution. > > In some cases it may slow down `MethodHandle` construction a bit (e.g., when repeatedly constructing `DirectMethodHandle`s with lots of arguments), but `MethodHandle` construction step is not performance critical. 
> > Testing: hs-tier1 - hs-tier4 Vladimir Ivanov has updated the pull request incrementally with one additional commit since the last revision: Renaming: isTypeVisible -> ensureTypeVisible ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19319/files - new: https://git.openjdk.org/jdk/pull/19319/files/805d42fc..058cdba3 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19319&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19319&range=00-01 Stats: 9 lines in 3 files changed: 0 ins; 0 del; 9 mod Patch: https://git.openjdk.org/jdk/pull/19319.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19319/head:pull/19319 PR: https://git.openjdk.org/jdk/pull/19319 From vlivanov at openjdk.org Mon Jun 3 19:36:58 2024 From: vlivanov at openjdk.org (Vladimir Ivanov) Date: Mon, 3 Jun 2024 19:36:58 GMT Subject: RFR: 8332547: Unloaded signature classes in DirectMethodHandles In-Reply-To: References: Message-ID: On Tue, 21 May 2024 20:14:41 GMT, Jorn Vernee wrote: >> Class loading triggered by `Class.forName()` call is at the core of `isTypeVisible`. (The rest is fast path checks.) It's what makes `isTypeVisible` query idempotent. >> >> I can definitely name it differently (e.g, `ensureTypeVisible`), but making a separate class loading pass across signature classes doesn't make much sense. > >> I can definitely name it differently (e.g, ensureTypeVisible), but making a separate class loading pass across signature classes doesn't make much sense. > > Ok, in that case I suggest also renaming `MemberName::checkForTypeAlias`, maybe to `ensureTypeVisible` as well. @JornVernee ok, renamed. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19319#issuecomment-2145967665 From duke at openjdk.org Mon Jun 3 19:38:44 2024 From: duke at openjdk.org (ArsenyBochkarev) Date: Mon, 3 Jun 2024 19:38:44 GMT Subject: RFR: 8317720: RISC-V: Implement Adler32 intrinsic [v10] In-Reply-To: References: Message-ID: > Hello everyone! 
Please review this ~non-vectorized~ implementation of `_updateBytesAdler32` intrinsic. Reference implementation for AArch64 can be found [here](https://github.com/openjdk/jdk9/blob/master/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp#L3281). > > ### Correctness checks > > Test `test/hotspot/jtreg/compiler/intrinsics/zip/TestAdler32.java` is ok. All tier1 also passed. > > ### Performance results on T-Head board > > Enabled intrinsic: > > | Benchmark | (count) | Mode | Cnt | Score | Error | Units | > | ------------------------------------- | ----------- | ------ | --------- | ------ | --------- | ---------- | > | Adler32.TestAdler32.testAdler32Update | 64 | thrpt | 25 | 5522.693 | 23.387 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 128 | thrpt | 25 | 3430.761 | 9.210 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 256 | thrpt | 25 | 1962.888 | 5.323 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 512 | thrpt | 25 | 1050.938 | 0.144 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 1024 | thrpt | 25 | 549.227 | 0.375 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 2048 | thrpt | 25 | 280.829 | 0.170 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 5012 | thrpt | 25 | 116.333 | 0.057 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 8192 | thrpt | 25 | 71.392 | 0.060 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 16384 | thrpt | 25 | 35.784 | 0.019 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 32768 | thrpt | 25 | 17.924 | 0.010 | ops/ms | > | Adler32.TestAdler32.testAdler32Update | 65536 | thrpt | 25 | 8.940 | 0.003 | ops/ms | > > Disabled intrinsic: > > | Benchmark | (count) | Mode | Cnt | Score | Error | Units | > | ------------------------------------- | ----------- | ------ | --------- | ------ | --------- | ---------- | > |Adler32.TestAdler32.testAdler32Update|64|thrpt|25|655.633|5.845|ops/ms| > |Adler32.TestAdler32.testAdler32Update|128|thrpt|25|587.418|10.062|ops/ms| > 
|Adler32.TestAdler32.testAdler32Update|256|thrpt|25|546.675|11.598|ops/ms| > |Adler32.TestAdler32.testAdler32Update|512|thrpt|25|432.328|11.517|ops/ms| > |Adler32.TestAdler32.testAdler32Update|1024|thrpt|25|311.771|4.238|ops/ms| > |Adler32.TestAdler32.testAdler32Update|2048|thrpt|25|202.648|2.486|ops/ms| > |Adler32.TestAdler32.testAdler32Update|5012|thrpt|25|100.246|1.119|ops/ms| > |Adler32.TestAdler32.testAdler32Update|8192|t... ArsenyBochkarev has updated the pull request incrementally with two additional commits since the last revision: - Fix vrsub_vi for case of vlen > 128 - Add process_bytes_by32 function ------------- Changes: - all: https://git.openjdk.org/jdk/pull/18382/files - new: https://git.openjdk.org/jdk/pull/18382/files/eb14ec10..8530d662 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=18382&range=09 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=18382&range=08-09 Stats: 69 lines in 1 file changed: 67 ins; 0 del; 2 mod Patch: https://git.openjdk.org/jdk/pull/18382.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/18382/head:pull/18382 PR: https://git.openjdk.org/jdk/pull/18382 From duke at openjdk.org Mon Jun 3 19:38:44 2024 From: duke at openjdk.org (ArsenyBochkarev) Date: Mon, 3 Jun 2024 19:38:44 GMT Subject: RFR: 8317720: RISC-V: Implement Adler32 intrinsic [v8] In-Reply-To: References: Message-ID: On Mon, 27 May 2024 14:20:39 GMT, Fei Yang wrote: >> ArsenyBochkarev has updated the pull request incrementally with three additional commits since the last revision: >> >> - Partially unroll L_by16_loop >> - Fix by64 function for vlen > 128 >> - Fix by16 function for vlen > 128 > > src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5303: > >> 5301: const int remainder = 3; >> 5302: adler32_process_bytes_by16(buff, s1, s2, right_16_bits, vtable_16, vzero, >> 5303: vbytes, vs1acc, vs2acc, temp0, temp1, temp2, vtemp1, vtemp2, remainder); > > Maybe deserves another `adler32_process_bytes_by32` here? 
Then you do one `adler32_process_bytes_by32` and one `adler32_process_bytes_by16` for the remaining 3 iterations. Done ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1624951017 From vlivanov at openjdk.org Mon Jun 3 19:44:41 2024 From: vlivanov at openjdk.org (Vladimir Ivanov) Date: Mon, 3 Jun 2024 19:44:41 GMT Subject: RFR: 8331658: secondary_super_cache does not scale well: C1 [v2] In-Reply-To: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> References: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> Message-ID: On Wed, 29 May 2024 09:32:41 GMT, Andrew Haley wrote: >> This is the C1 version of [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). >> >> The new logic in this PR is as simple as I can make it. It is a somewhat-simplified version of the C2 change in [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). In order to reduce risk I haven't touched the existing slow subtype stub. >> The register allocation logic in the existing code is pretty gnarly, and I have no desire to break anything at this point in the release cycle, so I have allocated just one register more than the existing code does. >> >> Performance is pretty good. Before and after: >> >> x64, AMD 2950X, 8 cores: >> >> >> Benchmark Mode Cnt Score Error Units >> SecondarySuperCacheHits.test avgt 5 0.959 ± 0.091 ns/op >> SecondarySuperCacheInterContention.test avgt 5 42.931 ± 6.951 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 42.397 ± 7.708 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 43.466 ± 8.238 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 74.660 ± 0.127 ns/op >> >> SecondarySuperCacheHits.test avgt 5 1.480 ± 0.077 ns/op >> SecondarySuperCacheInterContention.test avgt 5 1.461 ± 0.063 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 1.767 ±
0.078 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 1.155 ± 0.052 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 1.421 ± 0.002 ns/op >> >> AArch64, Mac M3, 8 cores: >> >> >> Benchmark Mode Cnt Score Error Units >> SecondarySuperCacheHits.test avgt 5 0.835 ± 0.021 ns/op >> SecondarySuperCacheInterContention.test avgt 5 74.078 ± 18.095 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 81.863 ± 42.492 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 66.293 ± 11.254 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 335.563 ± 6.171 ns/op >> >> SecondarySuperCacheHits.test avgt 5 1.212 ± 0.004 ns/op >> SecondarySuperCacheInterContention.test avgt 5 0.871 ± 0.002 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 0.626 ± 0.003 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 1.115 ± 0.006 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 0.696 ± 0.001 ns/op >> >> >> >> The first test, `SecondarySuperCacheHits`, shows a small regression. It's... > > Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: > > JDK-8331658: secondary_super_cache does not scale well: C1 It's unfortunate to see a C1-specific version of secondary supers table lookup. Why don't you reuse `MacroAssembler::lookup_secondary_supers_table` instead? Also, in the context of C1, do the performance benefits justify the additional implementation complexity? As an alternative, migrating `MacroAssembler::check_klass_subtype_slow_path` away from linear search to a table lookup would also do the job and cover all cases of subtype checks in the JVM.
------------- PR Comment: https://git.openjdk.org/jdk/pull/19426#issuecomment-2145980997 From jvernee at openjdk.org Mon Jun 3 20:51:32 2024 From: jvernee at openjdk.org (Jorn Vernee) Date: Mon, 3 Jun 2024 20:51:32 GMT Subject: RFR: 8332547: Unloaded signature classes in DirectMethodHandles [v2] In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 19:36:58 GMT, Vladimir Ivanov wrote: >> JVM routinely installs loader constraints for unloaded signature classes when method resolution takes place. MethodHandle resolution took a different route and eagerly resolves signature classes instead (see `java.lang.invoke.MemberName$Factory::resolve` and `sun.invoke.util.VerifyAccess::isTypeVisible` for details). >> >> There's a micro-optimization which bypasses eager resolution for `java.*` classes. The downside is that `java.*` signature classes can show up as unloaded. It manifests as inlining failures during JIT-compilation and may cause severe performance issues. >> >> Proposed fix removes the aforementioned special case logic during `MethodHandle` resolution. >> >> In some cases it may slow down `MethodHandle` construction a bit (e.g., when repeatedly constructing `DirectMethodHandle`s with lots of arguments), but `MethodHandle` construction step is not performance critical. >> >> Testing: hs-tier1 - hs-tier4 > > Vladimir Ivanov has updated the pull request incrementally with one additional commit since the last revision: > > Renaming: isTypeVisible -> ensureTypeVisible Thanks ------------- Marked as reviewed by jvernee (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19319#pullrequestreview-2094825208 From liach at openjdk.org Mon Jun 3 22:33:49 2024 From: liach at openjdk.org (Chen Liang) Date: Mon, 3 Jun 2024 22:33:49 GMT Subject: RFR: 8332547: Unloaded signature classes in DirectMethodHandles [v2] In-Reply-To: References: Message-ID: <1WnYxErQWK4DWeiSqd-P0wB5k5HrlTkZtjtVp8bXV8A=.d74c9626-1889-4ae9-b197-9991cb21ee06@github.com> On Mon, 3 Jun 2024 19:36:58 GMT, Vladimir Ivanov wrote: >> JVM routinely installs loader constraints for unloaded signature classes when method resolution takes place. MethodHandle resolution took a different route and eagerly resolves signature classes instead (see `java.lang.invoke.MemberName$Factory::resolve` and `sun.invoke.util.VerifyAccess::isTypeVisible` for details). >> >> There's a micro-optimization which bypasses eager resolution for `java.*` classes. The downside is that `java.*` signature classes can show up as unloaded. It manifests as inlining failures during JIT-compilation and may cause severe performance issues. >> >> Proposed fix removes the aforementioned special case logic during `MethodHandle` resolution. >> >> In some cases it may slow down `MethodHandle` construction a bit (e.g., when repeatedly constructing `DirectMethodHandle`s with lots of arguments), but `MethodHandle` construction step is not performance critical. >> >> Testing: hs-tier1 - hs-tier4 > > Vladimir Ivanov has updated the pull request incrementally with one additional commit since the last revision: > > Renaming: isTypeVisible -> ensureTypeVisible Marked as reviewed by liach (Author). 
------------- PR Review: https://git.openjdk.org/jdk/pull/19319#pullrequestreview-2094960904 From duke at openjdk.org Tue Jun 4 02:22:11 2024 From: duke at openjdk.org (kuaiwei) Date: Tue, 4 Jun 2024 02:22:11 GMT Subject: RFR: 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp In-Reply-To: <3wjDkxySe8t8A8feJjH9SzuEK2a2BDt9xBFwJOW8iac=.8b2ba0b1-9457-41da-be9c-2b477a2fa655@github.com> References: <3wjDkxySe8t8A8feJjH9SzuEK2a2BDt9xBFwJOW8iac=.8b2ba0b1-9457-41da-be9c-2b477a2fa655@github.com> Message-ID: On Mon, 3 Jun 2024 11:40:59 GMT, Andrew Haley wrote: > OK, it's some remnants of the Graal-based jaotc. Patch approved. Thanks for quick review. Wait for another approve. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19518#issuecomment-2146443365 From vlivanov at openjdk.org Tue Jun 4 04:05:12 2024 From: vlivanov at openjdk.org (Vladimir Ivanov) Date: Tue, 4 Jun 2024 04:05:12 GMT Subject: RFR: 8332547: Unloaded signature classes in DirectMethodHandles [v2] In-Reply-To: References: Message-ID: <2fd-qvBsJuxYtZmX22F7u5ycAN7uDMobngVKi3Wqrtc=.dc93d9da-948d-433d-9aeb-a205407a41b5@github.com> On Mon, 3 Jun 2024 19:36:58 GMT, Vladimir Ivanov wrote: >> JVM routinely installs loader constraints for unloaded signature classes when method resolution takes place. MethodHandle resolution took a different route and eagerly resolves signature classes instead (see `java.lang.invoke.MemberName$Factory::resolve` and `sun.invoke.util.VerifyAccess::isTypeVisible` for details). >> >> There's a micro-optimization which bypasses eager resolution for `java.*` classes. The downside is that `java.*` signature classes can show up as unloaded. It manifests as inlining failures during JIT-compilation and may cause severe performance issues. >> >> Proposed fix removes the aforementioned special case logic during `MethodHandle` resolution. 
>> >> In some cases it may slow down `MethodHandle` construction a bit (e.g., when repeatedly constructing `DirectMethodHandle`s with lots of arguments), but `MethodHandle` construction step is not performance critical. >> >> Testing: hs-tier1 - hs-tier4 > > Vladimir Ivanov has updated the pull request incrementally with one additional commit since the last revision: > > Renaming: isTypeVisible -> ensureTypeVisible Thanks for the reviews, Jorn and Chen. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19319#issuecomment-2146542294 From vlivanov at openjdk.org Tue Jun 4 04:05:13 2024 From: vlivanov at openjdk.org (Vladimir Ivanov) Date: Tue, 4 Jun 2024 04:05:13 GMT Subject: Integrated: 8332547: Unloaded signature classes in DirectMethodHandles In-Reply-To: References: Message-ID: <88jGtEXd70Hb-Qjg9oZKX_N3Ut3OOn8hAAhb1GsTFnU=.92540a2f-028d-4d4c-b2f1-e72f25d91942@github.com> On Mon, 20 May 2024 21:29:20 GMT, Vladimir Ivanov wrote: > JVM routinely installs loader constraints for unloaded signature classes when method resolution takes place. MethodHandle resolution took a different route and eagerly resolves signature classes instead (see `java.lang.invoke.MemberName$Factory::resolve` and `sun.invoke.util.VerifyAccess::isTypeVisible` for details). > > There's a micro-optimization which bypasses eager resolution for `java.*` classes. The downside is that `java.*` signature classes can show up as unloaded. It manifests as inlining failures during JIT-compilation and may cause severe performance issues. > > Proposed fix removes the aforementioned special case logic during `MethodHandle` resolution. > > In some cases it may slow down `MethodHandle` construction a bit (e.g., when repeatedly constructing `DirectMethodHandle`s with lots of arguments), but `MethodHandle` construction step is not performance critical. > > Testing: hs-tier1 - hs-tier4 This pull request has now been integrated. 
Changeset: 29e10e45 Author: Vladimir Ivanov URL: https://git.openjdk.org/jdk/commit/29e10e4582c1a844a6db4c42ba01bd1d6d4dfd52 Stats: 83 lines in 4 files changed: 68 ins; 0 del; 15 mod 8332547: Unloaded signature classes in DirectMethodHandles Reviewed-by: jvernee, liach ------------- PR: https://git.openjdk.org/jdk/pull/19319 From thartmann at openjdk.org Tue Jun 4 06:18:12 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Tue, 4 Jun 2024 06:18:12 GMT Subject: RFR: 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state In-Reply-To: References: Message-ID: <0L-dcZjTCT4tq85qzlqocWAMQDBINHAR28-HcNYTvws=.fb10d9f5-e361-4d68-81c5-42ee6747f8b3@github.com> On Mon, 3 Jun 2024 11:57:58 GMT, Tobias Hartmann wrote: > Ubsan detected undefined behavior in `ciTypeFlow::get_start_state` because an invalid value of `4294967295` is assigned to enum `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L150-L152 > > The problem is that if the C++ compiler decides to encode `Cell` with an unsigned int, casting a negative integer value will lead to an underflow and therefore a value > `Cell_max = INT_MAX`. Here, `state->tos()` returns a value < 0: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.cpp#L407 > > which is casted to a `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L211 > > I simply re-wrote the code to not require a negative `Cell` value to iterate over the locals and setting them to bottom type. > > Thanks, > Tobias Thanks for the review, Vladimir! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19520#issuecomment-2146694073 From epeter at openjdk.org Tue Jun 4 06:23:23 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 4 Jun 2024 06:23:23 GMT Subject: RFR: 8332905: C2 SuperWord: bad AD file, with RotateRightV and first operand not a pack In-Reply-To: References: Message-ID: On Thu, 30 May 2024 06:23:34 GMT, Tobias Hartmann wrote: >> I just discovered this bug by manual code inspection, and found a reproducer. >> >> It seems to be a regression of [JDK-8248830](https://bugs.openjdk.org/browse/JDK-8248830), that is when RotateRightV was added to SuperWord. >> >> The problem is that we directly get the input node, rather than the `vector_opd`, which fails if that input is not a vector already, but for example a `PopulateIndex` pattern that is only vectorized when calling `vector_opd`. >> >> Before this patch: it looks like this: >> >> } else if (VectorNode::is_scalar_rotate(n)) { >> Node* in1 = first->in(1); >> Node* in2 = first->in(2); >> >> >> But at least `in1` should be using `vector_opd`, like most other ops: >> >> ` Node* in1 = vector_opd(p, 1);` >> >> When the input is a `PopulateIndex` pattern, then `first->in(1) `gives us the iv-phi, which is a scalar. `vector_opd` would produce a `PopulateIndex` vector. >> >> In the ad-file, we get an error, because we do not expect a scalar as the first operand of the RotateRightV, but a vector. > > Looks good to me too. > > Fix version was still set to JDK 24 (the bot now warns about this as well: "The fixVersion in this issue is [24] but the fixVersion in .jcheck/conf is 23, a new backport will be created when this pr is integrated."). I set it back to JDK 23. Thanks @TobiHartmann @chhagedorn for the reviews! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19445#issuecomment-2146701006 From epeter at openjdk.org Tue Jun 4 06:23:24 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 4 Jun 2024 06:23:24 GMT Subject: Integrated: 8332905: C2 SuperWord: bad AD file, with RotateRightV and first operand not a pack In-Reply-To: References: Message-ID: On Wed, 29 May 2024 07:20:33 GMT, Emanuel Peter wrote: > I just discovered this bug by manual code inspection, and found a reproducer. > > It seems to be a regression of [JDK-8248830](https://bugs.openjdk.org/browse/JDK-8248830), that is when RotateRightV was added to SuperWord. > > The problem is that we directly get the input node, rather than the `vector_opd`, which fails if that input is not a vector already, but for example a `PopulateIndex` pattern that is only vectorized when calling `vector_opd`. > > Before this patch: it looks like this: > > } else if (VectorNode::is_scalar_rotate(n)) { > Node* in1 = first->in(1); > Node* in2 = first->in(2); > > > But at least `in1` should be using `vector_opd`, like most other ops: > > ` Node* in1 = vector_opd(p, 1);` > > When the input is a `PopulateIndex` pattern, then `first->in(1) `gives us the iv-phi, which is a scalar. `vector_opd` would produce a `PopulateIndex` vector. > > In the ad-file, we get an error, because we do not expect a scalar as the first operand of the RotateRightV, but a vector. This pull request has now been integrated. 
Changeset: 67d6f3ca Author: Emanuel Peter URL: https://git.openjdk.org/jdk/commit/67d6f3ca9e8d1312c9e3a85dbe19903619f59064 Stats: 21 lines in 2 files changed: 18 ins; 0 del; 3 mod 8332905: C2 SuperWord: bad AD file, with RotateRightV and first operand not a pack Reviewed-by: chagedorn, thartmann ------------- PR: https://git.openjdk.org/jdk/pull/19445 From syan at openjdk.org Tue Jun 4 07:51:36 2024 From: syan at openjdk.org (SendaoYan) Date: Tue, 4 Jun 2024 07:51:36 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles Message-ID: Hi all, This PR removes several extra empty spaces and extra empty lines in several Makefiles. It's a trivial fix, no risk. Thanks. ------------- Commit messages: - 8333477: Delete extra empty spaces in Makefiles Changes: https://git.openjdk.org/jdk/pull/19537/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19537&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333477 Stats: 9 lines in 4 files changed: 0 ins; 1 del; 8 mod Patch: https://git.openjdk.org/jdk/pull/19537.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19537/head:pull/19537 PR: https://git.openjdk.org/jdk/pull/19537 From aph at openjdk.org Tue Jun 4 07:58:22 2024 From: aph at openjdk.org (Andrew Haley) Date: Tue, 4 Jun 2024 07:58:22 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v17] In-Reply-To: <5ylgxbLHK9_o0K7AjjOcYzdbr4TOyGj1xJ35RKxbFkA=.ba2cb3a1-eec1-43af-b0c8-d85314b320ec@github.com> References: <5ylgxbLHK9_o0K7AjjOcYzdbr4TOyGj1xJ35RKxbFkA=.ba2cb3a1-eec1-43af-b0c8-d85314b320ec@github.com> Message-ID: <4GMsGErvWikJ0U1A_3-b3S69vUkA93aFAZ9_5vv7gyw=.5df176e7-aab0-4b59-9609-14867e8fd82e@github.com> On Mon, 3 Jun 2024 17:34:43 GMT, Thomas Stuefe wrote: >> Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: >> >> Kludge to fix build on Arm 32 > > src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp line 71: > >> 69: #include >> 70: >> 71:
static_assert(different_registers(zr, sp), "fucked"); > > Debugging remnant? Ah, LOL! :-) ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/16617#discussion_r1625533145 From dfenacci at openjdk.org Tue Jun 4 08:01:08 2024 From: dfenacci at openjdk.org (Damon Fenacci) Date: Tue, 4 Jun 2024 08:01:08 GMT Subject: RFR: 8326615: C1/C2 don't handle allocation failure properly during initialization (RuntimeStub::new_runtime_stub fatal crash) [v5] In-Reply-To: <8jOAyeuwyQ1V-knX_8AHsdOci0cr5mfcyKseBEt8Kpg=.79b51a11-0aae-4676-aafe-df6113ac6fc7@github.com> References: <8jOAyeuwyQ1V-knX_8AHsdOci0cr5mfcyKseBEt8Kpg=.79b51a11-0aae-4676-aafe-df6113ac6fc7@github.com> Message-ID: <6Hv1R4l3Ey4YafznXHRBHS_UkW8sWMXPRQLBvwN98X8=.d56a4edc-30aa-48f5-814b-a099edc07063@github.com> On Thu, 30 May 2024 22:44:53 GMT, Dean Long wrote: > This looks OK, but isn't it a lot of changes just to get this test to pass? Aren't all of these allocation failures ultimately fatal? Is there a simpler way to handle this problem? It seems a bit much indeed but I think there is potentially always the possibility of not failing but only disabling the compiler. @dean-long do you think the VM would anyway fail later on? ------------- PR Comment: https://git.openjdk.org/jdk/pull/19280#issuecomment-2146860084 From galder at openjdk.org Tue Jun 4 08:17:09 2024 From: galder at openjdk.org (Galder =?UTF-8?B?WmFtYXJyZcOxbw==?=) Date: Tue, 4 Jun 2024 08:17:09 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers Message-ID: Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. 
There's no barrier added in the x86 C1 macro assembler because there is nothing to do there. I've run the following tests: * tier 1 on darwin/aarch64 * tier 1 on linux/x86_64 * `hotspot_compiler` tests on darwin/aarch64 * `copy.clone.arrays` jcstress tests on darwin/aarch64. I tried but was unable to create a standalone test for the jdk source tree that would fail. FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. ------------- Commit messages: - Membar after array copy for clone intrinsic on aarch64 Changes: https://git.openjdk.org/jdk/pull/19538/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19538&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8332670 Stats: 7 lines in 2 files changed: 6 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19538.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19538/head:pull/19538 PR: https://git.openjdk.org/jdk/pull/19538 From galder at openjdk.org Tue Jun 4 08:29:02 2024 From: galder at openjdk.org (Galder =?UTF-8?B?WmFtYXJyZcOxbw==?=) Date: Tue, 4 Jun 2024 08:29:02 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: References: Message-ID: <-qWG3SDA1axsnT9WwUis0XNhhGXYnHqjqfAiXaR6AbU=.99069589-d57c-4440-925b-947b5247e9e8@github.com> On Tue, 4 Jun 2024 08:10:59 GMT, Galder Zamarreño wrote: > Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. > > The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. > > There's no barrier added in the x86 C1 macro assembler because there is nothing to do there.
> > I've run the following tests: > * tier 1 on darwin/aarch64 > * tier 1 on linux/x86_64 > * `hotspot_compiler` tests on darwin/aarch64 > * `copy.clone.arrays` jcstress tests on darwin/aarch64. > > I tried but was unable to create a standalone test for the jdk source tree that would fail. > > FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. Forgot to say, this change shows no impact on the array clone micro benchmark. This is expected since the change moves the barrier from one place (after array creation) to another (after copying contents), so no additional barriers are introduced. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2146915736 From aph at openjdk.org Tue Jun 4 08:37:03 2024 From: aph at openjdk.org (Andrew Haley) Date: Tue, 4 Jun 2024 08:37:03 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 08:10:59 GMT, Galder Zamarreño wrote: > Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. > > The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. > > There's no barrier added on x86 c1 macro assembler for nothing to do there. > > I've run the following tests: > * tier 1 on darwin/aarch64 > * tier 1 on linux/x86_64 > * `hotspot_compiler` tests on darwin/aarch64 > * `copy.clone.arrays` jcstress tests on darwin/aarch64. > > I tried but was unable to create a standalone test for the jdk source tree that would fail. > > FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms.
src/hotspot/cpu/aarch64/c1_MacroAssembler_aarch64.cpp line 309: > 307: // Only add membar if zeroing the array. > 308: // If not zeroing, subsequent instructions should populate the array (e.g. copy contents), > 309: // and the membar should be set after the array has been populated accordingly. Strengthen the language: Suggestion: // Only add membar if zeroing the array. // If not zeroing, subsequent instructions must fully populate the array (e.g. copy contents), // and a membar must be emitted after the array has been populated accordingly. See // GraphBuilder::append_alloc_array_copy. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19538#discussion_r1625598080 From shade at openjdk.org Tue Jun 4 08:46:09 2024 From: shade at openjdk.org (Aleksey Shipilev) Date: Tue, 4 Jun 2024 08:46:09 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 08:10:59 GMT, Galder Zamarreño wrote: > Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. > > The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. > > There's no barrier added on x86 c1 macro assembler since there is nothing to do there. > > I've run the following tests: > * tier 1 on darwin/aarch64 > * tier 1 on linux/x86_64 > * `hotspot_compiler` tests on darwin/aarch64 > * `copy.clone.arrays` jcstress tests on darwin/aarch64. > > I tried but was unable to create a standalone test for the jdk source tree that would fail. > > FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. All right, that looks reasonable.
I am a bit queasy on removing the storestore barrier from the allocation path, given that it also protects the object metadata. An accidentally missing barrier would probably lead to VM crash, that is in the best case. Current code paths do not seem to be affected by this, but there is also no guardrails that would protect us from making such a mistake in the future: someone adds `new NewTypeArray` somewhere, and forgets a trailing barrier? What would be the cost of still emitting (an excess for cloning path) StoreStore in `C1_MacroAssembler::allocate_array`? Would that cost still matter, given the performance improvement we get with C1 clone intrinsic? ------------- Marked as reviewed by shade (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19538#pullrequestreview-2095739983 From aph at openjdk.org Tue Jun 4 09:03:17 2024 From: aph at openjdk.org (Andrew Haley) Date: Tue, 4 Jun 2024 09:03:17 GMT Subject: RFR: 8331658: secondary_super_cache does not scale well: C1 [v2] In-Reply-To: References: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> Message-ID: <133BPiIlb3ZWKJl4UXF8Y0tNcdCMCY7rFbAIc9JWRI0=.c0e364e6-725a-4ba2-b574-64713874ec8a@github.com> On Mon, 3 Jun 2024 19:41:49 GMT, Vladimir Ivanov wrote: > It's unfortunate to see C1-specific version of secondary supers table lookup. Why don't you reuse `MacroAssembler::lookup_secondary_supers_table` instead? It's a complicated tradeoff. In particular, `lookup_secondary_supers_table` needs to use fixed registers, and quite a lot of them. This patch is a version of the table lookup that uses as few registers as possible, and none of them are fixed. C1 has its own runtime logic partly because of the vagaries of its register allocation strategy, and while `MacroAssembler::lookup_secondary_supers_table` would work here, it would generate more code and work less well for a pure C1 system. 
> Also, in the context of C1, do performance benefits justify additional implementation complexity? As an alternative, migrating `MacroAssembler::check_klass_subtype_slow_path` away from linear search to a table lookup would also do the job and cover all cases of subtype checks in the JVM. Table lookup needs more registers than a simple linear scan. We'd either have to fix every caller to pass more scratch registers or push and pop some registers in the lookup routine. Given that the time to do a table lookup is in the ns range, pushing and popping a few registers might make it uncompetitive. I'll have a look. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19426#issuecomment-2146987902 From chagedorn at openjdk.org Tue Jun 4 09:09:04 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 09:09:04 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 12:39:05 GMT, Christian Hagedorn wrote: > A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. > > #### Idea of Partial Peeling > > Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). > > #### Partial Peeling with Unsigned Test > > However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. 
The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 > > #### Requirements for Using an Unsigned Test > > The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: > - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. > - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. > > #### The Requirements Are Currently Broken > > This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 > > #### Why Are the Requirements Broken? > > The reason is that > > i >=u limit > > can only be converted into the two signed comparisons > > i < 0 || i >= limit > > if `limit` is non-negative (i.e. `limit >= 0`): > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 > > This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. > > #### Fixing the Broken Requirements > To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of ... 
Thanks Vladimir for your review! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19522#issuecomment-2146997378 From aph at openjdk.org Tue Jun 4 09:13:10 2024 From: aph at openjdk.org (Andrew Haley) Date: Tue, 4 Jun 2024 09:13:10 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 08:43:04 GMT, Aleksey Shipilev wrote: > All right, that looks reasonable. > > I am a bit queasy on conditionally removing the `StoreStore` barrier from the allocation path, given that it also protects the object metadata. An accidentally missing barrier would probably lead to VM crash, that is in the best case. Current code paths do not seem to be affected by this, but there is also no guardrails that would protect us from making such a mistake in the future: someone adds `new NewTypeArray` somewhere, and forgets a trailing barrier? I agree with you. I suspect an additional `StoreStore` barrier after writing the header would make no difference in performance. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2147005682 From mbaesken at openjdk.org Tue Jun 4 09:33:02 2024 From: mbaesken at openjdk.org (Matthias Baesken) Date: Tue, 4 Jun 2024 09:33:02 GMT Subject: RFR: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset to null pointer In-Reply-To: References: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> Message-ID: On Mon, 3 Jun 2024 14:46:42 GMT, Martin Doerr wrote: > LGTM. Thanks! @MBaesken: Can you verify it, please? Worked with ubsan enabled (Linux x86_64 fastdebug build).
------------- PR Comment: https://git.openjdk.org/jdk/pull/19525#issuecomment-2147051199 From aph at openjdk.org Tue Jun 4 09:38:34 2024 From: aph at openjdk.org (Andrew Haley) Date: Tue, 4 Jun 2024 09:38:34 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v18] In-Reply-To: References: Message-ID: <0qa0AQuWoKkuZACRfwZlHvd1cb5yv5oWowLUoW6wh9A=.8bd30a1a-ac09-4253-989e-92fc29351742@github.com> > At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. > > In addition, it would be useful to be able to static_assert different registers. > > Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. > > I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. 
Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: Review feedback ------------- Changes: - all: https://git.openjdk.org/jdk/pull/16617/files - new: https://git.openjdk.org/jdk/pull/16617/files/eac03cc1..ddfda06e Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=17 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=16617&range=16-17 Stats: 2 lines in 1 file changed: 0 ins; 2 del; 0 mod Patch: https://git.openjdk.org/jdk/pull/16617.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/16617/head:pull/16617 PR: https://git.openjdk.org/jdk/pull/16617 From chagedorn at openjdk.org Tue Jun 4 09:50:15 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 09:50:15 GMT Subject: Integrated: 8333394: C2: assert(bol->is_Opaque4() || bol->is_OpaqueInitializedAssertionPredicate()) failed: Opaque node of non-null-check or of Initialized Assertion Predicate In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 08:29:18 GMT, Christian Hagedorn wrote: > [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and/or non-null-checks which both use `Opaque4` nodes. I've also improved the following bailout from `!= 2` to `< 2` since the original comment suggested that if there is a test without 2 inputs, it must be a dead test (i.e. I assumed a `ConNode`): > > https://github.com/openjdk/jdk/blob/91101f0d4fc8e06d0d74e06361db6ac87efeeb8e/src/hotspot/share/opto/loopTransform.cpp#L1204-L1206 > > But apparently, we could also have the following, also dead, `If` with a condition with 3 inputs (`505 Phi`) created by `split_thru_phi`: > > ![image](https://github.com/openjdk/jdk/assets/17833009/003ad32a-a675-49f2-a263-7ca28d1faf82) > > I therefore revert the check back to `!= 2` and improved the comment. > > Thanks, > Christian This pull request has now been integrated. 
Changeset: 64bbae75 Author: Christian Hagedorn URL: https://git.openjdk.org/jdk/commit/64bbae75121ccf80c02a0960e2db62eb558052e6 Stats: 61 lines in 2 files changed: 58 ins; 0 del; 3 mod 8333394: C2: assert(bol->is_Opaque4() || bol->is_OpaqueInitializedAssertionPredicate()) failed: Opaque node of non-null-check or of Initialized Assertion Predicate Reviewed-by: thartmann, roland ------------- PR: https://git.openjdk.org/jdk/pull/19517 From thartmann at openjdk.org Tue Jun 4 09:50:08 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Tue, 4 Jun 2024 09:50:08 GMT Subject: RFR: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset to null pointer In-Reply-To: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> References: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> Message-ID: On Mon, 3 Jun 2024 14:39:28 GMT, Vladimir Kozlov wrote: > Avoid iterations on empty relocation info. Found by running `ubsan`. > > Tested tier1-4, stress, xcomp. Looks good to me too. ------------- Marked as reviewed by thartmann (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19525#pullrequestreview-2095891457 From chagedorn at openjdk.org Tue Jun 4 10:05:05 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 10:05:05 GMT Subject: RFR: 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state In-Reply-To: References: Message-ID: <2zoIQcdWfUdk473fBjUa0-djovTM-B8zIAsb4JSTstw=.5b6eb920-51a2-4bf6-b30c-92a92707dd7c@github.com> On Mon, 3 Jun 2024 11:57:58 GMT, Tobias Hartmann wrote: > Ubsan detected undefined behavior in `ciTypeFlow::get_start_state` because an invalid value of `4294967295` is assigned to enum `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L150-L152 > > The problem is that if the C++ compiler decides to encode `Cell` with an unsigned int, casting a negative integer value will lead to an underflow and therefore a value > `Cell_max = INT_MAX`. Here, `state->tos()` returns a value < 0: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.cpp#L407 > > which is casted to a `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L211 > > I simply re-wrote the code to not require a negative `Cell` value to iterate over the locals and setting them to bottom type. > > Thanks, > Tobias src/hotspot/share/ci/ciTypeFlow.cpp line 408: > 406: // Set the rest of the locals to bottom. > 407: while (state->stack_size() != 0) { > 408: state->push(state->bottom_type()); It might be more clear if the condition is `< 0` since we only enter the loop if the stack size is negative. Maybe we could also assert that the stack size is `<= 0` before the loop? Otherwise, nice cleanup! 
Suggestion: assert(state->stack_size() <= 0, "stack size should not be strictly positive"); while (state->stack_size() < 0) { state->push(state->bottom_type()); ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19520#discussion_r1625724417 From thartmann at openjdk.org Tue Jun 4 10:17:14 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Tue, 4 Jun 2024 10:17:14 GMT Subject: RFR: 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state [v2] In-Reply-To: References: Message-ID: > Ubsan detected undefined behavior in `ciTypeFlow::get_start_state` because an invalid value of `4294967295` is assigned to enum `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L150-L152 > > The problem is that if the C++ compiler decides to encode `Cell` with an unsigned int, casting a negative integer value will lead to an underflow and therefore a value > `Cell_max = INT_MAX`. Here, `state->tos()` returns a value < 0: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.cpp#L407 > > which is casted to a `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L211 > > I simply re-wrote the code to not require a negative `Cell` value to iterate over the locals and setting them to bottom type. 
> > Thanks, > Tobias Tobias Hartmann has updated the pull request incrementally with one additional commit since the last revision: Update src/hotspot/share/ci/ciTypeFlow.cpp Co-authored-by: Christian Hagedorn ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19520/files - new: https://git.openjdk.org/jdk/pull/19520/files/ead0e611..c4d45ca4 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19520&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19520&range=00-01 Stats: 2 lines in 1 file changed: 1 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19520.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19520/head:pull/19520 PR: https://git.openjdk.org/jdk/pull/19520 From thartmann at openjdk.org Tue Jun 4 10:17:14 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Tue, 4 Jun 2024 10:17:14 GMT Subject: RFR: 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 11:57:58 GMT, Tobias Hartmann wrote: > Ubsan detected undefined behavior in `ciTypeFlow::get_start_state` because an invalid value of `4294967295` is assigned to enum `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L150-L152 > > The problem is that if the C++ compiler decides to encode `Cell` with an unsigned int, casting a negative integer value will lead to an underflow and therefore a value > `Cell_max = INT_MAX`. Here, `state->tos()` returns a value < 0: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.cpp#L407 > > which is casted to a `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L211 > > I simply re-wrote the code to not require a negative `Cell` value to iterate over the locals and setting them to bottom type. > > Thanks, > Tobias Thanks for the review Christian. 
Your suggestion makes sense! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19520#issuecomment-2147153223 From chagedorn at openjdk.org Tue Jun 4 10:25:03 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 10:25:03 GMT Subject: RFR: 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state [v2] In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 10:17:14 GMT, Tobias Hartmann wrote: >> Ubsan detected undefined behavior in `ciTypeFlow::get_start_state` because an invalid value of `4294967295` is assigned to enum `Cell`: >> https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L150-L152 >> >> The problem is that if the C++ compiler decides to encode `Cell` with an unsigned int, casting a negative integer value will lead to an underflow and therefore a value > `Cell_max = INT_MAX`. Here, `state->tos()` returns a value < 0: >> https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.cpp#L407 >> >> which is casted to a `Cell`: >> https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L211 >> >> I simply re-wrote the code to not require a negative `Cell` value to iterate over the locals and setting them to bottom type. >> >> Thanks, >> Tobias > > Tobias Hartmann has updated the pull request incrementally with one additional commit since the last revision: > > Update src/hotspot/share/ci/ciTypeFlow.cpp > > Co-authored-by: Christian Hagedorn Looks good, thanks for the update! ------------- Marked as reviewed by chagedorn (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19520#pullrequestreview-2095970919 From thartmann at openjdk.org Tue Jun 4 10:29:02 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Tue, 4 Jun 2024 10:29:02 GMT Subject: RFR: 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state [v2] In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 10:17:14 GMT, Tobias Hartmann wrote: >> Ubsan detected undefined behavior in `ciTypeFlow::get_start_state` because an invalid value of `4294967295` is assigned to enum `Cell`: >> https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L150-L152 >> >> The problem is that if the C++ compiler decides to encode `Cell` with an unsigned int, casting a negative integer value will lead to an underflow and therefore a value > `Cell_max = INT_MAX`. Here, `state->tos()` returns a value < 0: >> https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.cpp#L407 >> >> which is casted to a `Cell`: >> https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L211 >> >> I simply re-wrote the code to not require a negative `Cell` value to iterate over the locals and setting them to bottom type. >> >> Thanks, >> Tobias > > Tobias Hartmann has updated the pull request incrementally with one additional commit since the last revision: > > Update src/hotspot/share/ci/ciTypeFlow.cpp > > Co-authored-by: Christian Hagedorn Thanks for the review! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19520#issuecomment-2147174859 From mbaesken at openjdk.org Tue Jun 4 12:05:10 2024 From: mbaesken at openjdk.org (Matthias Baesken) Date: Tue, 4 Jun 2024 12:05:10 GMT Subject: RFR: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset to null pointer In-Reply-To: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> References: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> Message-ID: On Mon, 3 Jun 2024 14:39:28 GMT, Vladimir Kozlov wrote: > Avoid iterations on empty relocation info. Found by running `ubsan`. > > Tested tier1-4, stress, xcomp. Marked as reviewed by mbaesken (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19525#pullrequestreview-2096180416 From aph at openjdk.org Tue Jun 4 12:53:15 2024 From: aph at openjdk.org (Andrew Haley) Date: Tue, 4 Jun 2024 12:53:15 GMT Subject: RFR: 8331658: secondary_super_cache does not scale well: C1 [v2] In-Reply-To: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> References: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> Message-ID: On Wed, 29 May 2024 09:32:41 GMT, Andrew Haley wrote: >> This is the C1 version of [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). >> >> The new logic in this PR is as simple as I can make it. It is a somewhat-simplified version of the C2 change in [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). In order to reduce risk I haven't touched the existing slow subtype stub. >> The register allocation logic in the existing code is pretty gnarly, and I have no desire to break anything at this point in the release cycle, so I have allocated just one register more than the existing code does. >> >> Performance is pretty good. 
Before and after: >> >> x64, AMD 2950X, 8 cores: >> >> >> Benchmark Mode Cnt Score Error Units >> SecondarySuperCacheHits.test avgt 5 0.959 ? 0.091 ns/op >> SecondarySuperCacheInterContention.test avgt 5 42.931 ? 6.951 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 42.397 ? 7.708 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 43.466 ? 8.238 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 74.660 ? 0.127 ns/op >> >> SecondarySuperCacheHits.test avgt 5 1.480 ? 0.077 ns/op >> SecondarySuperCacheInterContention.test avgt 5 1.461 ? 0.063 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 1.767 ? 0.078 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 1.155 ? 0.052 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 1.421 ? 0.002 ns/op >> >> AArch64, Mac M3, 8 cores: >> >> >> Benchmark Mode Cnt Score Error Units >> SecondarySuperCacheHits.test avgt 5 0.835 ? 0.021 ns/op >> SecondarySuperCacheInterContention.test avgt 5 74.078 ? 18.095 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 81.863 ? 42.492 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 66.293 ? 11.254 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 335.563 ? 6.171 ns/op >> >> SecondarySuperCacheHits.test avgt 5 1.212 ? 0.004 ns/op >> SecondarySuperCacheInterContention.test avgt 5 0.871 ? 0.002 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 0.626 ? 0.003 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 1.115 ? 0.006 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 0.696 ? 0.001 ns/op >> >> >> >> The first test, `SecondarySuperCacheHits`, showns a small regression. It's... > > Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: > > JDK-8331658: secondary_super_cache does not scale well: C1 Closing this for now. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19426#issuecomment-2147455869 From aph at openjdk.org Tue Jun 4 12:53:15 2024 From: aph at openjdk.org (Andrew Haley) Date: Tue, 4 Jun 2024 12:53:15 GMT Subject: Withdrawn: 8331658: secondary_super_cache does not scale well: C1 In-Reply-To: References: Message-ID: On Tue, 28 May 2024 13:22:41 GMT, Andrew Haley wrote: > This is the C1 version of [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). > > The new logic in this PR is as simple as I can make it. It is a somewhat-simplified version of the C2 change in [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). In order to reduce risk I haven't touched the existing slow subtype stub. > The register allocation logic in the existing code is pretty gnarly, and I have no desire to break anything at this point in the release cycle, so I have allocated just one register more than the existing code does. > > Performance is pretty good. Before and after: > > x64, AMD 2950X, 8 cores: > > > Benchmark Mode Cnt Score Error Units > SecondarySuperCacheHits.test avgt 5 0.959 ± 0.091 ns/op > SecondarySuperCacheInterContention.test avgt 5 42.931 ± 6.951 ns/op > SecondarySuperCacheInterContention.test:t1 avgt 5 42.397 ± 7.708 ns/op > SecondarySuperCacheInterContention.test:t2 avgt 5 43.466 ± 8.238 ns/op > SecondarySuperCacheIntraContention.test avgt 5 74.660 ± 0.127 ns/op > > SecondarySuperCacheHits.test avgt 5 1.480 ± 0.077 ns/op > SecondarySuperCacheInterContention.test avgt 5 1.461 ± 0.063 ns/op > SecondarySuperCacheInterContention.test:t1 avgt 5 1.767 ± 0.078 ns/op > SecondarySuperCacheInterContention.test:t2 avgt 5 1.155 ± 0.052 ns/op > SecondarySuperCacheIntraContention.test avgt 5 1.421 ± 0.002 ns/op > > AArch64, Mac M3, 8 cores: > > > Benchmark Mode Cnt Score Error Units > SecondarySuperCacheHits.test avgt 5 0.835 ± 0.021 ns/op > SecondarySuperCacheInterContention.test avgt 5 74.078 ±
18.095 ns/op > SecondarySuperCacheInterContention.test:t1 avgt 5 81.863 ± 42.492 ns/op > SecondarySuperCacheInterContention.test:t2 avgt 5 66.293 ± 11.254 ns/op > SecondarySuperCacheIntraContention.test avgt 5 335.563 ± 6.171 ns/op > > SecondarySuperCacheHits.test avgt 5 1.212 ± 0.004 ns/op > SecondarySuperCacheInterContention.test avgt 5 0.871 ± 0.002 ns/op > SecondarySuperCacheInterContention.test:t1 avgt 5 0.626 ± 0.003 ns/op > SecondarySuperCacheInterContention.test:t2 avgt 5 1.115 ± 0.006 ns/op > SecondarySuperCacheIntraContention.test avgt 5 0.696 ± 0.001 ns/op > > > > The first test, `SecondarySuperCacheHits`, shows a small regression. It's the "happy path" which simply checks the same subclass again and again in a loop, i... This pull request has been closed without being integrated. ------------- PR: https://git.openjdk.org/jdk/pull/19426 From syan at openjdk.org Tue Jun 4 12:58:11 2024 From: syan at openjdk.org (SendaoYan) Date: Tue, 4 Jun 2024 12:58:11 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 07:47:46 GMT, SendaoYan wrote: > Hi all, > This PR removes several extra empty spaces and extra empty lines in several Makefiles. It's a trivial fix, no risk. > > Thanks. > /label build Thanks. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19537#issuecomment-2147467980 From erikj at openjdk.org Tue Jun 4 13:01:12 2024 From: erikj at openjdk.org (Erik Joelsson) Date: Tue, 4 Jun 2024 13:01:12 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles In-Reply-To: References: Message-ID: <-zitWvnM2OMNoksPkobR-GY7ydETAQyBLwcrdcoiLWE=.c1e182c2-7b34-43e9-a96f-e3bf1b367f8f@github.com> On Tue, 4 Jun 2024 07:47:46 GMT, SendaoYan wrote: > Hi all, > This PR removes several extra empty spaces and extra empty lines in several Makefiles. It's a trivial fix, no risk. > > Thanks. Marked as reviewed by erikj (Reviewer).
------------- PR Review: https://git.openjdk.org/jdk/pull/19537#pullrequestreview-2096319123 From kbarrett at openjdk.org Tue Jun 4 13:53:11 2024 From: kbarrett at openjdk.org (Kim Barrett) Date: Tue, 4 Jun 2024 13:53:11 GMT Subject: RFR: 8319822: Use a linear-time algorithm for assert_different_registers() [v18] In-Reply-To: <0qa0AQuWoKkuZACRfwZlHvd1cb5yv5oWowLUoW6wh9A=.8bd30a1a-ac09-4253-989e-92fc29351742@github.com> References: <0qa0AQuWoKkuZACRfwZlHvd1cb5yv5oWowLUoW6wh9A=.8bd30a1a-ac09-4253-989e-92fc29351742@github.com> Message-ID: On Tue, 4 Jun 2024 09:38:34 GMT, Andrew Haley wrote: >> At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. >> >> In addition, it would be useful to be able to static_assert different registers. >> >> Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. >> >> I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. > > Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: > > Review feedback Still looks good. ------------- Marked as reviewed by kbarrett (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/16617#pullrequestreview-2096484025 From chagedorn at openjdk.org Tue Jun 4 14:49:10 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 14:49:10 GMT Subject: RFR: 8333226: Regressions 2-3% in Compress ZGC after 8331253 In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 17:59:06 GMT, Vladimir Kozlov wrote: > Revert [JDK-8331253](https://bugs.openjdk.org/browse/JDK-8331253) changes [#3383ad63](https://git.openjdk.org/jdk/commit/3383ad6397d5a2d8fb232ffd3e29a54e0b37b686) to avoid regression. 
> And convert `nmethod::_skipped_instructions_size field` field to `int` type to address original JDK-8331253 issue. > > Tested tier1-3,stress,xcomp and performance. > > Note: You may see some regressions, but performance returns to state before JDK-8331253. I may look again in a future on changes I did to calculated skipped instructions. Looks good to me. ------------- Marked as reviewed by chagedorn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19531#pullrequestreview-2096640491 From chagedorn at openjdk.org Tue Jun 4 14:52:33 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 14:52:33 GMT Subject: RFR: 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 10:43:11 GMT, kuaiwei wrote: > Some classes in nativeInst_aarch64.hpp are unused and can be removed. > > I checked with tier1 tests. Looks good to me, too. ------------- Marked as reviewed by chagedorn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19518#pullrequestreview-2096649119 From chagedorn at openjdk.org Tue Jun 4 15:03:27 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 15:03:27 GMT Subject: RFR: 8332499: Gtest codestrings.validate_vm fail on linux x64 when hsdis is present [v5] In-Reply-To: References: Message-ID: <7JKIWstePLoPjwZWHet9HZ3TPVwMj2ENcR4t8h6wORk=.896c75ce-903c-475d-8001-ee27fb885db8@github.com> On Tue, 28 May 2024 15:47:25 GMT, SendaoYan wrote: >> Hi all, >> There's some arch-specific code to trim trailing entries as descripted in [JDK-8332499](https://bugs.openjdk.org/browse/JDK-8332499). Only change the gtest testcase, the risk is low. 
>> >> On linux x86_64, before this PR, after deal with `std::regex_replace(tmp4, std::regex("\\s+:\\s+hlt[ \\t]+(?!\\n\\s+;;)"), "")`, the output differents because the first output has trailing empty spaces, show as below: >> >> - : nop >> + : nop >> >> So we need to delete the empty spaces after `: nop` use `std::regex_replace(tmp5, std::regex("(\\s+:\\s+nop)[ \\t]*"), "$1")` >> >> >> Additional test: >> - [x] codestrings.validate_vm on linux x64 >> - [x] codestrings.validate_vm on linux aarch64 >> - [x] codestrings.validate_vm on linux riscv64 > > SendaoYan has updated the pull request incrementally with two additional commits since the last revision: > > - Merge branch 'jbs8332499' of github.com:sendaoYan/jdk-ysd into jbs8332499 > - delete the empty spaces after : nop Looks good to me, too. ------------- Marked as reviewed by chagedorn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19309#pullrequestreview-2096681027 From kvn at openjdk.org Tue Jun 4 15:39:19 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 15:39:19 GMT Subject: RFR: 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes In-Reply-To: References: Message-ID: On Fri, 31 May 2024 12:33:04 GMT, Christian Hagedorn wrote: > [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and not non-null-checks which both use `Opaque4` nodes. > > #### Correct Assertion > One of this assert was now hit with a fuzzer found case in `get_assertion_predicates()` called during the elimination of useless predicates. We walk through all loops and collect all useful Template Assertion Predicates and Parse Predicates above the loops. For that we look at the UCTs which are shared among the predicates. When finding a predicate with such an UCT which also has an `Opaque4` node, we know that it is a Template Assertion Predicate. 
We additionally assert that we must find the `OpaqueLoop*Nodes` above which always belong to a template: > > https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L346-L354 > > So, this assert looks correct. > > #### Why didn't we find `OpaqueLoop*Nodes` in this case? > For the Template Assertion Predicate for the last value, we insert an additional `CastII` to keep the type information of the iv phi: > > https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L1323-L1324 > > But in the test case, the type of the iv phi is a constant (`521 CastII`): > > ![image](https://github.com/openjdk/jdk/assets/17833009/5dc17b9c-abfe-4846-89a1-4e189234b991) > > `521 CastII` will simply be replaced with a constant during IGVN and the `OpaqueLoop*Nodes` above are removed. We therefore cannot find them anymore later when trying to eliminate useless predicates and we hit the assert. > > #### Why does the `CastII`/iv phi have a constant type? > Having a constant type for the iv phi indicates that the counted loop is only going to be executed for one iteration. But C2 has not had the chance, yet, to fold the loop exit test to remove the loop. > > #### How to fix this bug? > Having a single iteration loop raises the question, why we even bother to try and hoist checks out of such a loop with Loop Predication in the first place. I therefore suggest to simply bail out of Loop Predication if the trip count is 1. This will also prevent us from creating a Template Assertion Predicate with a `CastII` with a constant type from the iv phi which would be folded. > > To do that, we can compute the trip count on entry of Loop Predication. By doing that, we can also remove the trip count computation added for hoisting ran... Looks good. src/hotspot/share/opto/loopPredicate.cpp line 1373: > 1371: // Avoid RCE if Counted loop's test is '!='. 
> 1372: BoolTest::mask bt = cl->loopexit()->test_trip(); > 1373: if (bt != BoolTest::lt && bt != BoolTest::gt) Since you touching this code can you add `{}` for this condition? ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19500#pullrequestreview-2096777095 PR Review Comment: https://git.openjdk.org/jdk/pull/19500#discussion_r1626232817 From chagedorn at openjdk.org Tue Jun 4 15:58:39 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 15:58:39 GMT Subject: RFR: 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes [v2] In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 15:34:46 GMT, Vladimir Kozlov wrote: >> Christian Hagedorn has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains three additional commits since the last revision: >> >> - Add braces >> - Merge branch 'refs/heads/master' into JDK-8333252 >> - 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes > > src/hotspot/share/opto/loopPredicate.cpp line 1373: > >> 1371: // Avoid RCE if Counted loop's test is '!='. >> 1372: BoolTest::mask bt = cl->loopexit()->test_trip(); >> 1373: if (bt != BoolTest::lt && bt != BoolTest::gt) > > Since you touching this code can you add `{}` for this condition? Thanks Vladimir for your review! Yes, absolutely, I've pushed an update. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19500#discussion_r1626259620 From chagedorn at openjdk.org Tue Jun 4 15:58:39 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 15:58:39 GMT Subject: RFR: 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes [v2] In-Reply-To: References: Message-ID: > [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and not non-null-checks which both use `Opaque4` nodes. > > #### Correct Assertion > One of this assert was now hit with a fuzzer found case in `get_assertion_predicates()` called during the elimination of useless predicates. We walk through all loops and collect all useful Template Assertion Predicates and Parse Predicates above the loops. For that we look at the UCTs which are shared among the predicates. When finding a predicate with such an UCT which also has an `Opaque4` node, we know that it is a Template Assertion Predicate. We additionally assert that we must find the `OpaqueLoop*Nodes` above which always belong to a template: > > https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L346-L354 > > So, this assert looks correct. > > #### Why didn't we find `OpaqueLoop*Nodes` in this case? > For the Template Assertion Predicate for the last value, we insert an additional `CastII` to keep the type information of the iv phi: > > https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L1323-L1324 > > But in the test case, the type of the iv phi is a constant (`521 CastII`): > > ![image](https://github.com/openjdk/jdk/assets/17833009/5dc17b9c-abfe-4846-89a1-4e189234b991) > > `521 CastII` will simply be replaced with a constant during IGVN and the `OpaqueLoop*Nodes` above are removed. 
We therefore cannot find them anymore later when trying to eliminate useless predicates and we hit the assert. > > #### Why does the `CastII`/iv phi have a constant type? > Having a constant type for the iv phi indicates that the counted loop is only going to be executed for one iteration. But C2 has not had the chance, yet, to fold the loop exit test to remove the loop. > > #### How to fix this bug? > Having a single iteration loop raises the question, why we even bother to try and hoist checks out of such a loop with Loop Predication in the first place. I therefore suggest to simply bail out of Loop Predication if the trip count is 1. This will also prevent us from creating a Template Assertion Predicate with a `CastII` with a constant type from the iv phi which would be folded. > > To do that, we can compute the trip count on entry of Loop Predication. By doing that, we can also remove the trip count computation added for hoisting ran... Christian Hagedorn has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. 
The pull request contains three additional commits since the last revision: - Add braces - Merge branch 'refs/heads/master' into JDK-8333252 - 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19500/files - new: https://git.openjdk.org/jdk/pull/19500/files/c54ba3cc..fdf7ebc1 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19500&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19500&range=00-01 Stats: 13906 lines in 395 files changed: 8408 ins; 3833 del; 1665 mod Patch: https://git.openjdk.org/jdk/pull/19500.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19500/head:pull/19500 PR: https://git.openjdk.org/jdk/pull/19500 From kvn at openjdk.org Tue Jun 4 16:00:27 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 16:00:27 GMT Subject: RFR: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store [v4] In-Reply-To: References: Message-ID: On Thu, 16 May 2024 01:39:26 GMT, Richard Reingruber wrote: >> This pr adds a few tweaks to [JDK-8318446](https://bugs.openjdk.org/browse/JDK-8318446) which allows enabling it also on big endian platforms (e.g. AIX, S390). JDK-8318446 introduced a C2 optimization to replace consecutive stores to a primitive array with just one store. 
>> >> By example (from `TestMergeStores.java`): >> >> >> static Object[] test2a(byte[] a, int offset, long v) { >> if (IS_BIG_ENDIAN) { >> a[offset + 0] = (byte)(v >> 56); >> a[offset + 1] = (byte)(v >> 48); >> a[offset + 2] = (byte)(v >> 40); >> a[offset + 3] = (byte)(v >> 32); >> a[offset + 4] = (byte)(v >> 24); >> a[offset + 5] = (byte)(v >> 16); >> a[offset + 6] = (byte)(v >> 8); >> a[offset + 7] = (byte)(v >> 0); >> } else { >> a[offset + 0] = (byte)(v >> 0); >> a[offset + 1] = (byte)(v >> 8); >> a[offset + 2] = (byte)(v >> 16); >> a[offset + 3] = (byte)(v >> 24); >> a[offset + 4] = (byte)(v >> 32); >> a[offset + 5] = (byte)(v >> 40); >> a[offset + 6] = (byte)(v >> 48); >> a[offset + 7] = (byte)(v >> 56); >> } >> return new Object[]{ a }; >> } >> >> >> Depending on the endianess 8 bytes are stored into an array. The order of the stores is the same as the order of an 8-byte-store therefore 8 1-byte-stores can be replaced with just one 8-byte-store (if there aren't too many range checks). >> >> Additionally I've fixed a few comments and a test bug. >> >> The optimization seems to be a little bit more effective on big endian platforms. >> >> Again by example: >> >> >> static Object[] test800a(byte[] a, int offset, long v) { >> if (IS_BIG_ENDIAN) { >> a[offset + 0] = (byte)(v >> 40); // Removed from candidate list >> a[offset + 1] = (byte)(v >> 32); // Removed from candidate list >> a[offset + 2] = (byte)(v >> 24); // Merged >> a[offset + 3] = (byte)(v >> 16); // Merged >> a[offset + 4] = (byte)(v >> 8); // Merged >> a[offset + 5] = (byte)(v >> 0); // Merged >> } else { >> a[offset + 0] = (byte)(v >> 0); // Removed from candidate list >> a[offset + 1] = (byte)(v >> 8); // Removed from candidate list >> a[offset + 2] = (byte)(v >> 16); // Not merged >> a[offset + 3] = (byte)(v >> 24); // Not merged >> a[offset + 4] = (byte)(v >> 32); // Not merge... 
> > Richard Reingruber has updated the pull request incrementally with one additional commit since the last revision: > > Eliminate IS_BIG_ENDIAN and always execute both variants Good. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19218#pullrequestreview-2096824986 From kvn at openjdk.org Tue Jun 4 16:10:32 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 16:10:32 GMT Subject: RFR: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset to null pointer In-Reply-To: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> References: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> Message-ID: On Mon, 3 Jun 2024 14:39:28 GMT, Vladimir Kozlov wrote: > Avoid iterations on empty relocation info. Found by running `ubsan`. > > Tested tier1-4, stress, xcomp. Thank you for reviews. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19525#issuecomment-2147792467 From chagedorn at openjdk.org Tue Jun 4 16:11:03 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 4 Jun 2024 16:11:03 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 07:47:46 GMT, SendaoYan wrote: > Hi all, > This PR several extra empty spaces and extra empty lines in several Makefiles. It's trivial fix, no risk. > > Thanks. Looks good! Somehow the integrate command did not work. ------------- Marked as reviewed by chagedorn (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19537#pullrequestreview-2096615859 From syan at openjdk.org Tue Jun 4 16:11:05 2024 From: syan at openjdk.org (SendaoYan) Date: Tue, 4 Jun 2024 16:11:05 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 07:47:46 GMT, SendaoYan wrote: > Hi all, > This PR deletes several extra empty spaces and extra empty lines in several Makefiles. It's a trivial fix, no risk. > > Thanks. Thanks for the review. Thanks all for the review. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19537#issuecomment-2147523325 PR Comment: https://git.openjdk.org/jdk/pull/19537#issuecomment-2147711173 From syan at openjdk.org Tue Jun 4 16:12:56 2024 From: syan at openjdk.org (SendaoYan) Date: Tue, 4 Jun 2024 16:12:56 GMT Subject: RFR: 8332499: Gtest codestrings.validate_vm fail on linux x64 when hsdis is present [v5] In-Reply-To: References: Message-ID: On Tue, 28 May 2024 15:47:25 GMT, SendaoYan wrote: >> Hi all, >> There's some arch-specific code to trim trailing entries as described in [JDK-8332499](https://bugs.openjdk.org/browse/JDK-8332499). Only change the gtest testcase, the risk is low. 
>> >> On linux x86_64, before this PR, after deal with `std::regex_replace(tmp4, std::regex("\\s+:\\s+hlt[ \\t]+(?!\\n\\s+;;)"), "")`, the output differents because the first output has trailing empty spaces, show as below: >> >> - : nop >> + : nop >> >> So we need to delete the empty spaces after `: nop` use `std::regex_replace(tmp5, std::regex("(\\s+:\\s+nop)[ \\t]*"), "$1")` >> >> >> Additional test: >> - [x] codestrings.validate_vm on linux x64 >> - [x] codestrings.validate_vm on linux aarch64 >> - [x] codestrings.validate_vm on linux riscv64 > > SendaoYan has updated the pull request incrementally with two additional commits since the last revision: > > - Merge branch 'jbs8332499' of github.com:sendaoYan/jdk-ysd into jbs8332499 > - delete the empty spaces after : nop Thanks all for the review. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19309#issuecomment-2147770415 From epeter at openjdk.org Tue Jun 4 16:13:41 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 4 Jun 2024 16:13:41 GMT Subject: RFR: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store [v4] In-Reply-To: References: <7W-vxm7KC8qwd-GJAPh4TCtDhOzw7X5-gXanLudP27Y=.807f809f-92ce-498f-94c4-49b0405bbb6f@github.com> Message-ID: On Fri, 24 May 2024 15:05:32 GMT, Richard Reingruber wrote: >> Test error is unrelated to the changes. Upload of test results failed: >> `Error: Failed to CreateArtifact: Failed to make request after 5 attempts: Request timeout: /twirp/github.actions.results.api.v1.ArtifactService/CreateArtifact` > >> @reinrich please ping me again to ask if testing is ok before you integrate ;) > > Thanks for picking this up again. I quickly wanted to let you know that I'm out of office. I will be back in a week. 
@reinrich please still wait until the JDK24 fork on Thursday to integrate, so that we do not have to backport possible regression fixes - I had 3 or 4 with my original patch ;) ------------- PR Comment: https://git.openjdk.org/jdk/pull/19218#issuecomment-2147899475 From kvn at openjdk.org Tue Jun 4 16:17:00 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 16:17:00 GMT Subject: RFR: 8325155: C2 SuperWord: remove alignment boundaries [v6] In-Reply-To: References: Message-ID: <_txFWrn_nEyBZEFh0YRG5YF9dDlf2JAWIi6gsyW5hjI=.475bd8c2-c494-4a0d-b994-a61897459795@github.com> On Wed, 29 May 2024 07:16:44 GMT, Emanuel Peter wrote: >> I have tried for a very long time to get rid of all the `alignment(n)` code that is all over the SuperWord code. With lots of previous work, I am now finally ready to remove it. >> >> I was able to remove lots of VM code, about 300 lines. And the removed code is I think much more complicated than the new code. >> >> This is what I did in this PR: >> - Removal of `_node_info`: used to have many fields, which I refactored out to the `VLoopAnalyzer` modules. `alignment` is the last component, which I now remove. >> - Changed the implementation of `SuperWord::find_adjacent_refs`, now `SuperWord::find_adjacent_memop_pairs`, completely: >> - It used to be an algorithm that would scan over all `memops` repeatedly, try to find some `mem_ref` and see which other memops were comparable, and then pack pairs for all of those, by comparing all-vs-all memops. This algorithm is at least quadratic, if not much worse. >> - I now add all `memops` into a single array, sort them by groups (those that are comparable with each other and could be packed into vectors), and inside the groups by ascending offset. This allows me to split off the groups much more efficiently, and also the sorting by offset allows me to find adjacent pairs much more efficiently. 
In the most cases this reduces the cost to `O(n log n)` for sort, and a linear scan for finding adjacent memops. >> - I removed the "alignment boundaries" created in `SuperWord::memory_alignment` by `int off_rem = offset % vw;`. >> - This used to have the effect that all offsets were computed modulo the vector width. Hence, pairs could not be packed across this boundary (e.g. we have nodes with offsets `31, 32`, which are adjacent in theory, but if we have a `vw = 32`, then the modulo-offsets are `31, 0`, and they are not detected as adjacent). >> - These "alignment boundaries" used to be required for correctness about a year ago, before I fixed and relaxed much of the alignment code. >> - The `alignment` used to have another important task: Ensuring compatibility of the input-size of a use node, with the output-size of the def-node. >> - This was done by giving all nodes an `alignment`, even the non-memop nodes. This `alignment` was then scaled up and down at type casts (e.g. int `0, 4, 8, 12` -> long `0, 8, 16, 24`). If the output-size of the def-node did not match the input-size of the use-node, then the `alignment` would not match up, and we would not pack. >> - This is why we used to have checks like `alignment(s1) + da... > > Emanuel Peter has updated the pull request incrementally with one additional commit since the last revision: > > Update src/hotspot/share/opto/superword.cpp > > Co-authored-by: Christian Hagedorn Approved. Thank you for running performance testing. ------------- Marked as reviewed by kvn (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/18822#pullrequestreview-2096811156 PR Comment: https://git.openjdk.org/jdk/pull/18822#issuecomment-2147874256 From epeter at openjdk.org Tue Jun 4 16:17:00 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 4 Jun 2024 16:17:00 GMT Subject: RFR: 8325155: C2 SuperWord: remove alignment boundaries [v6] In-Reply-To: <_txFWrn_nEyBZEFh0YRG5YF9dDlf2JAWIi6gsyW5hjI=.475bd8c2-c494-4a0d-b994-a61897459795@github.com> References: <_txFWrn_nEyBZEFh0YRG5YF9dDlf2JAWIi6gsyW5hjI=.475bd8c2-c494-4a0d-b994-a61897459795@github.com> Message-ID: On Tue, 4 Jun 2024 15:51:23 GMT, Vladimir Kozlov wrote: >> Emanuel Peter has updated the pull request incrementally with one additional commit since the last revision: >> >> Update src/hotspot/share/opto/superword.cpp >> >> Co-authored-by: Christian Hagedorn > > Thank you for running performance testing. @vnkozlov thanks for the review! I will integrate as soon as the JDK24 fork happens ;) ------------- PR Comment: https://git.openjdk.org/jdk/pull/18822#issuecomment-2147882546 From thartmann at openjdk.org Tue Jun 4 16:32:29 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Tue, 4 Jun 2024 16:32:29 GMT Subject: RFR: 8333226: Regressions 2-3% in Compress ZGC after 8331253 In-Reply-To: References: Message-ID: <8CJdkP4CPy1ZYaBYp3308cDNPsl4F2RrVL4DeFF_Clw=.a1c497e8-4077-4177-98e7-8295f763fe38@github.com> On Mon, 3 Jun 2024 17:59:06 GMT, Vladimir Kozlov wrote: > Revert [JDK-8331253](https://bugs.openjdk.org/browse/JDK-8331253) changes [#3383ad63](https://git.openjdk.org/jdk/commit/3383ad6397d5a2d8fb232ffd3e29a54e0b37b686) to avoid regression. > And convert `nmethod::_skipped_instructions_size field` field to `int` type to address original JDK-8331253 issue. > > Tested tier1-3,stress,xcomp and performance. > > Note: You may see some regressions, but performance returns to state before JDK-8331253. I may look again in a future on changes I did to calculated skipped instructions. 
Looks good to me too. > I may look again in a future on changes I did to calculated skipped instructions. Do you plan to file an RFE for that? ------------- Marked as reviewed by thartmann (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19531#pullrequestreview-2096854523 From kvn at openjdk.org Tue Jun 4 16:32:29 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 16:32:29 GMT Subject: RFR: 8333226: Regressions 2-3% in Compress ZGC after 8331253 In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 17:59:06 GMT, Vladimir Kozlov wrote: > Revert [JDK-8331253](https://bugs.openjdk.org/browse/JDK-8331253) changes [#3383ad63](https://git.openjdk.org/jdk/commit/3383ad6397d5a2d8fb232ffd3e29a54e0b37b686) to avoid regression. > And convert `nmethod::_skipped_instructions_size field` field to `int` type to address original JDK-8331253 issue. > > Tested tier1-3,stress,xcomp and performance. > > Note: You may see some regressions, but performance returns to state before JDK-8331253. I may look again in a future on changes I did to calculated skipped instructions. Thank you, Christian, for review. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19531#issuecomment-2147794720 From kvn at openjdk.org Tue Jun 4 16:32:29 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 16:32:29 GMT Subject: RFR: 8333226: Regressions 2-3% in Compress ZGC after 8331253 In-Reply-To: <8CJdkP4CPy1ZYaBYp3308cDNPsl4F2RrVL4DeFF_Clw=.a1c497e8-4077-4177-98e7-8295f763fe38@github.com> References: <8CJdkP4CPy1ZYaBYp3308cDNPsl4F2RrVL4DeFF_Clw=.a1c497e8-4077-4177-98e7-8295f763fe38@github.com> Message-ID: <_4FZdRWj-0ovlt6eGZHF7H1Pdg2f2WXxZ9G25ok2JF8=.036d76b2-eb73-4682-9aa0-73cf539ecd81@github.com> On Tue, 4 Jun 2024 16:10:38 GMT, Tobias Hartmann wrote: > Looks good to me too. > > > I may look again in a future on changes I did to calculated skipped instructions. > > Do you plan to file an RFE for that? 
https://bugs.openjdk.org/browse/JDK-8333546 ------------- PR Comment: https://git.openjdk.org/jdk/pull/19531#issuecomment-2147925942 From mdoerr at openjdk.org Tue Jun 4 16:33:04 2024 From: mdoerr at openjdk.org (Martin Doerr) Date: Tue, 4 Jun 2024 16:33:04 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: References: Message-ID: <7Y5Ia2fiMqXWnutjkQCxPhiPKOJrSKYXjmGveP7L3gM=.38c1b7f0-29fd-4428-87b7-95fd7aab5cff@github.com> On Tue, 4 Jun 2024 08:10:59 GMT, Galder Zamarreño wrote: > Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op on platforms where it is not needed, so there is no need for an ifdef. > > The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. > > No barrier is added in the x86 c1 macro assembler because there is nothing to do there. > > I've run the following tests: > * tier 1 on darwin/aarch64 > * tier 1 on linux/x86_64 > * `hotspot_compiler` tests on darwin/aarch64 > * `copy.clone.arrays` jcstress tests on darwin/aarch64. > > I tried but was unable to create a standalone test for the jdk source tree that would fail. > > FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. PPC64 adds storestore barriers in LIR (not in assembler as aarch64): https://github.com/openjdk/jdk/blob/8d3de45f4dfd60dc4e2f210cb0c085fcf6efb8e2/src/hotspot/cpu/ppc/c1_LIRGenerator_ppc.cpp#L928 I think this fits nicely to your change. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2147920726 From jbhateja at openjdk.org Tue Jun 4 16:41:12 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Tue, 4 Jun 2024 16:41:12 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v2] In-Reply-To: References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: <9XaWVoWrxcT71wJgXaaMqYDnSwdMSFCAqb5jJuqK03k=.2f0a9261-d241-4e0f-ac76-60deab84c839@github.com> On Sun, 2 Jun 2024 15:43:39 GMT, Jatin Bhateja wrote: >> Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. >> >> This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin >> >> PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 >> [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review Comments Incorporated. Hi @TobiHartmann , @vnkozlov , please let me know if it's good to integrate. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19442#issuecomment-2147966860 From kvn at openjdk.org Tue Jun 4 16:41:58 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 16:41:58 GMT Subject: Integrated: 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset to null pointer In-Reply-To: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> References: <0T3s29DTUHvExVdfSihCGxxv6cl5ECZEjKOOYPOa_Sg=.321f1cdf-fba6-4520-af0c-cf2fd1d5813f@github.com> Message-ID: <-2RPxHQzW_v-yY-9ak24JavOolHLXI7BTEbd723QeeE=.c0c521d8-7b7e-4508-9d6e-d6dec47618de@github.com> On Mon, 3 Jun 2024 14:39:28 GMT, Vladimir Kozlov wrote: > Avoid iterations on empty relocation info. Found by running `ubsan`. > > Tested tier1-4, stress, xcomp. This pull request has now been integrated. Changeset: 664c993c Author: Vladimir Kozlov URL: https://git.openjdk.org/jdk/commit/664c993c41753843293388a6ff1481a94a5b4c22 Stats: 8 lines in 2 files changed: 1 ins; 4 del; 3 mod 8331731: ubsan: relocInfo.cpp:155:30: runtime error: applying non-zero offset to null pointer Co-authored-by: Axel Boldt-Christmas Co-authored-by: Dean Long Reviewed-by: mdoerr, thartmann, mbaesken ------------- PR: https://git.openjdk.org/jdk/pull/19525 From kvn at openjdk.org Tue Jun 4 16:52:06 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 16:52:06 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v2] In-Reply-To: References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: On Sun, 2 Jun 2024 15:43:39 GMT, Jatin Bhateja wrote: >> Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. 
>> >> This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin >> >> PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 >> [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review Comments Incorporated. Please, wait our review and testing. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19442#issuecomment-2147985278 From kvn at openjdk.org Tue Jun 4 16:54:56 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 16:54:56 GMT Subject: RFR: 8333226: Regressions 2-3% in Compress ZGC after 8331253 In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 17:59:06 GMT, Vladimir Kozlov wrote: > Revert [JDK-8331253](https://bugs.openjdk.org/browse/JDK-8331253) changes [#3383ad63](https://git.openjdk.org/jdk/commit/3383ad6397d5a2d8fb232ffd3e29a54e0b37b686) to avoid regression. > And convert `nmethod::_skipped_instructions_size field` field to `int` type to address original JDK-8331253 issue. > > Tested tier1-3,stress,xcomp and performance. > > Note: You may see some regressions, but performance returns to state before JDK-8331253. I may look again in a future on changes I did to calculated skipped instructions. Thank you for reviews. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19531#issuecomment-2147989663 From kvn at openjdk.org Tue Jun 4 17:01:02 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 17:01:02 GMT Subject: Integrated: 8333226: Regressions 2-3% in Compress ZGC after 8331253 In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 17:59:06 GMT, Vladimir Kozlov wrote: > Revert [JDK-8331253](https://bugs.openjdk.org/browse/JDK-8331253) changes [#3383ad63](https://git.openjdk.org/jdk/commit/3383ad6397d5a2d8fb232ffd3e29a54e0b37b686) to avoid regression. > And convert `nmethod::_skipped_instructions_size field` field to `int` type to address original JDK-8331253 issue. > > Tested tier1-3,stress,xcomp and performance. > > Note: You may see some regressions, but performance returns to state before JDK-8331253. I may look again in a future on changes I did to calculated skipped instructions. This pull request has now been integrated. Changeset: dce97031 Author: Vladimir Kozlov URL: https://git.openjdk.org/jdk/commit/dce97031555dcf689fecda16e444e7e8e9d5b270 Stats: 43 lines in 9 files changed: 6 ins; 24 del; 13 mod 8333226: Regressions 2-3% in Compress ZGC after 8331253 Reviewed-by: chagedorn, thartmann ------------- PR: https://git.openjdk.org/jdk/pull/19531 From dfenacci at openjdk.org Tue Jun 4 17:53:58 2024 From: dfenacci at openjdk.org (Damon Fenacci) Date: Tue, 4 Jun 2024 17:53:58 GMT Subject: RFR: 8333248: VectorGatherMaskFoldingTest.java failed when maximum vector bits is 64 In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 06:50:18 GMT, Gui Cao wrote: > @dafedafe Hi Damon, Maybe you can take a look at this small change? Thanks. @zifeihan you're right that the case with a vector length of 64 bits is not covered correctly. I wouldn't fix it using different species depending on the length though. The Java Vector API explicitly says that not all shapes are supported by all platforms, and one should choose shape-agnostic code. 
Also, the IR framework uses the maximum vector size possible (if not explicitly set in the rules). I'd rather make sure that masks are not the same for lengths of 1 (for long and double vectors): something like this `longMask[i] = L_SPECIES.length() > 1 && i % 2 == 0;` would probably do. For long and double indices this cannot be done with length 1. So an option would be to change the asserts to add a check for the length, e.g. `Asserts.assertFalse(L_SPECIES.length() != 1 && res.equals(res2));`. There is always the alternative of not running long and double vector tests with max vector lengths of 64. Regarding `-XX:+IncrementalInlineForceCleanup`: this has been added because of the issue described in [JDK-8302459](https://bugs.openjdk.org/browse/JDK-8302459) that sometimes prevents C2 to properly vectorise (it should be removed when this issue is fixed). Having it there shouldn't really make any test fail though. If this is the case with `aarch64` it might mean that there is another issue somewhere else. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19473#issuecomment-2148091539 From kvn at openjdk.org Tue Jun 4 17:56:12 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 4 Jun 2024 17:56:12 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v2] In-Reply-To: References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: <12JZgqLnIqOxqbmQ4sXDJT_CUM-nC0j4QRFanyaDD4M=.ea2845d6-461b-43f9-931e-8c4342a32c98@github.com> On Sun, 2 Jun 2024 15:43:39 GMT, Jatin Bhateja wrote: >> Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. >> >> This patch extends the conversion inline expander to match the fall back implementation. 
This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin >> >> PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 >> [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review Comments Incorporated. I have some comments. src/hotspot/share/opto/vectorIntrinsics.cpp line 514: > 512: } > 513: > 514: Node* LibraryCallKit::partially_wrap_indexes(Node* index_vec, int num_elem, BasicType elem_bt) { Can you add a comment with pseudo code to show what this method does? src/hotspot/share/opto/vectorIntrinsics.cpp line 517: > 515: assert(elem_bt == T_BYTE, ""); > 516: const TypeVect * vt = TypeVect::make(elem_bt, num_elem); > 517: const Type * type_bt = Type::get_const_basic_type(elem_bt); Please remove the space between a type and `*`. src/hotspot/share/opto/vectorIntrinsics.cpp line 622: > 620: res = gvn().transform(VectorNode::make(Op_AndV, res, bcast_mod, vt)); > 621: } else { > 622: res = partially_wrap_indexes(res, num_elem, elem_bt); Original spacing here was correct. The one at line 621 is wrong and has to be fixed. src/hotspot/share/opto/vectorIntrinsics.cpp line 2308: > 2306: !arch_supports_vector(Op_AndV, num_elem_to, elem_bt_to, VecMaskNotUsed) || > 2307: !arch_supports_vector(Op_Replicate, num_elem_to, elem_bt_to, VecMaskNotUsed))) { > 2308: return false; Please add `log_if_needed("` here too. 
------------- PR Review: https://git.openjdk.org/jdk/pull/19442#pullrequestreview-2097045286 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1626403018 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1626398694 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1626407248 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1626406018 From cslucas at openjdk.org Tue Jun 4 19:11:20 2024 From: cslucas at openjdk.org (Cesar Soares Lucas) Date: Tue, 4 Jun 2024 19:11:20 GMT Subject: RFR: 8331736: C2: Live Node limit exceeded limit after JDK-8316991 Message-ID: Please, consider this patch to interrupt execution of `split_unique_types` (SUT) when number of live nodes reaches 3/4 of `max_live_nodes`. The included test case reproduces the problem. The number of live nodes before running phase 3 of SUT is ~20k, after processing about 750 mergemem nodes the number of live nodes is over 70k. This problem was first encountered when running an old `.jar` file that was created before `invokedynamic` optimizations - that's why I disable string optimizations in the test case. The test case is strongly based on [the method that was originally triggering the problem](https://github.com/Unidata/netcdf-java/blob/c782ef80ab54a09befd6d5065c6baeed54949222/cdm/radial/src/main/java/ucar/nc2/iosp/nids/Nidsheader.java#L2174). Tested on Linux, Win, Mac x86_64 tier1-3 and GHA. ------------- Commit messages: - Throttle split unique types if number of live nodes is too big. 
Changes: https://git.openjdk.org/jdk/pull/19545/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19545&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8331736 Stats: 180 lines in 2 files changed: 180 ins; 0 del; 0 mod Patch: https://git.openjdk.org/jdk/pull/19545.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19545/head:pull/19545 PR: https://git.openjdk.org/jdk/pull/19545 From dlong at openjdk.org Tue Jun 4 19:43:59 2024 From: dlong at openjdk.org (Dean Long) Date: Tue, 4 Jun 2024 19:43:59 GMT Subject: RFR: 8326615: C1/C2 don't handle allocation failure properly during initialization (RuntimeStub::new_runtime_stub fatal crash) [v5] In-Reply-To: References: Message-ID: On Tue, 28 May 2024 07:16:15 GMT, Damon Fenacci wrote: >> # Issue >> >> The test `compiler/startup/StartupOutput.java` fails intermittently due to a crash after correctly printing the error `Initial size of CodeCache is too small` (the test limits the code cache using `-XX:InitialCodeCacheSize=1024K -XX:ReservedCodeCacheSize=1200k`). >> The appearance of the issue is very dependent on thread scheduling. The original report happens during C1 initialization but C2 initialization is affected as well. >> >> # Causes >> >> There is one occurrence during C1 initialization and one during C2 initialization where a call to `RuntimeStub::new_runtime_stub` can fail fatally if there is not enough space left. >> For C1: `Compiler::init_c1_runtime` -> `Runtime1::initialize` -> `Runtime1::generate_blob_for` -> `Runtime1::generate_blob` -> `RuntimeStub::new_runtime_stub`. >> For C2: `C2Compiler::initialize` -> `OptoRuntime::generate` -> `OptoRuntime::generate_stub` -> `Compile::Compile` -> `Compile::Code_Gen` -> `PhaseOutput::install` -> `PhaseOutput::install_stub` -> `RuntimeStub::new_runtime_stub`. >> >> # Solution >> >> https://github.com/openjdk/jdk/pull/15970 introduced an optional argument to `RuntimeStub::new_runtime_stub` to determine if it fails fatally or not. 
We can take advantage of it to avoid crashing and instead pass the information about the success or failure of the allocation up the (C1 and C2 initialization) call stack up to where we can set the compilations as failed. > > Damon Fenacci has updated the pull request incrementally with three additional commits since the last revision: > > - Update src/hotspot/share/gc/z/c1/zBarrierSetC1.cpp > > Co-authored-by: Tobias Hartmann > - Update src/hotspot/share/gc/z/c1/zBarrierSetC1.cpp > > Co-authored-by: Tobias Hartmann > - Update src/hotspot/share/gc/x/c1/xBarrierSetC1.cpp > > Co-authored-by: Tobias Hartmann It may not fail, but if it can't create a C1 or C2 compiler, then that's bad, and we might argue that this kind of failure should be similar to a failure during JVM startup. In fact, I have been thinking that there are reasons why we might want these compiler stubs to be created earlier in startup, when we are still single-threaded. That would get rid of any races, and make a failure fatal. It would also allow us to put these stubs, if they are effectively execute-only because they don't need to be patched, into a special JIT region, avoiding the MAP_JIT overhead on macos-aarch64 (see JDK-8331978). ------------- PR Comment: https://git.openjdk.org/jdk/pull/19280#issuecomment-2148286373 From liach at openjdk.org Wed Jun 5 00:55:03 2024 From: liach at openjdk.org (Chen Liang) Date: Wed, 5 Jun 2024 00:55:03 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 07:47:46 GMT, SendaoYan wrote: > Hi all, > This PR deletes several extra empty spaces and extra empty lines in several Makefiles. It's a trivial fix, no risk. > > Thanks. @altrisi As trivial and low-effort as this seems, this is actually fixing some technical debt for legacy Makefiles last changed before 8e7a855ee8f085cee080395058f79c8a75bfef40 when trailing whitespace checks applied to Makefiles. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19537#issuecomment-2148641237 From gcao at openjdk.org Wed Jun 5 01:32:58 2024 From: gcao at openjdk.org (Gui Cao) Date: Wed, 5 Jun 2024 01:32:58 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: <-qWG3SDA1axsnT9WwUis0XNhhGXYnHqjqfAiXaR6AbU=.99069589-d57c-4440-925b-947b5247e9e8@github.com> References: <-qWG3SDA1axsnT9WwUis0XNhhGXYnHqjqfAiXaR6AbU=.99069589-d57c-4440-925b-947b5247e9e8@github.com> Message-ID: On Tue, 4 Jun 2024 08:26:08 GMT, Galder Zamarreño wrote: >> Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. >> >> The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. >> >> There's no barrier added on x86 c1 macro assembler for nothing to do there. >> >> I've run the following tests: >> * tier 1 on darwin/aarch64 >> * tier 1 on linux/x86_64 >> * `hotspot_compiler` tests on darwin/aarch64 >> * `copy.clone.arrays` jcstress tests on darwin/aarch64. >> >> I tried but was unable to create a standalone test for the jdk source tree that would fail. >> >> FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. > > Forgot to say, this change shows no impact on the array clone micro benchmark. This is expected since the change moves the barrier from one place (after array creation) to another (after copying contents), so no additional barriers are introduced. @galderz Hi, I have finished the RISC-V part, tier1-3 tested on SOPHON SG2042. Please help us to add the RISC-V part, thanks a lot! 
[19538-riscv-port-v1.diff.txt](https://github.com/user-attachments/files/15571986/19538-riscv-port-v1.diff.txt) ------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2148694857 From duke at openjdk.org Wed Jun 5 02:23:56 2024 From: duke at openjdk.org (kuaiwei) Date: Wed, 5 Jun 2024 02:23:56 GMT Subject: RFR: 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 14:49:16 GMT, Christian Hagedorn wrote: > Looks good to me, too. Thanks. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19518#issuecomment-2148737342 From jbhateja at openjdk.org Wed Jun 5 02:34:26 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Wed, 5 Jun 2024 02:34:26 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v3] In-Reply-To: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: > Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. > > This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. > > Kindly review and share your feedback. 
> > Best Regards, > Jatin > > PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) > > [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 > [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: Review comments resolutions. ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19442/files - new: https://git.openjdk.org/jdk/pull/19442/files/102b78ae..16996e57 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19442&range=02 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19442&range=01-02 Stats: 16 lines in 1 file changed: 12 ins; 0 del; 4 mod Patch: https://git.openjdk.org/jdk/pull/19442.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19442/head:pull/19442 PR: https://git.openjdk.org/jdk/pull/19442 From syan at openjdk.org Wed Jun 5 05:13:01 2024 From: syan at openjdk.org (SendaoYan) Date: Wed, 5 Jun 2024 05:13:01 GMT Subject: Integrated: 8332499: Gtest codestrings.validate_vm fail on linux x64 when hsdis is present In-Reply-To: References: Message-ID: <36o78ZoM876FobO9d3G6qGS5Hxt7ySL4IocrBoHd6ms=.c783d2ee-4671-41bd-94b2-28bbb6f9766b@github.com> On Mon, 20 May 2024 12:24:09 GMT, SendaoYan wrote: > Hi all, > There's some arch-specific code to trim trailing entries as descripted in [JDK-8332499](https://bugs.openjdk.org/browse/JDK-8332499). Only change the gtest testcase, the risk is low. 
> > On linux x86_64, before this PR, after deal with `std::regex_replace(tmp4, std::regex("\\s+:\\s+hlt[ \\t]+(?!\\n\\s+;;)"), "")`, the output differents because the first output has trailing empty spaces, show as below: > > - : nop > + : nop > > So we need to delete the empty spaces after `: nop` use `std::regex_replace(tmp5, std::regex("(\\s+:\\s+nop)[ \\t]*"), "$1")` > > > Additional test: > - [x] codestrings.validate_vm on linux x64 > - [x] codestrings.validate_vm on linux aarch64 > - [x] codestrings.validate_vm on linux riscv64 This pull request has now been integrated. Changeset: 7dbd0338 Author: SendaoYan Committer: Tobias Hartmann URL: https://git.openjdk.org/jdk/commit/7dbd03388eef9cddbab6a622338b00ce250be3dc Stats: 2 lines in 1 file changed: 1 ins; 0 del; 1 mod 8332499: Gtest codestrings.validate_vm fail on linux x64 when hsdis is present Reviewed-by: thartmann, chagedorn ------------- PR: https://git.openjdk.org/jdk/pull/19309 From thartmann at openjdk.org Wed Jun 5 05:42:57 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Wed, 5 Jun 2024 05:42:57 GMT Subject: RFR: 8333334: C2: Make result of `Node::dominates` more precise to enhance scalar replacement In-Reply-To: References: Message-ID: On Fri, 31 May 2024 09:01:38 GMT, MaxXing wrote: > This patch changes the algorithm of `Node::dominates` to make the result more precise, and allows the iterators of `ConcurrentHashMap` to be scalar replaced. > > The previous algorithm will return a conservative result when encountering a dead control flow, and only try the first two input paths of a multi-input Region node, which may prevent the scalar replacement in some cases. > > For example, with G1 GC enabled, C2 generates GC barriers for `ConcurrentHashMap` iteration operations at some early phases, and then eliminates them in a later IGVN, but `LoadNode` is also idealized in the same IGVN. 
This causes `LoadNode::Ideal` to see some dead barrier control flows, and refuse to split some instance field loads through Phi due to the conservative result of `Node::dominates`, and thus the scalar replacement can not be applied to iterators in the later macro elimination phase. > > This patch allows `Node::dominates` to try other paths of the last multi-input Region node when the first path is dead, and makes `ConcurrentHashMap` iteration ~30% faster: > > > Benchmark (nkeys) Mode Cnt Score Error Units > Maps.testConcurrentHashMapIterators 10000 avgt 15 414099.085 ± 33230.945 ns/op # baseline > Maps.testConcurrentHashMapIterators 10000 avgt 15 315490.281 ± 3037.056 ns/op # patch > > > Testing: tier1-4. Impressive results! I haven't looked at the change yet but here are a few questions / requests: - Could you add a screenshot of the IR of the case you are describing? - Wouldn't it help to add the LoadNode back to the IGVN worklist and wait for the dead path to be removed? - Could you add an [IR framework](https://github.com/openjdk/jdk/blob/master/test/hotspot/jtreg/compiler/lib/ir_framework/README.md) test that verifies that the optimization works as expected? Thanks, Tobias ------------- PR Review: https://git.openjdk.org/jdk/pull/19496#pullrequestreview-2098076379 From thartmann at openjdk.org Wed Jun 5 06:01:58 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Wed, 5 Jun 2024 06:01:58 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 12:39:05 GMT, Christian Hagedorn wrote: > If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. Is that comment incorrect then? 
https://github.com/openjdk/jdk/blob/2edb6d98133d8bd6dc4527c7497c460283fdc53e/src/hotspot/share/opto/loopopts.cpp#L2993-L2994 ------------- PR Review: https://git.openjdk.org/jdk/pull/19522#pullrequestreview-2098103875 From syan at openjdk.org Wed Jun 5 06:34:01 2024 From: syan at openjdk.org (SendaoYan) Date: Wed, 5 Jun 2024 06:34:01 GMT Subject: RFR: 8332499: Gtest codestrings.validate_vm fail on linux x64 when hsdis is present [v5] In-Reply-To: References: Message-ID: On Tue, 28 May 2024 15:47:25 GMT, SendaoYan wrote: >> Hi all, >> There's some arch-specific code to trim trailing entries as descripted in [JDK-8332499](https://bugs.openjdk.org/browse/JDK-8332499). Only change the gtest testcase, the risk is low. >> >> On linux x86_64, before this PR, after deal with `std::regex_replace(tmp4, std::regex("\\s+:\\s+hlt[ \\t]+(?!\\n\\s+;;)"), "")`, the output differents because the first output has trailing empty spaces, show as below: >> >> - : nop >> + : nop >> >> So we need to delete the empty spaces after `: nop` use `std::regex_replace(tmp5, std::regex("(\\s+:\\s+nop)[ \\t]*"), "$1")` >> >> >> Additional test: >> - [x] codestrings.validate_vm on linux x64 >> - [x] codestrings.validate_vm on linux aarch64 >> - [x] codestrings.validate_vm on linux riscv64 > > SendaoYan has updated the pull request incrementally with two additional commits since the last revision: > > - Merge branch 'jbs8332499' of github.com:sendaoYan/jdk-ysd into jbs8332499 > - delete the empty spaces after : nop Thanks all for the review and sponsor. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19309#issuecomment-2148985641 From epeter at openjdk.org Wed Jun 5 06:48:59 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Wed, 5 Jun 2024 06:48:59 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v3] In-Reply-To: References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: <1o2ba7vrqVN7L2qVFAVeT_nHbae7UOHiQg_7HxUmeDk=.98694c70-1895-49ef-9110-d71e2e4bbfb5@github.com> On Wed, 5 Jun 2024 02:34:26 GMT, Jatin Bhateja wrote: >> Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. >> >> This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin >> >> PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 >> [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review comments resolutions. Changes requested by epeter (Reviewer). src/hotspot/share/opto/vectorIntrinsics.cpp line 525: > 523: // indexes. > 524: Node* LibraryCallKit::partially_wrap_indexes(Node* index_vec, int num_elem, BasicType elem_bt) { > 525: assert(elem_bt == T_BYTE, ""); Write message in assert: why is it limited to byte? 
src/hotspot/share/opto/vectorIntrinsics.cpp line 530: > 528: > 529: Node* mod_val = gvn().makecon(TypeInt::make(num_elem-1)); > 530: Node* bcast_mod = gvn().transform(VectorNode::scalar2vector(mod_val, num_elem, type_bt)); Naming issue: this is not the result of the mod, so "mod" is a bit misleading. I would use `mask`, as it is used as a mask in the AndV below. test/hotspot/jtreg/compiler/vectorapi/TestTwoVectorPermute.java line 29: > 27: * @summary Incorrect IllegalArgumentException for C2 compiled permute kernel > 28: * @modules jdk.incubator.vector > 29: * @requires vm.compiler2.enabled Is this necessary to restrict to C2? Maybe this test tickles something for other compilers as well. test/hotspot/jtreg/compiler/vectorapi/TestTwoVectorPermute.java line 32: > 30: * @library /test/lib / > 31: * @run main/othervm -XX:+UnlockDiagnosticVMOptions -Xbatch -XX:-TieredCompilation -XX:CompileOnly=TestTwoVectorPermute::micro compiler.vectorapi.TestTwoVectorPermute > 32: * @run main/othervm -XX:+UnlockDiagnosticVMOptions -Xbatch -XX:-TieredCompilation compiler.vectorapi.TestTwoVectorPermute I would also add a run without `-XX:-TieredCompilation`, that could lead to different compilation patterns, and increase our test coverage. 
test/hotspot/jtreg/compiler/vectorapi/TestTwoVectorPermute.java line 44: > 42: public static final VectorSpecies FSP = FloatVector.SPECIES_256; > 43: > 44: public static void validate(float [] res, float [] shuf, float [] src1, float [] src2) { Suggestion: public static void validate(float[] res, float[] shuf, float[] src1, float[] src2) { ------------- PR Review: https://git.openjdk.org/jdk/pull/19442#pullrequestreview-2098143389 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1627046199 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1627047910 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1627062793 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1627063960 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1627065037 From epeter at openjdk.org Wed Jun 5 06:49:00 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Wed, 5 Jun 2024 06:49:00 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v3] In-Reply-To: <1o2ba7vrqVN7L2qVFAVeT_nHbae7UOHiQg_7HxUmeDk=.98694c70-1895-49ef-9110-d71e2e4bbfb5@github.com> References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> <1o2ba7vrqVN7L2qVFAVeT_nHbae7UOHiQg_7HxUmeDk=.98694c70-1895-49ef-9110-d71e2e4bbfb5@github.com> Message-ID: On Wed, 5 Jun 2024 06:25:41 GMT, Emanuel Peter wrote: >> Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: >> >> Review comments resolutions. > > src/hotspot/share/opto/vectorIntrinsics.cpp line 530: > >> 528: >> 529: Node* mod_val = gvn().makecon(TypeInt::make(num_elem-1)); >> 530: Node* bcast_mod = gvn().transform(VectorNode::scalar2vector(mod_val, num_elem, type_bt)); > > Naming issue: this is not the result of the mod, so "mod" is a bit misleading. I would use `mask`, as it is used as a mask in the AndV below. 
Also: it seems to me that you are duplicating these 4 lines above from its call-site. I wonder if this means that you are slicing the boundary of your new method right, or if maybe the whole if-else block from the call-site should be a new method? > test/hotspot/jtreg/compiler/vectorapi/TestTwoVectorPermute.java line 44: > >> 42: public static final VectorSpecies FSP = FloatVector.SPECIES_256; >> 43: >> 44: public static void validate(float [] res, float [] shuf, float [] src1, float [] src2) { > > Suggestion: > > public static void validate(float[] res, float[] shuf, float[] src1, float[] src2) { Similar issues below. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1627057938 PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1627065396 From rrich at openjdk.org Wed Jun 5 06:54:10 2024 From: rrich at openjdk.org (Richard Reingruber) Date: Wed, 5 Jun 2024 06:54:10 GMT Subject: RFR: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store [v5] In-Reply-To: References: Message-ID: > This pr adds a few tweaks to [JDK-8318446](https://bugs.openjdk.org/browse/JDK-8318446) which allows enabling it also on big endian platforms (e.g. AIX, S390). JDK-8318446 introduced a C2 optimization to replace consecutive stores to a primitive array with just one store. 
> > By example (from `TestMergeStores.java`): > > > static Object[] test2a(byte[] a, int offset, long v) { > if (IS_BIG_ENDIAN) { > a[offset + 0] = (byte)(v >> 56); > a[offset + 1] = (byte)(v >> 48); > a[offset + 2] = (byte)(v >> 40); > a[offset + 3] = (byte)(v >> 32); > a[offset + 4] = (byte)(v >> 24); > a[offset + 5] = (byte)(v >> 16); > a[offset + 6] = (byte)(v >> 8); > a[offset + 7] = (byte)(v >> 0); > } else { > a[offset + 0] = (byte)(v >> 0); > a[offset + 1] = (byte)(v >> 8); > a[offset + 2] = (byte)(v >> 16); > a[offset + 3] = (byte)(v >> 24); > a[offset + 4] = (byte)(v >> 32); > a[offset + 5] = (byte)(v >> 40); > a[offset + 6] = (byte)(v >> 48); > a[offset + 7] = (byte)(v >> 56); > } > return new Object[]{ a }; > } > > > Depending on the endianess 8 bytes are stored into an array. The order of the stores is the same as the order of an 8-byte-store therefore 8 1-byte-stores can be replaced with just one 8-byte-store (if there aren't too many range checks). > > Additionally I've fixed a few comments and a test bug. > > The optimization seems to be a little bit more effective on big endian platforms. > > Again by example: > > > static Object[] test800a(byte[] a, int offset, long v) { > if (IS_BIG_ENDIAN) { > a[offset + 0] = (byte)(v >> 40); // Removed from candidate list > a[offset + 1] = (byte)(v >> 32); // Removed from candidate list > a[offset + 2] = (byte)(v >> 24); // Merged > a[offset + 3] = (byte)(v >> 16); // Merged > a[offset + 4] = (byte)(v >> 8); // Merged > a[offset + 5] = (byte)(v >> 0); // Merged > } else { > a[offset + 0] = (byte)(v >> 0); // Removed from candidate list > a[offset + 1] = (byte)(v >> 8); // Removed from candidate list > a[offset + 2] = (byte)(v >> 16); // Not merged > a[offset + 3] = (byte)(v >> 24); // Not merged > a[offset + 4] = (byte)(v >> 32); // Not merged > a[offset + 5] = (byte)(v >> 40); // Not merged > } > return new Object[]{ a };... 
Richard Reingruber has updated the pull request incrementally with one additional commit since the last revision: Feedback Emanuel ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19218/files - new: https://git.openjdk.org/jdk/pull/19218/files/3169a310..fc870e2b Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19218&range=04 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19218&range=03-04 Stats: 5 lines in 1 file changed: 0 ins; 2 del; 3 mod Patch: https://git.openjdk.org/jdk/pull/19218.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19218/head:pull/19218 PR: https://git.openjdk.org/jdk/pull/19218 From rrich at openjdk.org Wed Jun 5 06:56:57 2024 From: rrich at openjdk.org (Richard Reingruber) Date: Wed, 5 Jun 2024 06:56:57 GMT Subject: RFR: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store [v4] In-Reply-To: References: Message-ID: On Fri, 24 May 2024 08:18:41 GMT, Emanuel Peter wrote: >> Richard Reingruber has updated the pull request incrementally with one additional commit since the last revision: >> >> Eliminate IS_BIG_ENDIAN and always execute both variants > > src/hotspot/share/opto/memnode.cpp line 3310: > >> 3308: Node* hi = first->in(MemNode::ValueIn); >> 3309: Node* lo = _store->in(MemNode::ValueIn); >> 3310: #endif // VM_LITTLE_ENDIAN > > A `swap` could be more concise. But I leave that up to you ;) You're right. It's better to just swap `hi` with `lo` and it matches the comment. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19218#discussion_r1627088951 From epeter at openjdk.org Wed Jun 5 06:58:01 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Wed, 5 Jun 2024 06:58:01 GMT Subject: RFR: 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes [v2] In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 15:58:39 GMT, Christian Hagedorn wrote: >> [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and not non-null-checks which both use `Opaque4` nodes. >> >> #### Correct Assertion >> One of this assert was now hit with a fuzzer found case in `get_assertion_predicates()` called during the elimination of useless predicates. We walk through all loops and collect all useful Template Assertion Predicates and Parse Predicates above the loops. For that we look at the UCTs which are shared among the predicates. When finding a predicate with such an UCT which also has an `Opaque4` node, we know that it is a Template Assertion Predicate. We additionally assert that we must find the `OpaqueLoop*Nodes` above which always belong to a template: >> >> https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L346-L354 >> >> So, this assert looks correct. >> >> #### Why didn't we find `OpaqueLoop*Nodes` in this case? 
>> For the Template Assertion Predicate for the last value, we insert an additional `CastII` to keep the type information of the iv phi: >> >> https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L1323-L1324 >> >> But in the test case, the type of the iv phi is a constant (`521 CastII`): >> >> ![image](https://github.com/openjdk/jdk/assets/17833009/5dc17b9c-abfe-4846-89a1-4e189234b991) >> >> `521 CastII` will simply be replaced with a constant during IGVN and the `OpaqueLoop*Nodes` above are removed. We therefore cannot find them anymore later when trying to eliminate useless predicates and we hit the assert. >> >> #### Why does the `CastII`/iv phi have a constant type? >> Having a constant type for the iv phi indicates that the counted loop is only going to be executed for one iteration. But C2 has not had the chance, yet, to fold the loop exit test to remove the loop. >> >> #### How to fix this bug? >> Having a single iteration loop raises the question, why we even bother to try and hoist checks out of such a loop with Loop Predication in the first place. I therefore suggest to simply bail out of Loop Predication if the trip count is 1. This will also prevent us from creating a Template Assertion Predicate with a `CastII` with a constant type from the iv phi which would be folded. >> >> To do that, we can compute the trip count on entry of Loop Predication. By doing that, we can also re... > > Christian Hagedorn has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. 
The pull request contains three additional commits since the last revision: > > - Add braces > - Merge branch 'refs/heads/master' into JDK-8333252 > - 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes Looks good to me :) test/hotspot/jtreg/compiler/predicates/assertion/TestTemplateWithoutOpaqueLoopNodes.java line 29: > 27: * @summary Test that no Template Assertion Predicate is created in Loop Prediction for one iteration loop. > 28: * @run main/othervm -Xcomp -XX:CompileCommand=compileonly,*TestTemplateWithoutOpaqueLoopNodes::test > 29: * compiler.predicates.assertion.TestTemplateWithoutOpaqueLoopNodes Could it make sense to repeat calling `test` a few times more, and also add a run without Xcomp? I leave that to you, I guess the fuzzer generates these patterns so if there was a bug it would find it. ------------- Marked as reviewed by epeter (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19500#pullrequestreview-2098190761 PR Review Comment: https://git.openjdk.org/jdk/pull/19500#discussion_r1627086066 From epeter at openjdk.org Wed Jun 5 07:15:58 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Wed, 5 Jun 2024 07:15:58 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit In-Reply-To: References: Message-ID: <3bxGw3LLx8Tde1JAFK7-oizimUl1S7x9j-VnHDJYnZM=.37437a3c-ce88-41c9-b037-9bd0652ca587@github.com> On Mon, 3 Jun 2024 12:39:05 GMT, Christian Hagedorn wrote: > A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. > > #### Idea of Partial Peeling > > Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). 
> > #### Partial Peeling with Unsigned Test > > However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 > > #### Requirements for Using an Unsigned Test > > The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: > - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. > - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. > > #### The Requirements Are Currently Broken > > This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 > > #### Why Are the Requirements Broken? > > The reason is that > > i >=u limit > > can only be converted into the two signed comparisons > > i < 0 || i >= limit > > if `limit` is non-negative (i.e. `limit >= 0`): > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 > > This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. 
> > #### Fixing the Broken Requirements > To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of ... More to come later src/hotspot/share/opto/loopopts.cpp line 3052: > 3050: // limit >= 0 (COND) > 3051: // then the unsigned loop exit condition is equivalent to the signed loop exit condition > 3052: // i < 0 || i >= limit Could we come up with an alternative equation if `limit < 0`? And maybe even a combined condition? Not sure if that is helpful, but I'd like to think about it. Could also be a follow-up RFE. src/hotspot/share/opto/loopopts.cpp line 3068: > 3066: } > 3067: > 3068: // For stride < 0, we split off the signed loop exit condition Suggestion: // For stride < 0, we insert off the signed loop exit condition Why do you say "split", we are inserting this extra check, right? Or is the idea that the "rotation" puts this new condition as the last check, hence the exit check in the loop body? Being a bit more explicit could help here. src/hotspot/share/opto/loopopts.cpp line 3087: > 3085: // "Signed Loop Exit Test" implies "Unsigned Loop Exit Test" > 3086: // This is trivially given: > 3087: // - Stride < 0: Suggestion: // - stride < 0: Stylistic decision, leave this to you src/hotspot/share/opto/loopopts.cpp line 3104: > 3102: // > 3103: // > 3104: // Signed Loop Exit Condition i < 0 or i >= limit There is not literally an `OR` here, right? It's a bit confusing. 
------------- PR Review: https://git.openjdk.org/jdk/pull/19522#pullrequestreview-2098213690 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627105115 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627116743 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627112350 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627118565 From rrich at openjdk.org Wed Jun 5 07:30:04 2024 From: rrich at openjdk.org (Richard Reingruber) Date: Wed, 5 Jun 2024 07:30:04 GMT Subject: RFR: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store [v4] In-Reply-To: References: Message-ID: On Fri, 24 May 2024 08:24:39 GMT, Emanuel Peter wrote: > I'm running testing again, but the code looks good now! > > I just had another idea: Could we use some sort of "byte reverse / shuffle" operation to do these use cases for both big/little-endian? > > ``` > storeBytes(bytes, offset, (byte)(value >> 8), > (byte)(value >> 0)); > > storeBytes(bytes, offset, (byte)(value >> 0), > (byte)(value >> 8)); > ``` > > Not sure if that would be profitable or even available on all platforms. Could be a future RFE someone can work on after this. What do you think? It might make performance more predictable across platforms. You mean to combine the stores even if the explicit ordering does not match the ordering of the store instruction, adding a `ReverseBytes[SIL]Node` iff supported in that case, right? I've been thinking about this, too. In my opinion it would be worthwhile. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19218#issuecomment-2149074109 From rrich at openjdk.org Wed Jun 5 07:34:01 2024 From: rrich at openjdk.org (Richard Reingruber) Date: Wed, 5 Jun 2024 07:34:01 GMT Subject: RFR: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store [v4] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 07:27:04 GMT, Richard Reingruber wrote: >> I'm running testing again, but the code looks good now! >> >> I just had another idea: >> Could we use some sort of "byte reverse / shuffle" operation to do these use cases for both big/little-endian? >> >> >> storeBytes(bytes, offset, (byte)(value >> 8), >> (byte)(value >> 0)); >> >> storeBytes(bytes, offset, (byte)(value >> 0), >> (byte)(value >> 8)); >> >> >> Not sure if that would be profitable or even available on all platforms. Could be a future RFE someone can work on after this. What do you think? It might make performance more predictable across platforms. > >> I'm running testing again, but the code looks good now! >> >> I just had another idea: Could we use some sort of "byte reverse / shuffle" operation to do these use cases for both big/little-endian? >> >> ``` >> storeBytes(bytes, offset, (byte)(value >> 8), >> (byte)(value >> 0)); >> >> storeBytes(bytes, offset, (byte)(value >> 0), >> (byte)(value >> 8)); >> ``` >> >> Not sure if that would be profitable or even available on all platforms. Could be a future RFE someone can work on after this. What do you think? It might make performance more predictable across platforms. > > You mean to combine the stores even if the explicit ordering does not match the ordering of the store instruction, adding a `ReverseBytes[SIL]Node` iff supported in that case, right? I've been thinking about this, too. In my opinion it would be worthwhile. 
> @reinrich please still wait until the JDK24 fork on Thursday to integrate, so that we do not have to backport possible regression fixes - I had 3 or 4 with my original patch ;) Thanks for the reviews @eme64 and @vnkozlov! I'll integrate after the code split if more local testing is successful. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19218#issuecomment-2149082168 From thartmann at openjdk.org Wed Jun 5 07:48:58 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Wed, 5 Jun 2024 07:48:58 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 12:39:05 GMT, Christian Hagedorn wrote: > A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. > > #### Idea of Partial Peeling > > Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). > > #### Partial Peeling with Unsigned Test > > However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 > > #### Requirements for Using an Unsigned Test > > The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: > - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. 
> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. > > #### The Requirements Are Currently Broken > > This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 > > #### Why Are the Requirements Broken? > > The reason is that > > i >=u limit > > can only be converted into the two signed comparisons > > i < 0 || i >= limit > > if `limit` is non-negative (i.e. `limit >= 0`): > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 > > This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. > > #### Fixing the Broken Requirements > To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of ... Looks great otherwise. Nice test and proof! src/hotspot/share/opto/loopopts.cpp line 3054: > 3052: // i < 0 || i >= limit > 3053: // > 3054: // Note that this does not hold for limit < 0: As we discussed, rephrase this as counterexample. 
src/hotspot/share/opto/loopopts.cpp line 3076: > 3074: // Loop: > 3075: // > 3076: // Signed Loop Exit Condition i < 0 or i >= limit Suggestion: // Signed Loop Exit Condition i < 0 (or i >= limit) src/hotspot/share/opto/loopopts.cpp line 3090: > 3088: // i < 0 // Signed Loop Exit Condition > 3089: // i >u MAX_INT // all negative values are greater than MAX_INT when converted to unsigned > 3090: // i >=u limit // limit <= MAX_INT (trivially) and since limit >= 0 assumption (COND) As we discussed, an intermediate step in the proof would be good here. src/hotspot/share/opto/loopopts.cpp line 3099: > 3097: // After Partial Peeling, we have the following structure: > 3098: // > 3099: // Signed Loop Exit Condition i < 0 or i >= limit Suggestion: // Signed Loop Exit Condition i < 0 (or i >= limit) src/hotspot/share/opto/loopopts.cpp line 3104: > 3102: // > 3103: // > 3104: // Signed Loop Exit Condition i < 0 or i >= limit Suggestion: // Signed Loop Exit Condition i < 0 (or i >= limit) test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java line 145: > 143: // > 144: > 145: // Found as loop head in ciTypeFlow, but both path inside loop -> head not cloned. Suggestion: // Found as loop head in ciTypeFlow, but both paths inside loop -> head not cloned. ------------- Marked as reviewed by thartmann (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19522#pullrequestreview-2098147678 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627150450 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627145726 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627176821 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627144467 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627144737 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627050035 From chagedorn at openjdk.org Wed Jun 5 08:06:35 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 08:06:35 GMT Subject: RFR: 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes [v3] In-Reply-To: References: Message-ID: > [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and not non-null-checks which both use `Opaque4` nodes. > > #### Correct Assertion > One of this assert was now hit with a fuzzer found case in `get_assertion_predicates()` called during the elimination of useless predicates. We walk through all loops and collect all useful Template Assertion Predicates and Parse Predicates above the loops. For that we look at the UCTs which are shared among the predicates. When finding a predicate with such an UCT which also has an `Opaque4` node, we know that it is a Template Assertion Predicate. We additionally assert that we must find the `OpaqueLoop*Nodes` above which always belong to a template: > > https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L346-L354 > > So, this assert looks correct. > > #### Why didn't we find `OpaqueLoop*Nodes` in this case? 
> For the Template Assertion Predicate for the last value, we insert an additional `CastII` to keep the type information of the iv phi: > > https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L1323-L1324 > > But in the test case, the type of the iv phi is a constant (`521 CastII`): > > ![image](https://github.com/openjdk/jdk/assets/17833009/5dc17b9c-abfe-4846-89a1-4e189234b991) > > `521 CastII` will simply be replaced with a constant during IGVN and the `OpaqueLoop*Nodes` above are removed. We therefore cannot find them anymore later when trying to eliminate useless predicates and we hit the assert. > > #### Why does the `CastII`/iv phi have a constant type? > Having a constant type for the iv phi indicates that the counted loop is only going to be executed for one iteration. But C2 has not had the chance, yet, to fold the loop exit test to remove the loop. > > #### How to fix this bug? > Having a single iteration loop raises the question, why we even bother to try and hoist checks out of such a loop with Loop Predication in the first place. I therefore suggest to simply bail out of Loop Predication if the trip count is 1. This will also prevent us from creating a Template Assertion Predicate with a `CastII` with a constant type from the iv phi which would be folded. > > To do that, we can compute the trip count on entry of Loop Predication. By doing that, we can also remove the trip count computation added for hoisting ran... 
Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: Adding Xbatch run ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19500/files - new: https://git.openjdk.org/jdk/pull/19500/files/fdf7ebc1..42001e31 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19500&range=02 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19500&range=01-02 Stats: 13 lines in 1 file changed: 11 ins; 0 del; 2 mod Patch: https://git.openjdk.org/jdk/pull/19500.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19500/head:pull/19500 PR: https://git.openjdk.org/jdk/pull/19500 From chagedorn at openjdk.org Wed Jun 5 08:06:36 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 08:06:36 GMT Subject: RFR: 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes [v2] In-Reply-To: References: Message-ID: <6L9NoDoenlkK3vfNKrGQq2xUF_TKXfyQjTnUPMxmMEc=.2a0f45b8-5430-425f-9334-801dd7c5074e@github.com> On Wed, 5 Jun 2024 06:52:17 GMT, Emanuel Peter wrote: >> Christian Hagedorn has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains three additional commits since the last revision: >> >> - Add braces >> - Merge branch 'refs/heads/master' into JDK-8333252 >> - 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes > > test/hotspot/jtreg/compiler/predicates/assertion/TestTemplateWithoutOpaqueLoopNodes.java line 29: > >> 27: * @summary Test that no Template Assertion Predicate is created in Loop Prediction for one iteration loop. 
>> 28: * @run main/othervm -Xcomp -XX:CompileCommand=compileonly,*TestTemplateWithoutOpaqueLoopNodes::test >> 29: * compiler.predicates.assertion.TestTemplateWithoutOpaqueLoopNodes > > Could it make sense to repeat calling `test` a few times more, and also add a run without Xcomp? I leave that to you, I guess the fuzzer generates these patterns so if there was a bug it would find it. I guess it does not hurt to add such a run. I've pushed an update ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19500#discussion_r1627203523 From gcao at openjdk.org Wed Jun 5 08:22:25 2024 From: gcao at openjdk.org (Gui Cao) Date: Wed, 5 Jun 2024 08:22:25 GMT Subject: RFR: 8333248: VectorGatherMaskFoldingTest.java failed when maximum vector bits is 64 [v2] In-Reply-To: References: Message-ID: > Hi, VectorGatherMaskFoldingTest.java Test fails when max vector bits is 64, when max vector bits is 64, LongVector.SPECIES_MAX.length() and DoubleVector.SPECIES_MAX.length() is 1. > > We can reproduce this problem in two ways: > 1. We can use riscv without rvv1.0 board to reproduce this problem > 2. Run VectorGatherMaskFoldingTest.java on aarch64 client mode without `-XX:+IncrementalInlineForceCleanup` Option, the `-XX:+IncrementalInlineForceCleanup` is C2 Option, so we need to remove this Option from the VectorGatherMaskFoldingTest.main method. error message: > > Base Test: @Test testDoubleVectorStoreLoadMaskedVector: > compiler.lib.ir_framework.shared.TestRunException: There was an error while invoking @Test method public static void compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(). Target: null. 
Arguments: > at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:84) > at compiler.lib.ir_framework.test.BaseTest.invokeTest(BaseTest.java:71) > at compiler.lib.ir_framework.test.AbstractTest.run(AbstractTest.java:98) > at compiler.lib.ir_framework.test.TestVM.runTests(TestVM.java:861) > at compiler.lib.ir_framework.test.TestVM.start(TestVM.java:252) > at compiler.lib.ir_framework.test.TestVM.main(TestVM.java:165) > Caused by: java.lang.reflect.InvocationTargetException > at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:118) > at java.base/java.lang.reflect.Method.invoke(Method.java:580) > at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:80) > ... 5 more > Caused by: java.lang.RuntimeException: assertNotEquals: expected [1.0] to not equal [1.0] > at jdk.test.lib.Asserts.fail(Asserts.java:691) > at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:451) > at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:435) > at compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(VectorGatherMaskFoldingTest.java:1089) > at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103) > ... 7 more > > > For example, the following method will be failed: > > private static final VectorSpecies L_SPECIES = LongVector.SPECIES_MAX; > private static final VectorSpecies D_SPECIES = DoubleVector.SPECIES_MAX; > ... > @Test > @IR(counts = { IRNode.STORE_VECTOR_MASKED, ">= 1", IRNode.LOAD_VECTOR_MASKED, ">= 1" }, applyIfCPUFeatureOr = {"avx512", "true", "sve", "true"}) > public static ... 
Gui Cao has updated the pull request incrementally with one additional commit since the last revision: Fix for Damon comment ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19473/files - new: https://git.openjdk.org/jdk/pull/19473/files/cd53bab4..1235a453 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19473&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19473&range=00-01 Stats: 39 lines in 1 file changed: 1 ins; 3 del; 35 mod Patch: https://git.openjdk.org/jdk/pull/19473.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19473/head:pull/19473 PR: https://git.openjdk.org/jdk/pull/19473 From gcao at openjdk.org Wed Jun 5 08:35:58 2024 From: gcao at openjdk.org (Gui Cao) Date: Wed, 5 Jun 2024 08:35:58 GMT Subject: RFR: 8333248: VectorGatherMaskFoldingTest.java failed when maximum vector bits is 64 In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 17:51:27 GMT, Damon Fenacci wrote: > > @dafedafe Hi Damon, Maybe you can take a look at this small change? Thanks. > > @zifeihan you're right that the case with a vector length of 64 bits is not covered correctly. > > I wouldn't fix it using different species depending on the length though. The Java Vector API explicitly says that not all shapes are supported by all platforms, and one should choose shape-agnostic code. Also, the IR framework uses the maximum vector size possible (if not explicitly set in the rules). I'd rather make sure that masks are not the same for lengths of 1 (for long and double vectors): something like this `longMask[i] = L_SPECIES.length() > 1 && i % 2 == 0;` would probably do. For long and double indices this cannot be done with length 1. So an option would be to change the asserts to add a check for the length, e.g. `Asserts.assertFalse(L_SPECIES.length() != 1 && res.equals(res2));`. There is always the alternative of not running long and double vector tests with max vector lengths of 64. > Hi Damon, I have done the fix. Can you take a look again? 
Thanks ------------- PR Comment: https://git.openjdk.org/jdk/pull/19473#issuecomment-2149208164 From fgao at openjdk.org Wed Jun 5 08:40:12 2024 From: fgao at openjdk.org (Fei Gao) Date: Wed, 5 Jun 2024 08:40:12 GMT Subject: RFR: 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug" [v3] In-Reply-To: References: <16J-lJ2AceGTVcRWBcP15yKcwO-1IA1XsngyOuNjf7k=.0776f081-ae2c-4279-87cf-d909806c2bc4@github.com> Message-ID: On Fri, 31 May 2024 15:25:16 GMT, Andrew Haley wrote: > > But `1030` can't be encoded as `base` + `offset` mode > > Why not? See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/LDR--immediate---Load-Register--immediate--?lang=en. For `long` type, signed immediate byte offset should be in range `-256` to `255` and positive immediate byte offset: a multiple of `8` in the range `0` to `32760`. ------------- PR Comment: https://git.openjdk.org/jdk/pull/16991#issuecomment-2149217082 From thartmann at openjdk.org Wed Jun 5 09:11:00 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Wed, 5 Jun 2024 09:11:00 GMT Subject: RFR: 8331736: C2: Live Node limit exceeded limit after JDK-8316991 In-Reply-To: References: Message-ID: <2YQNi-e1oREuM9Mth4aGwsPhbfWxsDcosqhuB99BhOY=.2b0927ab-8a17-4403-9255-56a8eea20c88@github.com> On Tue, 4 Jun 2024 16:32:15 GMT, Cesar Soares Lucas wrote: > Please, consider this patch to interrupt execution of `split_unique_types` (SUT) when number of live nodes reaches 3/4 of `max_live_nodes`. > > The included test case reproduces the problem. The number of live nodes before running phase 3 of SUT is ~20k, after processing about 750 mergemem nodes the number of live nodes is over 70k. This problem was first encountered when running an old `.jar` file that was created before `invokedynamic` optimizations - that's why I disable string optimizations in the test case. 
The test case is strongly based on [the method that was originally triggering the problem](https://github.com/Unidata/netcdf-java/blob/c782ef80ab54a09befd6d5065c6baeed54949222/cdm/radial/src/main/java/ucar/nc2/iosp/nids/Nidsheader.java#L2174). > > Tested on Linux, Win, Mac x86_64 tier1-3 and GHA. Looks good to me. We have similar logic for split-if (see `must_throttle_split_if`). src/hotspot/share/opto/escape.cpp line 4807: > 4805: _compile->record_failure(C2Compiler::retry_no_escape_analysis()); > 4806: } > 4807: return ; Suggestion: return; test/hotspot/jtreg/compiler/c2/TestScalarReplacementMaxLiveNodes.java line 41: > 39: * -XX:DesiredMethodLimit=100000 > 40: * compiler.c2.TestScalarReplacementMaxLiveNodes > 41: */ Please add a run without any arguments (`@run main/othervm ...`) so that we get some additional coverage with other VM args through this test. ------------- Marked as reviewed by thartmann (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19545#pullrequestreview-2098456843 PR Review Comment: https://git.openjdk.org/jdk/pull/19545#discussion_r1627291547 PR Review Comment: https://git.openjdk.org/jdk/pull/19545#discussion_r1627296582 From thartmann at openjdk.org Wed Jun 5 09:14:56 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Wed, 5 Jun 2024 09:14:56 GMT Subject: RFR: 8331736: C2: Live Node limit exceeded limit after JDK-8316991 In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 16:32:15 GMT, Cesar Soares Lucas wrote: > Please, consider this patch to interrupt execution of `split_unique_types` (SUT) when number of live nodes reaches 3/4 of `max_live_nodes`. > > The included test case reproduces the problem. The number of live nodes before running phase 3 of SUT is ~20k, after processing about 750 mergemem nodes the number of live nodes is over 70k. 
This problem was first encountered when running an old `.jar` file that was created before `invokedynamic` optimizations - that's why I disable string optimizations in the test case. The test case is strongly based on [the method that was originally triggering the problem](https://github.com/Unidata/netcdf-java/blob/c782ef80ab54a09befd6d5065c6baeed54949222/cdm/radial/src/main/java/ucar/nc2/iosp/nids/Nidsheader.java#L2174). > > Tested on Linux, Win, Mac x86_64 tier1-3 and GHA. test/hotspot/jtreg/compiler/c2/TestScalarReplacementMaxLiveNodes.java line 29: > 27: * @summary Check that C2 does not exceed max live node limit when splitting unique types of large allocation merge. > 28: * @library /test/lib / > 29: * @requires vm.debug & vm.flagless & vm.compiler2.enabled & vm.opt.final.EliminateAllocations Suggestion: * @requires vm.debug & vm.compiler2.enabled I think it's best to not overly restrict the test to get some more coverage from different flag combinations in the CI. test/hotspot/jtreg/compiler/c2/TestScalarReplacementMaxLiveNodes.java line 31: > 29: * @requires vm.debug & vm.flagless & vm.compiler2.enabled & vm.opt.final.EliminateAllocations > 30: * @compile -XDstringConcat=inline TestScalarReplacementMaxLiveNodes.java > 31: * @run main/othervm -Xbatch -server -XX:-OptimizeStringConcat -XX:-TieredCompilation Suggestion: * @run main/othervm -Xbatch -XX:-OptimizeStringConcat -XX:-TieredCompilation ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19545#discussion_r1627326981 PR Review Comment: https://git.openjdk.org/jdk/pull/19545#discussion_r1627324105 From chagedorn at openjdk.org Wed Jun 5 09:35:57 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 09:35:57 GMT Subject: RFR: 8331736: C2: Live Node limit exceeded limit after JDK-8316991 In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 16:32:15 GMT, Cesar Soares Lucas wrote: > Please, consider this patch to interrupt execution of `split_unique_types` (SUT) 
when number of live nodes reaches 3/4 of `max_live_nodes`. > > The included test case reproduces the problem. The number of live nodes before running phase 3 of SUT is ~20k, after processing about 750 mergemem nodes the number of live nodes is over 70k. This problem was first encountered when running an old `.jar` file that was created before `invokedynamic` optimizations - that's why I disable string optimizations in the test case. The test case is strongly based on [the method that was originally triggering the problem](https://github.com/Unidata/netcdf-java/blob/c782ef80ab54a09befd6d5065c6baeed54949222/cdm/radial/src/main/java/ucar/nc2/iosp/nids/Nidsheader.java#L2174). > > Tested on Linux, Win, Mac x86_64 tier1-3 and GHA. Apart from Tobias' suggestions, it looks good to me, too! src/hotspot/share/opto/escape.cpp line 4799: > 4797: // If we have crossed the 3/4 point of max node limit it's too risky > 4798: // to continue with EA/SR because we might hit the max node limit. > 4799: if (_compile->live_nodes() >= _compile->max_node_limit()*0.75) { Suggestion: if (_compile->live_nodes() >= _compile->max_node_limit() * 0.75) { ------------- Marked as reviewed by chagedorn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19545#pullrequestreview-2098553417 PR Review Comment: https://git.openjdk.org/jdk/pull/19545#discussion_r1627363454 From chagedorn at openjdk.org Wed Jun 5 11:44:11 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 11:44:11 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v2] In-Reply-To: References: Message-ID: > A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. > > #### Idea of Partial Peeling > > Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. 
It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). > > #### Partial Peeling with Unsigned Test > > However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 > > #### Requirements for Using an Unsigned Test > > The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: > - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. > - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. > > #### The Requirements Are Currently Broken > > This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 > > #### Why Are the Requirements Broken? > > The reason is that > > i >=u limit > > can only be converted into the two signed comparisons > > i < 0 || i >= limit > > if `limit` is non-negative (i.e. 
`limit >= 0`): > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 > > This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. > > #### Fixing the Broken Requirements > To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of ... Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: Updated comments, improved proofs, improved motivation and idea at method comment ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19522/files - new: https://git.openjdk.org/jdk/pull/19522/files/0ea3bacf..33d89249 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19522&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19522&range=00-01 Stats: 133 lines in 1 file changed: 71 ins; 4 del; 58 mod Patch: https://git.openjdk.org/jdk/pull/19522.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19522/head:pull/19522 PR: https://git.openjdk.org/jdk/pull/19522 From thartmann at openjdk.org Wed Jun 5 11:51:56 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Wed, 5 Jun 2024 11:51:56 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v2] In-Reply-To: References: Message-ID: <005N1ZTBVWU4dmrioB10csGVYAlmciziGddOQSRxUd0=.53840208-f71d-4a45-bc05-4a21e532bfff@github.com> On Wed, 5 Jun 2024 11:44:11 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). 
>> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. >> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e. 
`limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Updated comments, improved proofs, improved motivation and idea at method comment Nice! That looks good to me. ------------- Marked as reviewed by thartmann (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19522#pullrequestreview-2098903147 From chagedorn at openjdk.org Wed Jun 5 12:26:57 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 12:26:57 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 11:44:11 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). >> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. 
The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. >> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e. `limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... 
> > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Updated comments, improved proofs, improved motivation and idea at method comment Thanks Tobias for your review! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19522#issuecomment-2149704669 From rehn at openjdk.org Wed Jun 5 12:41:08 2024 From: rehn at openjdk.org (Robbin Ehn) Date: Wed, 5 Jun 2024 12:41:08 GMT Subject: RFR: 8333649: Allow different NativeCall encodings Message-ID: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> Hi all, please consider! We want to have different selectable NativeCalls. These are not the same size, so shared code should query instead of using the enum directly. Sanity build and tested RV/x86, hoping GHA will catch anything else. Thanks, Robbin ------------- Commit messages: - Query through static method size of NativeCall in shared code Changes: https://git.openjdk.org/jdk/pull/19556/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19556&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333649 Stats: 12 lines in 9 files changed: 9 ins; 0 del; 3 mod Patch: https://git.openjdk.org/jdk/pull/19556.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19556/head:pull/19556 PR: https://git.openjdk.org/jdk/pull/19556 From epeter at openjdk.org Wed Jun 5 12:41:02 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Wed, 5 Jun 2024 12:41:02 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 11:44:11 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later.
It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). >> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. >> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e. 
`limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Updated comments, improved proofs, improved motivation and idea at method comment src/hotspot/share/opto/loopopts.cpp line 3057: > 3055: // dummy-if | > 3056: // / | | > 3057: // other | | what is this? ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627682645 From epeter at openjdk.org Wed Jun 5 12:49:59 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Wed, 5 Jun 2024 12:49:59 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v2] In-Reply-To: References: Message-ID: <5HUv4gOW2RVa4ipvi1tXg4ytWi3oR4ITEZf_MTaaStc=.1618d947-c787-4415-b6d1-a5c527264247@github.com> On Wed, 5 Jun 2024 11:44:11 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). >> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. 
The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. >> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e. `limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... 
> > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Updated comments, improved proofs, improved motivation and idea at method comment src/hotspot/share/opto/loopopts.cpp line 3128: > 3126: } > 3127: > 3128: // From (SLE-full), we can extract a single signed loop exit condition depending on the stride: Suggestion: // We prove below that we can extract a single signed loop exit condition from (SLE-full), depending on the stride: src/hotspot/share/opto/loopopts.cpp line 3132: > 3130: // i < 0 (SLE-negative) > 3131: // stride > 0: > 3132: // i >= limit (SLE-positive) Suggestion: // stride < 0: // i < 0 (SLE = SLE-negative) // stride > 0: // i >= limit (SLE = SLE-positive) That would give you a definition of (SLE). ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627691930 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627693974 From epeter at openjdk.org Wed Jun 5 12:56:59 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Wed, 5 Jun 2024 12:56:59 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 11:44:11 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). >> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. 
The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. >> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e. `limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... 
> > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Updated comments, improved proofs, improved motivation and idea at method comment src/hotspot/share/opto/loopopts.cpp line 3155: > 3153: // which is the unsigned loop exit condition (ULE). > 3154: // - stride < 0: > 3155: // i < 0 // (SLE-negative) Suggestion: // i >= limit // (SLE = SLE-positive) // i >= limit >= 0 // (COND) // i >=u limit >= 0 // (LEMMA) // which is the unsigned loop exit condition (ULE). // - stride < 0: // i < 0 // (SLE = SLE-negative) src/hotspot/share/opto/loopopts.cpp line 3171: > 3169: // > 3170: // > 3171: // i >= limit (SLE) Suggestion: // i >= limit (SLE-positive) // Loop: // i >=u limit (ULE) // // // i >= limit (SLE-positive) test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java line 270: > 268: check(MIN_VALUE + 2001); // MAX_VALUE + 2002 iterations > 269: testWhileLTDecr(MIN_VALUE + 2000, MIN_VALUE + 2001); > 270: check(MIN_VALUE + 2001); // MAX_VALUE + 2002 iterations Could you also add a randomized input test here, that plays close to the boundaries? Just to make sure we would catch things like some off-by one errors. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627700229 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627700805 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627704332 From epeter at openjdk.org Wed Jun 5 13:16:16 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Wed, 5 Jun 2024 13:16:16 GMT Subject: RFR: 8333647: C2 SuperWord: some additional PopulateIndex tests Message-ID: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> When I did the deep refactoring in https://github.com/openjdk/jdk/pull/19261, I wanted some more tests for `PopulateIndex`. I push them separately to keep the other RFE smaller. 
I filed a follow-up RFE for some cases that do not vectorize: https://bugs.openjdk.org/browse/JDK-8332878 ------------- Commit messages: - 8333647 Changes: https://git.openjdk.org/jdk/pull/19558/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19558&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333647 Stats: 97 lines in 2 files changed: 97 ins; 0 del; 0 mod Patch: https://git.openjdk.org/jdk/pull/19558.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19558/head:pull/19558 PR: https://git.openjdk.org/jdk/pull/19558 From chagedorn at openjdk.org Wed Jun 5 13:40:06 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 13:40:06 GMT Subject: RFR: 8333644: C2: assert(is_Bool()) failed: invalid node class: Phi Message-ID: In the patch of [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386), I've added a test case that took the code path in `clone_loop_handle_data_uses()` with the new `OpaqueInitializedAssertionPredicateNode`. So, it proved that we should also add a case for `OpaqueInitializedAssertionPredicate` - yet somehow, I forgot to do that. Unfortunately, I could not come up with a failing test if we do not handle this case separately and the mistake went unnoticed. But the fuzzer has now found such a case where we crash when creating the Mach graph because we have an `If` with a `Phi` input instead of a `Bool`. This happens exactly because of not handling `OpaqueInitializedAssertionPredicate` in `clone_loop_handle_data_uses()`. 
We have an outside use of a `4843 Bool` (`4193 RangeCheck`) which is also an input into a `OpaqueInitializedAssertionPredicate` node: ![image](https://github.com/openjdk/jdk/assets/17833009/7f43d987-6e29-449e-8392-58e76e111e80) We should now clone this `Bool/Cmp` down to avoid having a `Phi` between the `OpaqueInitializedAssertionPredicate` and the `Bool` node which currently wrongly happens: ![image](https://github.com/openjdk/jdk/assets/17833009/a10554a0-d48a-406c-a510-fef9666c45af) and we crash later. With this simple patch, this is being avoided. The good thing is that we now have a test that makes sure that this new condition is properly tested. Thanks, Christian ------------- Commit messages: - 8332920: C2: assert(is_Bool()) failed: invalid node class: Phi Changes: https://git.openjdk.org/jdk/pull/19561/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19561&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333644 Stats: 52 lines in 2 files changed: 44 ins; 1 del; 7 mod Patch: https://git.openjdk.org/jdk/pull/19561.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19561/head:pull/19561 PR: https://git.openjdk.org/jdk/pull/19561 From chagedorn at openjdk.org Wed Jun 5 13:40:06 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 13:40:06 GMT Subject: RFR: 8333644: C2: assert(is_Bool()) failed: invalid node class: Phi In-Reply-To: References: Message-ID: <_wpg0zz8ah52IR2iTFZG2M52mcH0-86vCIp3sVf1e_8=.22417f41-414f-4d07-88ab-3ee2b4667625@github.com> On Wed, 5 Jun 2024 13:33:27 GMT, Christian Hagedorn wrote: > In the patch of [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386), I've added a test case that took the code path in `clone_loop_handle_data_uses()` with the new `OpaqueInitializedAssertionPredicateNode`. So, it proved that we should also add a case for `OpaqueInitializedAssertionPredicate` - yet somehow, I forgot to do that. 
Unfortunately, I could not come up with a failing test if we do not handle this case separately and the mistake went unnoticed. > > But the fuzzer has now found such a case where we crash when creating the Mach graph because we have an `If` with a `Phi` input instead of a `Bool`. This happens exactly because of not handling `OpaqueInitializedAssertionPredicate` in `clone_loop_handle_data_uses()`. > > We have an outside use of a `4843 Bool` (`4193 RangeCheck`) which is also an input into a `OpaqueInitializedAssertionPredicate` node: > > ![image](https://github.com/openjdk/jdk/assets/17833009/7f43d987-6e29-449e-8392-58e76e111e80) > > We should now clone this `Bool/Cmp` down to avoid having a `Phi` between the `OpaqueInitializedAssertionPredicate` and > the `Bool` node which currently wrongly happens: > > ![image](https://github.com/openjdk/jdk/assets/17833009/a10554a0-d48a-406c-a510-fef9666c45af) > > and we crash later. With this simple patch, this is being avoided. > > The good thing is that we now have a test that makes sure that this new condition is properly tested. > > Thanks, > Christian src/hotspot/share/opto/loopopts.cpp line 2167: > 2165: // to avoid such a phi in between. > 2166: // For example, it is unexpected that there is a Phi between an > 2167: // AllocateArray node and its ValidLengthTest input that could cause Was outdated (we already removed `Opaque1` nodes for predicates some time ago) and we also handle more nodes now and not only `AllocateArray`. Improved that a little bit. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19561#discussion_r1627801684 From aph at openjdk.org Wed Jun 5 13:41:03 2024 From: aph at openjdk.org (Andrew Haley) Date: Wed, 5 Jun 2024 13:41:03 GMT Subject: RFR: 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug" [v3] In-Reply-To: References: <16J-lJ2AceGTVcRWBcP15yKcwO-1IA1XsngyOuNjf7k=.0776f081-ae2c-4279-87cf-d909806c2bc4@github.com> Message-ID: <2c48X45JWEQrzKj_rwUsCeBV_c8Ee7r80urMm8haK1Q=.fcea07d4-7635-4f83-b8d6-882b210ba84d@github.com> On Wed, 5 Jun 2024 08:37:34 GMT, Fei Gao wrote: > > > But `1030` can't be encoded as `base` + `offset` mode > > > > > > Why not? > > See https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/LDR--immediate---Load-Register--immediate--?lang=en. For `long` type, signed immediate byte offset should be in range `-256` to `255` and positive immediate byte offset: a multiple of `8` in the range `0` to `32760`. Oh, I see. I really don't think that matters. Current AArch64 processors often add an extra cycle anyway for offsetted addresses, so all we'd add is a slight code size expansion for misaligned loads. ------------- PR Comment: https://git.openjdk.org/jdk/pull/16991#issuecomment-2149970058 From chagedorn at openjdk.org Wed Jun 5 13:49:14 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 13:49:14 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v3] In-Reply-To: References: Message-ID: > A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. > > #### Idea of Partial Peeling > > Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). 
> > #### Partial Peeling with Unsigned Test > > However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 > > #### Requirements for Using an Unsigned Test > > The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: > - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. > - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. > > #### The Requirements Are Currently Broken > > This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 > > #### Why Are the Requirements Broken? > > The reason is that > > i >=u limit > > can only be converted into the two signed comparisons > > i < 0 || i >= limit > > if `limit` is non-negative (i.e. `limit >= 0`): > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 > > This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. 
> > #### Fixing the Broken Requirements > To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of ... Christian Hagedorn has updated the pull request incrementally with two additional commits since the last revision: - Apply suggestions from code review Co-authored-by: Emanuel Peter Co-authored-by: Tobias Hartmann - Fix test timeout with DeoptimizeALot and fix graph comment ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19522/files - new: https://git.openjdk.org/jdk/pull/19522/files/33d89249..21201f4e Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19522&range=02 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19522&range=01-02 Stats: 10 lines in 2 files changed: 0 ins; 0 del; 10 mod Patch: https://git.openjdk.org/jdk/pull/19522.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19522/head:pull/19522 PR: https://git.openjdk.org/jdk/pull/19522 From thartmann at openjdk.org Wed Jun 5 14:26:02 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Wed, 5 Jun 2024 14:26:02 GMT Subject: RFR: 8333644: C2: assert(is_Bool()) failed: invalid node class: Phi In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 13:33:27 GMT, Christian Hagedorn wrote: > In the patch of [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386), I've added a test case that took the code path in `clone_loop_handle_data_uses()` with the new `OpaqueInitializedAssertionPredicateNode`. So, it proved that we should also add a case for `OpaqueInitializedAssertionPredicate` - yet somehow, I forgot to do that. Unfortunately, I could not come up with a failing test if we do not handle this case separately and the mistake went unnoticed. > > But the fuzzer has now found such a case where we crash when creating the Mach graph because we have an `If` with a `Phi` input instead of a `Bool`. This happens exactly because of not handling `OpaqueInitializedAssertionPredicate` in `clone_loop_handle_data_uses()`. 
> > We have an outside use of a `4843 Bool` (`4193 RangeCheck`) which is also an input into a `OpaqueInitializedAssertionPredicate` node: > > ![image](https://github.com/openjdk/jdk/assets/17833009/7f43d987-6e29-449e-8392-58e76e111e80) > > We should now clone this `Bool/Cmp` down to avoid having a `Phi` between the `OpaqueInitializedAssertionPredicate` and > the `Bool` node which currently wrongly happens: > > ![image](https://github.com/openjdk/jdk/assets/17833009/a10554a0-d48a-406c-a510-fef9666c45af) > > and we crash later. With this simple patch, this is being avoided. > > The good thing is that we now have a test that makes sure that this new condition is properly tested. > > Thanks, > Christian Looks good to me. test/hotspot/jtreg/compiler/predicates/assertion/TestOpaqueInitializedAssertionPredicateNode.java line 62: > 60: * @modules java.base/jdk.internal.misc:+open > 61: * @summary Test that using OpaqueInitializedAssertionPredicate for Initialized Assertion Predicates instead of Opaque4 > 62: * nodes also work with clone_loop_handle_data_uses() which missed a case before. Suggestion: * nodes also works with clone_loop_handle_data_uses() which missed a case before. ------------- Marked as reviewed by thartmann (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19561#pullrequestreview-2099375442 PR Review Comment: https://git.openjdk.org/jdk/pull/19561#discussion_r1627889142 From chagedorn at openjdk.org Wed Jun 5 14:48:12 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 14:48:12 GMT Subject: RFR: 8333644: C2: assert(is_Bool()) failed: invalid node class: Phi [v2] In-Reply-To: References: Message-ID: > In the patch of [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386), I've added a test case that took the code path in `clone_loop_handle_data_uses()` with the new `OpaqueInitializedAssertionPredicateNode`. 
So, it proved that we should also add a case for `OpaqueInitializedAssertionPredicate` - yet somehow, I forgot to do that. Unfortunately, I could not come up with a failing test if we do not handle this case separately and the mistake went unnoticed. > > But the fuzzer has now found such a case where we crash when creating the Mach graph because we have an `If` with a `Phi` input instead of a `Bool`. This happens exactly because of not handling `OpaqueInitializedAssertionPredicate` in `clone_loop_handle_data_uses()`. > > We have an outside use of a `4843 Bool` (`4193 RangeCheck`) which is also an input into a `OpaqueInitializedAssertionPredicate` node: > > ![image](https://github.com/openjdk/jdk/assets/17833009/7f43d987-6e29-449e-8392-58e76e111e80) > > We should now clone this `Bool/Cmp` down to avoid having a `Phi` between the `OpaqueInitializedAssertionPredicate` and > the `Bool` node which currently wrongly happens: > > ![image](https://github.com/openjdk/jdk/assets/17833009/a10554a0-d48a-406c-a510-fef9666c45af) > > and we crash later. With this simple patch, this is being avoided. > > The good thing is that we now have a test that makes sure that this new condition is properly tested. 
> > Thanks, > Christian Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: Update test/hotspot/jtreg/compiler/predicates/assertion/TestOpaqueInitializedAssertionPredicateNode.java Co-authored-by: Tobias Hartmann ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19561/files - new: https://git.openjdk.org/jdk/pull/19561/files/68215366..72e1d663 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19561&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19561&range=00-01 Stats: 1 line in 1 file changed: 0 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19561.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19561/head:pull/19561 PR: https://git.openjdk.org/jdk/pull/19561 From chagedorn at openjdk.org Wed Jun 5 14:48:12 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 14:48:12 GMT Subject: RFR: 8333644: C2: assert(is_Bool()) failed: invalid node class: Phi In-Reply-To: References: Message-ID: <93cI05-075e0iticgY3_2kvR92_Y0SmYcHNbrTP_LFw=.aaad9ff1-0fa7-4877-8755-1dd7bbd953cd@github.com> On Wed, 5 Jun 2024 13:33:27 GMT, Christian Hagedorn wrote: > In the patch of [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386), I've added a test case that took the code path in `clone_loop_handle_data_uses()` with the new `OpaqueInitializedAssertionPredicateNode`. So, it proved that we should also add a case for `OpaqueInitializedAssertionPredicate` - yet somehow, I forgot to do that. Unfortunately, I could not come up with a failing test if we do not handle this case separately and the mistake went unnoticed. > > But the fuzzer has now found such a case where we crash when creating the Mach graph because we have an `If` with a `Phi` input instead of a `Bool`. This happens exactly because of not handling `OpaqueInitializedAssertionPredicate` in `clone_loop_handle_data_uses()`. 
> > We have an outside use of a `4843 Bool` (`4193 RangeCheck`) which is also an input into a `OpaqueInitializedAssertionPredicate` node: > > ![image](https://github.com/openjdk/jdk/assets/17833009/7f43d987-6e29-449e-8392-58e76e111e80) > > We should now clone this `Bool/Cmp` down to avoid having a `Phi` between the `OpaqueInitializedAssertionPredicate` and > the `Bool` node which currently wrongly happens: > > ![image](https://github.com/openjdk/jdk/assets/17833009/a10554a0-d48a-406c-a510-fef9666c45af) > > and we crash later. With this simple patch, this is being avoided. > > The good thing is that we now have a test that makes sure that this new condition is properly tested. > > Thanks, > Christian Thanks Tobias for your review! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19561#issuecomment-2150242030 From dfenacci at openjdk.org Wed Jun 5 14:52:01 2024 From: dfenacci at openjdk.org (Damon Fenacci) Date: Wed, 5 Jun 2024 14:52:01 GMT Subject: RFR: 8333248: VectorGatherMaskFoldingTest.java failed when maximum vector bits is 64 [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 08:22:25 GMT, Gui Cao wrote: >> Hi, VectorGatherMaskFoldingTest.java Test fails when max vector bits is 64, when max vector bits is 64, LongVector.SPECIES_MAX.length() and DoubleVector.SPECIES_MAX.length() is 1. >> >> We can reproduce this problem in two ways: >> 1. We can use riscv without rvv1.0 board to reproduce this problem >> 2. Run VectorGatherMaskFoldingTest.java on aarch64 client mode without `-XX:+IncrementalInlineForceCleanup` Option, the `-XX:+IncrementalInlineForceCleanup` is C2 Option, so we need to remove this Option from the VectorGatherMaskFoldingTest.main method. error message: >> >> Base Test: @Test testDoubleVectorStoreLoadMaskedVector: >> compiler.lib.ir_framework.shared.TestRunException: There was an error while invoking @Test method public static void compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(). 
Target: null. Arguments: >> at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:84) >> at compiler.lib.ir_framework.test.BaseTest.invokeTest(BaseTest.java:71) >> at compiler.lib.ir_framework.test.AbstractTest.run(AbstractTest.java:98) >> at compiler.lib.ir_framework.test.TestVM.runTests(TestVM.java:861) >> at compiler.lib.ir_framework.test.TestVM.start(TestVM.java:252) >> at compiler.lib.ir_framework.test.TestVM.main(TestVM.java:165) >> Caused by: java.lang.reflect.InvocationTargetException >> at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:118) >> at java.base/java.lang.reflect.Method.invoke(Method.java:580) >> at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:80) >> ... 5 more >> Caused by: java.lang.RuntimeException: assertNotEquals: expected [1.0] to not equal [1.0] >> at jdk.test.lib.Asserts.fail(Asserts.java:691) >> at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:451) >> at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:435) >> at compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(VectorGatherMaskFoldingTest.java:1089) >> at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103) >> ... 7 more >> >> >> For example, the following method will be failed: >> >> private static final VectorSpecies L_SPECIES = LongVector.SPECIES_MAX; >> private static final VectorSpecies D_SPECIES = DoubleVector.SPECIES_MAX; >> ... >> @Test >> @IR(counts = { IRNode.STORE_VECTOR_MASKED, ">= 1", IRNode.LOAD_VECTOR_MASKED, ">= 1" }, apply... 
> > Gui Cao has updated the pull request incrementally with one additional commit since the last revision: > > Fix for Damon comment On the other hand you might need to add the check here: https://github.com/openjdk/jdk/blob/1235a453eef4a838fe07009b0c5e8a962b527bb4/test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java#L966 and here: https://github.com/openjdk/jdk/blob/1235a453eef4a838fe07009b0c5e8a962b527bb4/test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java#L1098 test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java line 163: > 161: LongVector res = LongVector.fromArray(L_SPECIES, longArray, 0, longIndices, 0); > 162: LongVector res2 = LongVector.fromArray(L_SPECIES, longArray2, 0, longIndices, 0); > 163: Asserts.assertFalse(L_SPECIES.length() != 1 && res.equals(res2)); I think you don't need this change (`longArray` and `longArray2` are different even with length 1). test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java line 227: > 225: LongVector res = LongVector.fromArray(L_SPECIES, longArray, 0, longIndices, 0, longVectorMask); > 226: LongVector res2 = LongVector.fromArray(L_SPECIES, longArray, 0, longIndices, 0, longVectorMask2); > 227: Asserts.assertFalse(L_SPECIES.length() != 1 && res.equals(res2)); You don't need this change either (`longVectorMask` and `longVectorMask2` are different even with length 1). test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java line 329: > 327: DoubleVector res = DoubleVector.fromArray(D_SPECIES, doubleArray, 0, doubleIndices, 0); > 328: DoubleVector res2 = DoubleVector.fromArray(D_SPECIES, doubleArray2, 0, doubleIndices, 0); > 329: Asserts.assertFalse(D_SPECIES.length() != 1 && res.equals(res2)); Same here.
test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java line 393: > 391: DoubleVector res = DoubleVector.fromArray(D_SPECIES, doubleArray, 0, doubleIndices, 0, doubleVectorMask); > 392: DoubleVector res2 = DoubleVector.fromArray(D_SPECIES, doubleArray, 0, doubleIndices, 0, doubleVectorMask2); > 393: Asserts.assertFalse(D_SPECIES.length() != 1 && res.equals(res2)); Same here. test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java line 579: > 577: longVector.intoArray(res, 0, longIndices, 0, longVectorMask); > 578: longVector.intoArray(res2, 0, longIndices, 0, longVectorMask2); > 579: Asserts.assertFalse(L_SPECIES.length() != 1 && Arrays.equals(res, res2)); Same here. test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java line 705: > 703: doubleVector.intoArray(res, 0, doubleIndices, 0); > 704: doubleVector2.intoArray(res2, 0, doubleIndices, 0); > 705: Asserts.assertFalse(D_SPECIES.length() != 1 && Arrays.equals(res, res2)); Same here. test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java line 785: > 783: doubleVector.intoArray(res, 0, doubleIndices, 0, doubleVectorMask); > 784: doubleVector.intoArray(res2, 0, doubleIndices, 0, doubleVectorMask2); > 785: Asserts.assertFalse(D_SPECIES.length() != 1 && Arrays.equals(res, res2)); Same here. 
------------- PR Review: https://git.openjdk.org/jdk/pull/19473#pullrequestreview-2099417323 PR Review Comment: https://git.openjdk.org/jdk/pull/19473#discussion_r1627914309 PR Review Comment: https://git.openjdk.org/jdk/pull/19473#discussion_r1627914948 PR Review Comment: https://git.openjdk.org/jdk/pull/19473#discussion_r1627915692 PR Review Comment: https://git.openjdk.org/jdk/pull/19473#discussion_r1627916442 PR Review Comment: https://git.openjdk.org/jdk/pull/19473#discussion_r1627917053 PR Review Comment: https://git.openjdk.org/jdk/pull/19473#discussion_r1627917544 PR Review Comment: https://git.openjdk.org/jdk/pull/19473#discussion_r1627918103 From galder at openjdk.org Wed Jun 5 15:14:58 2024 From: galder at openjdk.org (Galder Zamarreño) Date: Wed, 5 Jun 2024 15:14:58 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 08:43:04 GMT, Aleksey Shipilev wrote: > All right, that looks reasonable. > > I am a bit queasy on conditionally removing the `StoreStore` barrier from the allocation path, given that it also protects the object metadata. An accidentally missing barrier would probably lead to VM crash, that is in the best case. Current code paths do not seem to be affected by this, but there are also no guardrails that would protect us from making such a mistake in the future: someone adds `new NewTypeArray` somewhere, and forgets a trailing barrier? > > What would be the cost of still emitting (an excess for cloning path) `StoreStore` in `C1_MacroAssembler::allocate_array`? Would that cost still matter, given the performance improvement we get with C1 clone intrinsic? Let me make sure I understand this right: you are suggesting removing the `if (zero_array)` branch and instead always emit storestore barrier, as well as the storestore barrier at the end of `append_alloc_array_copy`?
------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2150323145 From chagedorn at openjdk.org Wed Jun 5 15:18:12 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 15:18:12 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v4] In-Reply-To: References: Message-ID: > A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. > > #### Idea of Partial Peeling > > Partial Peeling rotates a loop with the hope of being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). > > #### Partial Peeling with Unsigned Test > > However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 > > #### Requirements for Using an Unsigned Test > > The Signed and Unsigned Loop Exit Test do not need to have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: > - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. > - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute.
> > #### The Requirements Are Currently Broken > > This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 > > #### Why Are the Requirements Broken? > > The reason is that > > i >=u limit > > can only be converted into the two signed comparisons > > i < 0 || i >= limit > > if `limit` is non-negative (i.e. `limit >= 0`): > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 > > This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. > > #### Fixing the Broken Requirements > To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of ... 
Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: Add randomized tests ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19522/files - new: https://git.openjdk.org/jdk/pull/19522/files/21201f4e..e0a18f06 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19522&range=03 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19522&range=02-03 Stats: 48 lines in 1 file changed: 48 ins; 0 del; 0 mod Patch: https://git.openjdk.org/jdk/pull/19522.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19522/head:pull/19522 PR: https://git.openjdk.org/jdk/pull/19522 From chagedorn at openjdk.org Wed Jun 5 15:18:13 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 15:18:13 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v2] In-Reply-To: References: Message-ID: <5juQ8V3PhrTkONFzXJZV4VRF9t97MZkefLQ7z2wPMtI=.b2531424-7ce6-4e87-b1a7-7d873ed0df38@github.com> On Wed, 5 Jun 2024 12:38:46 GMT, Emanuel Peter wrote: >> Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: >> >> Updated comments, improved proofs, improved motivation and idea at method comment > > src/hotspot/share/opto/loopopts.cpp line 3057: > >> 3055: // dummy-if | >> 3056: // / | | >> 3057: // other | | > > what is this? This dummy-if is created with `insert_region_before_proj()` such that we can have a region between the exit projection (`exit-proj`) of the original unsigned loop exit test and the If node while keeping both the If and the exit-proj. 
Also see: https://github.com/openjdk/jdk/blob/cbb6747e6b9ce7e2b9e0ffb0a1f9499f7e0e13b0/src/hotspot/share/opto/loopopts.cpp#L2934-L2960 > test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java line 270: > >> 268: check(MIN_VALUE + 2001); // MAX_VALUE + 2002 iterations >> 269: testWhileLTDecr(MIN_VALUE + 2000, MIN_VALUE + 2001); >> 270: check(MIN_VALUE + 2001); // MAX_VALUE + 2002 iterations > > Could you also add a randomized input test here, that plays close to the boundaries? Just to make sure we would catch things like some off-by-one errors. Good idea, I've added some random tests and computed the number of iterations that I expect. I only did it for the interesting cases. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627811507 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627977241 From chagedorn at openjdk.org Wed Jun 5 15:18:12 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 15:18:12 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v4] In-Reply-To: <3bxGw3LLx8Tde1JAFK7-oizimUl1S7x9j-VnHDJYnZM=.37437a3c-ce88-41c9-b037-9bd0652ca587@github.com> References: <3bxGw3LLx8Tde1JAFK7-oizimUl1S7x9j-VnHDJYnZM=.37437a3c-ce88-41c9-b037-9bd0652ca587@github.com> Message-ID: On Wed, 5 Jun 2024 07:04:25 GMT, Emanuel Peter wrote: >> Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: >> >> Add randomized tests > > src/hotspot/share/opto/loopopts.cpp line 3052: > >> 3050: // limit >= 0 (COND) >> 3051: // then the unsigned loop exit condition is equivalent to the signed loop exit condition >> 3052: // i < 0 || i >= limit > > Could we come up with an alternative equation if `limit < 0`? And maybe even a combined condition? Not sure if that is helpful, but I'd like to think about it. Could also be a follow-up RFE.
I have not thought about it, yet, but I also have a feeling that we might be able to do better. Since this is a P2 bug fix, I propose to do that separately in an RFE as you suggested. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1627284133 From epeter at openjdk.org Wed Jun 5 15:24:57 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Wed, 5 Jun 2024 15:24:57 GMT Subject: RFR: 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes [v3] In-Reply-To: References: Message-ID: <_pjF1qZQbbmk62BfaQhbzUg_ShCr-9XzujuV8ucJE-4=.48120b87-2719-4b2b-a6e8-7542e0bf1119@github.com> On Wed, 5 Jun 2024 08:06:35 GMT, Christian Hagedorn wrote: >> [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and not non-null-checks which both use `Opaque4` nodes. >> >> #### Correct Assertion >> One of these asserts was now hit by a fuzzer-found case in `get_assertion_predicates()` called during the elimination of useless predicates. We walk through all loops and collect all useful Template Assertion Predicates and Parse Predicates above the loops. For that we look at the UCTs which are shared among the predicates. When finding a predicate with such an UCT which also has an `Opaque4` node, we know that it is a Template Assertion Predicate. We additionally assert that we must find the `OpaqueLoop*Nodes` above which always belong to a template: >> >> https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L346-L354 >> >> So, this assert looks correct. >> >> #### Why didn't we find `OpaqueLoop*Nodes` in this case?
>> For the Template Assertion Predicate for the last value, we insert an additional `CastII` to keep the type information of the iv phi: >> >> https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L1323-L1324 >> >> But in the test case, the type of the iv phi is a constant (`521 CastII`): >> >> ![image](https://github.com/openjdk/jdk/assets/17833009/5dc17b9c-abfe-4846-89a1-4e189234b991) >> >> `521 CastII` will simply be replaced with a constant during IGVN and the `OpaqueLoop*Nodes` above are removed. We therefore cannot find them anymore later when trying to eliminate useless predicates and we hit the assert. >> >> #### Why does the `CastII`/iv phi have a constant type? >> Having a constant type for the iv phi indicates that the counted loop is only going to be executed for one iteration. But C2 has not had the chance, yet, to fold the loop exit test to remove the loop. >> >> #### How to fix this bug? >> Having a single iteration loop raises the question, why we even bother to try and hoist checks out of such a loop with Loop Predication in the first place. I therefore suggest to simply bail out of Loop Predication if the trip count is 1. This will also prevent us from creating a Template Assertion Predicate with a `CastII` with a constant type from the iv phi which would be folded. >> >> To do that, we can compute the trip count on entry of Loop Predication. By doing that, we can also re... > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Adding Xbatch run Thanks for the updates! ------------- Marked as reviewed by epeter (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19500#pullrequestreview-2099538401 From shade at openjdk.org Wed Jun 5 15:26:56 2024 From: shade at openjdk.org (Aleksey Shipilev) Date: Wed, 5 Jun 2024 15:26:56 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: References: Message-ID: <-Q0VHuGn1BYc_vki8F-BP0a0p11FU7VllamQ8eRHCH0=.69e0759e-ace2-4f5d-8482-c6894c61fbf7@github.com> On Wed, 5 Jun 2024 15:12:33 GMT, Galder Zamarreño wrote: > Let me make sure I understand this right: you are suggesting removing the `if (zero_array)` branch and instead always emit storestore barrier, as well as the storestore barrier at the end of `append_alloc_array_copy`? Yes. AFAICS, we would have an excess `StoreStore` only on C1 clone path, which should be fast enough already after you introduced the intrinsic. That is to say, that even if that `StoreStore` regresses performance a little bit in comparison to current mainline tip, it should still be faster than the version that did not have C1 clone intrinsic implemented at all. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2150348432 From galder at openjdk.org Wed Jun 5 15:34:57 2024 From: galder at openjdk.org (Galder Zamarreño) Date: Wed, 5 Jun 2024 15:34:57 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: <-Q0VHuGn1BYc_vki8F-BP0a0p11FU7VllamQ8eRHCH0=.69e0759e-ace2-4f5d-8482-c6894c61fbf7@github.com> References: <-Q0VHuGn1BYc_vki8F-BP0a0p11FU7VllamQ8eRHCH0=.69e0759e-ace2-4f5d-8482-c6894c61fbf7@github.com> Message-ID: On Wed, 5 Jun 2024 15:23:42 GMT, Aleksey Shipilev wrote: > > Let me make sure I understand this right: you are suggesting removing the `if (zero_array)` branch and instead always emit storestore barrier, as well as the storestore barrier at the end of `append_alloc_array_copy`? > > Yes.
AFAICS, we would have an excess `StoreStore` only on C1 clone path, which is should be fast enough already after you introduced the intrinsic. That is to say, that even if that `StoreStore` regresses performance a little bit in comparison to current mainline tip, it should still be faster than the version that did not have C1 clone intrinsic implemented at all. I run the array clone microbenchmarks with/without the if branch and I don't see any differences. So, the additional storestore barrier does indeed not introduce a performance regression. I'll update the PR. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2150366960 From chagedorn at openjdk.org Wed Jun 5 15:40:11 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 15:40:11 GMT Subject: RFR: 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes [v3] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 08:06:35 GMT, Christian Hagedorn wrote: >> [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and not non-null-checks which both use `Opaque4` nodes. >> >> #### Correct Assertion >> One of this assert was now hit with a fuzzer found case in `get_assertion_predicates()` called during the elimination of useless predicates. We walk through all loops and collect all useful Template Assertion Predicates and Parse Predicates above the loops. For that we look at the UCTs which are shared among the predicates. When finding a predicate with such an UCT which also has an `Opaque4` node, we know that it is a Template Assertion Predicate. We additionally assert that we must find the `OpaqueLoop*Nodes` above which always belong to a template: >> >> https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L346-L354 >> >> So, this assert looks correct. 
>> >> #### Why didn't we find `OpaqueLoop*Nodes` in this case? >> For the Template Assertion Predicate for the last value, we insert an additional `CastII` to keep the type information of the iv phi: >> >> https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L1323-L1324 >> >> But in the test case, the type of the iv phi is a constant (`521 CastII`): >> >> ![image](https://github.com/openjdk/jdk/assets/17833009/5dc17b9c-abfe-4846-89a1-4e189234b991) >> >> `521 CastII` will simply be replaced with a constant during IGVN and the `OpaqueLoop*Nodes` above are removed. We therefore cannot find them anymore later when trying to eliminate useless predicates and we hit the assert. >> >> #### Why does the `CastII`/iv phi have a constant type? >> Having a constant type for the iv phi indicates that the counted loop is only going to be executed for one iteration. But C2 has not had the chance, yet, to fold the loop exit test to remove the loop. >> >> #### How to fix this bug? >> Having a single iteration loop raises the question, why we even bother to try and hoist checks out of such a loop with Loop Predication in the first place. I therefore suggest to simply bail out of Loop Predication if the trip count is 1. This will also prevent us from creating a Template Assertion Predicate with a `CastII` with a constant type from the iv phi which would be folded. >> >> To do that, we can compute the trip count on entry of Loop Predication. By doing that, we can also re... > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Adding Xbatch run Thanks Emanuel for your review! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19500#issuecomment-2150375846 From chagedorn at openjdk.org Wed Jun 5 15:40:12 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Wed, 5 Jun 2024 15:40:12 GMT Subject: Integrated: 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes In-Reply-To: References: Message-ID: On Fri, 31 May 2024 12:33:04 GMT, Christian Hagedorn wrote: > [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386) added some additional asserts to ensure that we are dealing with Template Assertion Predicates and not non-null-checks which both use `Opaque4` nodes. > > #### Correct Assertion > One of this assert was now hit with a fuzzer found case in `get_assertion_predicates()` called during the elimination of useless predicates. We walk through all loops and collect all useful Template Assertion Predicates and Parse Predicates above the loops. For that we look at the UCTs which are shared among the predicates. When finding a predicate with such an UCT which also has an `Opaque4` node, we know that it is a Template Assertion Predicate. We additionally assert that we must find the `OpaqueLoop*Nodes` above which always belong to a template: > > https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L346-L354 > > So, this assert looks correct. > > #### Why didn't we find `OpaqueLoop*Nodes` in this case? 
> For the Template Assertion Predicate for the last value, we insert an additional `CastII` to keep the type information of the iv phi: > > https://github.com/openjdk/jdk/blob/7ab74c5f268dac82bbd36355acf8e4f3d357134c/src/hotspot/share/opto/loopPredicate.cpp#L1323-L1324 > > But in the test case, the type of the iv phi is a constant (`521 CastII`): > > ![image](https://github.com/openjdk/jdk/assets/17833009/5dc17b9c-abfe-4846-89a1-4e189234b991) > > `521 CastII` will simply be replaced with a constant during IGVN and the `OpaqueLoop*Nodes` above are removed. We therefore cannot find them anymore later when trying to eliminate useless predicates and we hit the assert. > > #### Why does the `CastII`/iv phi have a constant type? > Having a constant type for the iv phi indicates that the counted loop is only going to be executed for one iteration. But C2 has not had the chance, yet, to fold the loop exit test to remove the loop. > > #### How to fix this bug? > Having a single iteration loop raises the question, why we even bother to try and hoist checks out of such a loop with Loop Predication in the first place. I therefore suggest to simply bail out of Loop Predication if the trip count is 1. This will also prevent us from creating a Template Assertion Predicate with a `CastII` with a constant type from the iv phi which would be folded. > > To do that, we can compute the trip count on entry of Loop Predication. By doing that, we can also remove the trip count computation added for hoisting ran... This pull request has now been integrated. 
Changeset: c5c08678 Author: Christian Hagedorn URL: https://git.openjdk.org/jdk/commit/c5c0867881a43c81e88453274ac12e45454685a4 Stats: 90 lines in 2 files changed: 86 ins; 0 del; 4 mod 8333252: C2: assert(assertion_predicate_has_loop_opaque_node(iff)) failed: must find OpaqueLoop* nodes Reviewed-by: kvn, epeter ------------- PR: https://git.openjdk.org/jdk/pull/19500 From galder at openjdk.org Wed Jun 5 15:40:16 2024 From: galder at openjdk.org (Galder =?UTF-8?B?WmFtYXJyZcOxbw==?=) Date: Wed, 5 Jun 2024 15:40:16 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers [v2] In-Reply-To: References: Message-ID: > Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. > > The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. > > There's no barrier added on x86 c1 macro assembler for nothing to do there. > > I've run the following tests: > * tier 1 on darwin/aarch64 > * tier 1 on linux/x86_64 > * `hotspot_compiler` tests on darwin/aarch64 > * `copy.clone.arrays` jcstress tests on darwin/aarch64. > > I tried but was unable to create a standalone test for the jdk source tree that would fail. > > FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. Galder Zamarre?o has updated the pull request incrementally with one additional commit since the last revision: Keep storestore barrier for array allocation * Having c1 array clone use 2 storestore barriers has no performance impact, so it's safer to keep it in place. 
------------- Changes: - all: https://git.openjdk.org/jdk/pull/19538/files - new: https://git.openjdk.org/jdk/pull/19538/files/6400b755..24d72c67 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19538&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19538&range=00-01 Stats: 6 lines in 1 file changed: 0 ins; 5 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19538.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19538/head:pull/19538 PR: https://git.openjdk.org/jdk/pull/19538 From shade at openjdk.org Wed Jun 5 15:45:03 2024 From: shade at openjdk.org (Aleksey Shipilev) Date: Wed, 5 Jun 2024 15:45:03 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 15:40:16 GMT, Galder Zamarreño wrote: >> Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. >> >> The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. >> >> There's no barrier added on x86 c1 macro assembler for nothing to do there. >> >> I've run the following tests: >> * tier 1 on darwin/aarch64 >> * tier 1 on linux/x86_64 >> * `hotspot_compiler` tests on darwin/aarch64 >> * `copy.clone.arrays` jcstress tests on darwin/aarch64. >> >> I tried but was unable to create a standalone test for the jdk source tree that would fail. >> >> FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms.
> > Galder Zamarreño has updated the pull request incrementally with one additional commit since the last revision: > > Keep storestore barrier for array allocation > > * Having c1 array clone use 2 storestore barriers > has no performance impact, so it's safer to keep it in place. Yup, like this: nice and tidy. ------------- Marked as reviewed by shade (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19538#pullrequestreview-2099586461 From galder at openjdk.org Wed Jun 5 15:48:59 2024 From: galder at openjdk.org (Galder =?UTF-8?B?WmFtYXJyZcOxbw==?=) Date: Wed, 5 Jun 2024 15:48:59 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers [v2] In-Reply-To: References: Message-ID: <9L2v8GymMJR9m2lKvtOv5194tLLPxvzgZTL9ATDjOPw=.53c5a8d4-f0bc-49a2-a997-9ababf7ffe1a@github.com> On Wed, 5 Jun 2024 15:42:22 GMT, Aleksey Shipilev wrote: > Yup, like this: nice and tidy. Just like you initially suggested! ? ------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2150395906 From kvn at openjdk.org Wed Jun 5 16:14:59 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Wed, 5 Jun 2024 16:14:59 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v3] In-Reply-To: References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: <4wf8B2Yf59K8-eM2cF1L4dcMu4ILgVt4t2lkoCt28Dw=.a2d36926-735b-4b4d-b1f9-b21e2628b8cb@github.com> On Wed, 5 Jun 2024 02:34:26 GMT, Jatin Bhateja wrote: >> Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. >> >> This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug.
>> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin >> >> PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 >> [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review comments resolutions. Good. Tobias ran testing for v01 and it passed. Please, answer Emanuel's questions/suggestions before integration. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19442#pullrequestreview-2099671808 PR Comment: https://git.openjdk.org/jdk/pull/19442#issuecomment-2150452928 From kvn at openjdk.org Wed Jun 5 16:30:57 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Wed, 5 Jun 2024 16:30:57 GMT Subject: RFR: 8333644: C2: assert(is_Bool()) failed: invalid node class: Phi [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 14:48:12 GMT, Christian Hagedorn wrote: >> In the patch of [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386), I've added a test case that took the code path in `clone_loop_handle_data_uses()` with the new `OpaqueInitializedAssertionPredicateNode`. So, it proved that we should also add a case for `OpaqueInitializedAssertionPredicate` - yet somehow, I forgot to do that. Unfortunately, I could not come up with a failing test if we do not handle this case separately and the mistake went unnoticed. >> >> But the fuzzer has now found such a case where we crash when creating the Mach graph because we have an `If` with a `Phi` input instead of a `Bool`. This happens exactly because of not handling `OpaqueInitializedAssertionPredicate` in `clone_loop_handle_data_uses()`. 
>> >> We have an outside use of a `4843 Bool` (`4193 RangeCheck`) which is also an input into a `OpaqueInitializedAssertionPredicate` node: >> >> ![image](https://github.com/openjdk/jdk/assets/17833009/7f43d987-6e29-449e-8392-58e76e111e80) >> >> We should now clone this `Bool/Cmp` down to avoid having a `Phi` between the `OpaqueInitializedAssertionPredicate` and >> the `Bool` node which currently wrongly happens: >> >> ![image](https://github.com/openjdk/jdk/assets/17833009/a10554a0-d48a-406c-a510-fef9666c45af) >> >> and we crash later. With this simple patch, this is being avoided. >> >> The good thing is that we now have a test that makes sure that this new condition is properly tested. >> >> Thanks, >> Christian > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Update test/hotspot/jtreg/compiler/predicates/assertion/TestOpaqueInitializedAssertionPredicateNode.java > > Co-authored-by: Tobias Hartmann Looks good. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19561#pullrequestreview-2099706997 From kvn at openjdk.org Wed Jun 5 16:34:01 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Wed, 5 Jun 2024 16:34:01 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v4] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 15:18:12 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). 
>> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need to have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. >> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e.
`limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Add randomized tests Update is good. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19522#pullrequestreview-2099713258 From kvn at openjdk.org Wed Jun 5 16:44:56 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Wed, 5 Jun 2024 16:44:56 GMT Subject: RFR: 8333649: Allow different NativeCall encodings In-Reply-To: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> References: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> Message-ID: <_mL0-HtkkEsVyzPntAmwa22r-DgbxuSIKFLptcz9A8Q=.943c4deb-eca5-4f05-aa2a-6cd15fd4ab6d@github.com> On Wed, 5 Jun 2024 12:35:47 GMT, Robbin Ehn wrote: > Hi all, please consider! > > We want to have different selectable NativeCalls. > These are not the same size, shared code should query instead of using the enum directly. > > Sanity build and tested RV/x86, hoping GHA will catch anything else. > > Thanks, Robbin Do you have an RFE/JEP which requires these changes? 
------------- PR Review: https://git.openjdk.org/jdk/pull/19556#pullrequestreview-2099732529 From kvn at openjdk.org Wed Jun 5 16:47:58 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Wed, 5 Jun 2024 16:47:58 GMT Subject: RFR: 8333647: C2 SuperWord: some additional PopulateIndex tests In-Reply-To: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> References: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> Message-ID: On Wed, 5 Jun 2024 13:06:10 GMT, Emanuel Peter wrote: > When I did the deep refactoring in https://github.com/openjdk/jdk/pull/19261, I wanted some more tests for `PopulateIndex`. I push them separately to keep the other RFE smaller. > > I filed a follow-up RFE for some cases that do not vectorize: https://bugs.openjdk.org/browse/JDK-8332878 Add Oracle's copyright line to these files. ------------- Changes requested by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19558#pullrequestreview-2099739346 From rehn at openjdk.org Wed Jun 5 16:54:56 2024 From: rehn at openjdk.org (Robbin Ehn) Date: Wed, 5 Jun 2024 16:54:56 GMT Subject: RFR: 8333649: Allow different NativeCall encodings In-Reply-To: <_mL0-HtkkEsVyzPntAmwa22r-DgbxuSIKFLptcz9A8Q=.943c4deb-eca5-4f05-aa2a-6cd15fd4ab6d@github.com> References: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> <_mL0-HtkkEsVyzPntAmwa22r-DgbxuSIKFLptcz9A8Q=.943c4deb-eca5-4f05-aa2a-6cd15fd4ab6d@github.com> Message-ID: <1iX2uddhqa-LZIm0fxganjx7oeJ_XfRW9QgJFHAw63I=.789a8b59-39f9-4ea0-8e97-2b70807fcaf7@github.com> On Wed, 5 Jun 2024 16:41:56 GMT, Vladimir Kozlov wrote: > Do you have an RFE/JEP which requires these changes? Hey, yes, here: https://github.com/openjdk/jdk/pull/19453 We will have both 4 bytes and 12 bytes NativeCalls on Risc-V, selectable in runtime. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19556#issuecomment-2150526391 From kvn at openjdk.org Wed Jun 5 16:59:56 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Wed, 5 Jun 2024 16:59:56 GMT Subject: RFR: 8333649: Allow different NativeCall encodings In-Reply-To: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> References: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> Message-ID: <4tIBtXSxq6o2Wd5c6UXYBju9QZdlyhmcEIodZUSJLp0=.638ed4b5-f70b-4511-8cfc-1af0f11e7411@github.com> On Wed, 5 Jun 2024 12:35:47 GMT, Robbin Ehn wrote: > Hi all, please consider! > > We want to have different selectable NativeCalls. > These are not the same size, shared code should query instead of using the enum directly. > > Sanity build and tested RV/x86, hoping GHA will catch anything else. > > Thanks, Robbin Good. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19556#pullrequestreview-2099762415 From aph at openjdk.org Wed Jun 5 17:13:04 2024 From: aph at openjdk.org (Andrew Haley) Date: Wed, 5 Jun 2024 17:13:04 GMT Subject: Integrated: 8319822: Use a linear-time algorithm for assert_different_registers() In-Reply-To: References: Message-ID: On Fri, 10 Nov 2023 15:46:59 GMT, Andrew Haley wrote: > At the present time, `assert_different_registers()` uses an O(N**2) algorithm in assert_different_registers(). We can utilize RegSet to do it in O(N) time. This would be a useful optimization for all builds with assertions enabled. > > In addition, it would be useful to be able to static_assert different registers. > > Also, I've taken the opportunity to expand the maximum size of a RegSet to 64 on 64-bit platforms. > > I also fixed a bug: sometimes `noreg` is passed to `assert_different_registers()`, but it may only be passed once or a spurious assertion is triggered. This pull request has now been integrated. 
Changeset: 9b3694c4 Author: Andrew Haley URL: https://git.openjdk.org/jdk/commit/9b3694c4fcc3cf46c0d827427ae8aadb477e8e22 Stats: 69 lines in 4 files changed: 33 ins; 0 del; 36 mod 8319822: Use a linear-time algorithm for assert_different_registers() Reviewed-by: kbarrett, stefank, stuefe ------------- PR: https://git.openjdk.org/jdk/pull/16617 From jbhateja at openjdk.org Wed Jun 5 17:59:13 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Wed, 5 Jun 2024 17:59:13 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v4] In-Reply-To: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: > Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. > > This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. > > Kindly review and share your feedback. > > Best Regards, > Jatin > > PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) > > [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 > [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: Review comments addressed. 
------------- Changes: - all: https://git.openjdk.org/jdk/pull/19442/files - new: https://git.openjdk.org/jdk/pull/19442/files/16996e57..cb2877bf Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19442&range=03 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19442&range=02-03 Stats: 8 lines in 2 files changed: 1 ins; 1 del; 6 mod Patch: https://git.openjdk.org/jdk/pull/19442.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19442/head:pull/19442 PR: https://git.openjdk.org/jdk/pull/19442 From jbhateja at openjdk.org Wed Jun 5 17:59:13 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Wed, 5 Jun 2024 17:59:13 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v3] In-Reply-To: References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> <1o2ba7vrqVN7L2qVFAVeT_nHbae7UOHiQg_7HxUmeDk=.98694c70-1895-49ef-9110-d71e2e4bbfb5@github.com> Message-ID: On Wed, 5 Jun 2024 06:32:49 GMT, Emanuel Peter wrote: > Also: it seems to me that you are duplicating these 4 lines above from its call-site. I wonder if this means that you are slicing the boundary of your new method right, or if maybe the whole if-else block from the call-site should be a new method? The duplication you are pointing in code may not translate into IR since GVN implicitly promotes sharing based on nodes hash value which is a function of node's opcode and inputs. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19442#discussion_r1628188964 From sviswanathan at openjdk.org Wed Jun 5 18:13:02 2024 From: sviswanathan at openjdk.org (Sandhya Viswanathan) Date: Wed, 5 Jun 2024 18:13:02 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) In-Reply-To: References: Message-ID: On Mon, 1 Apr 2024 12:01:27 GMT, Jatin Bhateja wrote: > Summary of changes include with the patch:- > > 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) > 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. > > Kindly review and share your feedback. > > Best Regards, > Jatin Marked as reviewed by sviswanathan (Reviewer). src/hotspot/cpu/x86/vm_version_x86.cpp line 113: > 111: VM_Version_StubGenerator(CodeBuffer *c) : StubCodeGenerator(c) {} > 112: > 113: address clear_apx_test_state() { Why do we need to clear_apx_test_state? r16 onwards are not callee saved. And checking r15 save/restore is not needed so we could remove r15 changes altogether. src/hotspot/cpu/x86/vm_version_x86.cpp line 433: > 431: __ jcc(Assembler::notEqual, vector_save_restore); > 432: > 433: /* FIXME: Uncomment after integration of JDK-8328998 Did you mean to uncomment these now that JDK-8328998 has integrated? src/hotspot/cpu/x86/vm_version_x86.cpp line 434: > 432: > 433: /* FIXME: Uncomment after integration of JDK-8328998 > 434: __ mov64(r15, VM_Version::egpr_test_value()); Why are we modifying r15? It is not an APX egpr. 
src/hotspot/cpu/x86/vm_version_x86.cpp line 435: > 433: /* FIXME: Uncomment after integration of JDK-8328998 > 434: __ mov64(r15, VM_Version::egpr_test_value()); > 435: __ mov64(r16, VM_Version::egpr_test_value()); You would need to temporarily set UseAPX feature before generating this instruction, otherwise assembler will complain. src/hotspot/cpu/x86/vm_version_x86.cpp line 447: > 445: /* FIXME: Uncomment after integration of JDK-8328998 > 446: __ mov64(rax, VM_Version::egpr_test_value()); > 447: __ cmpq(rax, r15); Likewise r15 validation can be removed. src/hotspot/cpu/x86/vm_version_x86.cpp line 456: > 454: // Generate SEGV to signal unsuccessful save/restore. > 455: __ bind(apx_save_restore_error); > 456: __ lea(rax, ExternalAddress(VM_Version::_apx_state_restore_error_handler)); Generating an error message here won't be the right thing (especially since this is default by feature detection). It should only result in setting UseAPX feature to false. src/hotspot/cpu/x86/vm_version_x86.hpp line 476: > 474: uint32_t dcp_cpuid4_edx; // unused currently > 475: > 476: // cpuid function 7 (structured extended features enumeration leaf) Good to add here a comment: // eax = 7, ecx = 0 src/hotspot/os_cpu/bsd_x86/os_bsd_x86.cpp line 420: > 418: > 419: #ifndef PRODUCT > 420: if ((sig == SIGSEGV) && VM_Version::is_cpuinfo_segv_addr_apx(pc)) { Do we want to include SIGBUS also here like above? 
------------- PR Review: https://git.openjdk.org/jdk/pull/18562#pullrequestreview-2097590632 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1626760270 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1628196385 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1626759049 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1628197767 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1626759263 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1628208548 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1626753662 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1626756465 From jbhateja at openjdk.org Wed Jun 5 18:39:03 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Wed, 5 Jun 2024 18:39:03 GMT Subject: RFR: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel [v2] In-Reply-To: References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: <0rsuC87xEDE3ANJo3kbhfPAKI9utbyTsnSJ5wArntmM=.e43d4353-0735-42e8-b304-a38c943086cb@github.com> On Mon, 3 Jun 2024 15:24:16 GMT, Sandhya Viswanathan wrote: >> Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: >> >> Review Comments Incorporated. > > Looks good to me. Thanks @sviswa7 , @vnkozlov , @eme64 , your comments have been addressed. Integrating the patch. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19442#issuecomment-2150709662 From jbhateja at openjdk.org Wed Jun 5 18:39:04 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Wed, 5 Jun 2024 18:39:04 GMT Subject: Integrated: 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel In-Reply-To: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> References: <6IxHpLmCr2e1fKOcbdG38uhJEOsmVUpgVbcGoH4uMnQ=.ac6c99bd-a222-4dbc-a2b2-fdaf1f94a155@github.com> Message-ID: <36kX77mu9mFxT2MzZRNFhb4FXjueGkZLXwRv67yhKT0=.bb8ae455-edb8-4e9b-9a54-f8995fcbe2b7@github.com> On Wed, 29 May 2024 06:10:53 GMT, Jatin Bhateja wrote: > Currently inline expansion of vector to shuffle conversion simply type casts the vector holding indexes to byte vector[1] where as fallback implementation[2] also wraps the indexes to a valid index range [0, VEC_LEN-1) or generates a -ve index for exceptional / OOB indices. > > This patch extends the conversion inline expander to match the fall back implementation. This imposes around 20% performance tax on Vector.toShuffle() intrinsic but fixes this functional bug. > > Kindly review and share your feedback. > > Best Regards, > Jatin > > PS: Patch also fixes an incorrectness issue reported with [JDK-8332118](https://bugs.openjdk.org/browse/JDK-8332118) > > [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2352 > [2] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/AbstractShuffle.java#L58 This pull request has now been integrated. 
Changeset: 4c09d9f8 Author: Jatin Bhateja URL: https://git.openjdk.org/jdk/commit/4c09d9f8280092949a9fe0f26ee516e699f7ba84 Stats: 161 lines in 3 files changed: 150 ins; 9 del; 2 mod 8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel Reviewed-by: sviswanathan, kvn ------------- PR: https://git.openjdk.org/jdk/pull/19442 From sviswanathan at openjdk.org Wed Jun 5 19:40:57 2024 From: sviswanathan at openjdk.org (Sandhya Viswanathan) Date: Wed, 5 Jun 2024 19:40:57 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) In-Reply-To: References: Message-ID: On Mon, 1 Apr 2024 12:01:27 GMT, Jatin Bhateja wrote: > Summary of changes include with the patch:- > > 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) > 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. > > Kindly review and share your feedback. > > Best Regards, > Jatin @jatin-bhateja Please ignore my approval above, it was a mistake, I don't know how to undo that. Please do look into the review comments/suggestions above. ------------- PR Comment: https://git.openjdk.org/jdk/pull/18562#issuecomment-2150819857 From rrich at openjdk.org Wed Jun 5 20:07:17 2024 From: rrich at openjdk.org (Richard Reingruber) Date: Wed, 5 Jun 2024 20:07:17 GMT Subject: RFR: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store [v6] In-Reply-To: References: Message-ID: > This pr adds a few tweaks to [JDK-8318446](https://bugs.openjdk.org/browse/JDK-8318446) which allows enabling it also on big endian platforms (e.g. AIX, S390).
JDK-8318446 introduced a C2 optimization to replace consecutive stores to a primitive array with just one store. > > By example (from `TestMergeStores.java`): > > > static Object[] test2a(byte[] a, int offset, long v) { > if (IS_BIG_ENDIAN) { > a[offset + 0] = (byte)(v >> 56); > a[offset + 1] = (byte)(v >> 48); > a[offset + 2] = (byte)(v >> 40); > a[offset + 3] = (byte)(v >> 32); > a[offset + 4] = (byte)(v >> 24); > a[offset + 5] = (byte)(v >> 16); > a[offset + 6] = (byte)(v >> 8); > a[offset + 7] = (byte)(v >> 0); > } else { > a[offset + 0] = (byte)(v >> 0); > a[offset + 1] = (byte)(v >> 8); > a[offset + 2] = (byte)(v >> 16); > a[offset + 3] = (byte)(v >> 24); > a[offset + 4] = (byte)(v >> 32); > a[offset + 5] = (byte)(v >> 40); > a[offset + 6] = (byte)(v >> 48); > a[offset + 7] = (byte)(v >> 56); > } > return new Object[]{ a }; > } > > > Depending on the endianness 8 bytes are stored into an array. The order of the stores is the same as the order of an 8-byte-store therefore 8 1-byte-stores can be replaced with just one 8-byte-store (if there aren't too many range checks). > > Additionally I've fixed a few comments and a test bug. > > The optimization seems to be a little bit more effective on big endian platforms.
> > Again by example: > > > static Object[] test800a(byte[] a, int offset, long v) { > if (IS_BIG_ENDIAN) { > a[offset + 0] = (byte)(v >> 40); // Removed from candidate list > a[offset + 1] = (byte)(v >> 32); // Removed from candidate list > a[offset + 2] = (byte)(v >> 24); // Merged > a[offset + 3] = (byte)(v >> 16); // Merged > a[offset + 4] = (byte)(v >> 8); // Merged > a[offset + 5] = (byte)(v >> 0); // Merged > } else { > a[offset + 0] = (byte)(v >> 0); // Removed from candidate list > a[offset + 1] = (byte)(v >> 8); // Removed from candidate list > a[offset + 2] = (byte)(v >> 16); // Not merged > a[offset + 3] = (byte)(v >> 24); // Not merged > a[offset + 4] = (byte)(v >> 32); // Not merged > a[offset + 5] = (byte)(v >> 40); // Not merged > } > return new Object[]{ a };... Richard Reingruber has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains 10 additional commits since the last revision: - Merge branch 'master' into 8331311_merge_stores_on_big_endian - Feedback Emanuel - Eliminate IS_BIG_ENDIAN and always execute both variants - test2BE: big endian version of test2 - Improve make_merged_input_value based on Emanuel's feedback - Improve comment - Improve comment - Add bug id - Typo - 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19218/files - new: https://git.openjdk.org/jdk/pull/19218/files/fc870e2b..f7dc0f97 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19218&range=05 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19218&range=04-05 Stats: 86670 lines in 1687 files changed: 63142 ins; 15403 del; 8125 mod Patch: https://git.openjdk.org/jdk/pull/19218.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19218/head:pull/19218 PR: https://git.openjdk.org/jdk/pull/19218 From 
dlong at openjdk.org Wed Jun 5 20:13:57 2024 From: dlong at openjdk.org (Dean Long) Date: Wed, 5 Jun 2024 20:13:57 GMT Subject: RFR: 8333649: Allow different NativeCall encodings In-Reply-To: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> References: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> Message-ID: On Wed, 5 Jun 2024 12:35:47 GMT, Robbin Ehn wrote: > Hi all, please consider! > > We want to have different selectable NativeCalls. > These are not the same size, shared code should query instead of using the enum directly. > > Sanity build and tested RV/x86, hoping GHA will catch anything else. > > Thanks, Robbin Do you want to make the enum private so it can't be accessed directly? ------------- PR Comment: https://git.openjdk.org/jdk/pull/19556#issuecomment-2150878616 From cslucas at openjdk.org Wed Jun 5 21:36:05 2024 From: cslucas at openjdk.org (Cesar Soares Lucas) Date: Wed, 5 Jun 2024 21:36:05 GMT Subject: RFR: 8331736: C2: Live Node limit exceeded limit after JDK-8316991 [v2] In-Reply-To: References: Message-ID: <0GGP8Sc3uVZ2ajCe1NsFHvKmZ-WYm1oVn4SL_dLKz-g=.1aca88fe-75b5-46dc-8199-61c0853ccc82@github.com> > Please, consider this patch to interrupt execution of `split_unique_types` (SUT) when number of live nodes reaches 3/4 of `max_live_nodes`. > > The included test case reproduces the problem. The number of live nodes before running phase 3 of SUT is ~20k, after processing about 750 mergemem nodes the number of live nodes is over 70k. This problem was first encountered when running an old `.jar` file that was created before `invokedynamic` optimizations - that's why I disable string optimizations in the test case. The test case is strongly based on [the method that was originally triggering the problem](https://github.com/Unidata/netcdf-java/blob/c782ef80ab54a09befd6d5065c6baeed54949222/cdm/radial/src/main/java/ucar/nc2/iosp/nids/Nidsheader.java#L2174). 
> > Tested on Linux, Win, Mac x86_64 tier1-3 and GHA. Cesar Soares Lucas has updated the pull request incrementally with one additional commit since the last revision: Address PR feedback: formatting & additional test run. ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19545/files - new: https://git.openjdk.org/jdk/pull/19545/files/1e92268a..73808e42 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19545&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19545&range=00-01 Stats: 5 lines in 2 files changed: 1 ins; 0 del; 4 mod Patch: https://git.openjdk.org/jdk/pull/19545.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19545/head:pull/19545 PR: https://git.openjdk.org/jdk/pull/19545 From cslucas at openjdk.org Wed Jun 5 21:36:05 2024 From: cslucas at openjdk.org (Cesar Soares Lucas) Date: Wed, 5 Jun 2024 21:36:05 GMT Subject: RFR: 8331736: C2: Live Node limit exceeded limit after JDK-8316991 [v2] In-Reply-To: <2YQNi-e1oREuM9Mth4aGwsPhbfWxsDcosqhuB99BhOY=.2b0927ab-8a17-4403-9255-56a8eea20c88@github.com> References: <2YQNi-e1oREuM9Mth4aGwsPhbfWxsDcosqhuB99BhOY=.2b0927ab-8a17-4403-9255-56a8eea20c88@github.com> Message-ID: <9Faj-LT0qXLg63sWiVe20tsGHnUm_CHDdt7v-jv_Z20=.51a099ff-f762-43ca-9318-9da73bf285f8@github.com> On Wed, 5 Jun 2024 09:08:14 GMT, Tobias Hartmann wrote: >> Cesar Soares Lucas has updated the pull request incrementally with one additional commit since the last revision: >> >> Address PR feedback: formatting & additional test run. > > Looks good to me. We have similar logic for split-if (see `must_throttle_split_if`). Thank you @TobiHartmann, @chhagedorn for reviewing once more! I addressed your feedback. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19545#issuecomment-2150997218 From kvn at openjdk.org Wed Jun 5 22:34:41 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Wed, 5 Jun 2024 22:34:41 GMT Subject: RFR: 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed Message-ID: Add missing check to `pd_call_destination()` similar to check in `pd_set_call_destination()` to avoid arithmetic with `(address)(-1)`. Tested tier1-3,stress,xcomp ------------- Commit messages: - 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed Changes: https://git.openjdk.org/jdk/pull/19568/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19568&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333622 Stats: 5 lines in 1 file changed: 4 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19568.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19568/head:pull/19568 PR: https://git.openjdk.org/jdk/pull/19568 From kvn at openjdk.org Wed Jun 5 22:34:41 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Wed, 5 Jun 2024 22:34:41 GMT Subject: RFR: 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 22:28:39 GMT, Vladimir Kozlov wrote: > Add missing check to `pd_call_destination()` similar to check in `pd_set_call_destination()` to avoid arithmetic with `(address)(-1)`. 
> > Tested tier1-3,stress,xcomp Note: current code works because pd_set_call_destination() ignores result of pd_call_destination() when destination address is -1: [relocInfo_x86.cpp#L120](https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/x86/relocInfo_x86.cpp#L120) ------------- PR Comment: https://git.openjdk.org/jdk/pull/19568#issuecomment-2151063495 From kvn at openjdk.org Wed Jun 5 22:37:46 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Wed, 5 Jun 2024 22:37:46 GMT Subject: RFR: 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 22:28:39 GMT, Vladimir Kozlov wrote: > Add missing check to `pd_call_destination()` similar to check in `pd_set_call_destination()` to avoid arithmetic with `(address)(-1)`. > > Tested tier1-3,stress,xcomp @MBaesken, please verify the fix. Thanks! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19568#issuecomment-2151071123 From duke at openjdk.org Thu Jun 6 01:55:54 2024 From: duke at openjdk.org (kuaiwei) Date: Thu, 6 Jun 2024 01:55:54 GMT Subject: RFR: 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp In-Reply-To: <3wjDkxySe8t8A8feJjH9SzuEK2a2BDt9xBFwJOW8iac=.8b2ba0b1-9457-41da-be9c-2b477a2fa655@github.com> References: <3wjDkxySe8t8A8feJjH9SzuEK2a2BDt9xBFwJOW8iac=.8b2ba0b1-9457-41da-be9c-2b477a2fa655@github.com> Message-ID: On Mon, 3 Jun 2024 11:40:59 GMT, Andrew Haley wrote: >> Some classes in nativeInst_aarch64.hpp are unused and can be removed. >> >> I checked with tier1 tests. > > OK, it's some remnants of the Graal-based jaotc. Patch approved. @theRealAph @chhagedorn Could you help to sponsor it? Thanks. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19518#issuecomment-2151264711 From fyang at openjdk.org Thu Jun 6 02:01:52 2024 From: fyang at openjdk.org (Fei Yang) Date: Thu, 6 Jun 2024 02:01:52 GMT Subject: RFR: 8317720: RISC-V: Implement Adler32 intrinsic [v10] In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 19:38:44 GMT, ArsenyBochkarev wrote: >> Hello everyone! Please review this ~non-vectorized~ implementation of `_updateBytesAdler32` intrinsic. Reference implementation for AArch64 can be found [here](https://github.com/openjdk/jdk9/blob/master/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp#L3281). >> >> ### Correctness checks >> >> Test `test/hotspot/jtreg/compiler/intrinsics/zip/TestAdler32.java` is ok. All tier1 also passed. >> >> ### Performance results on T-Head board >> >> Enabled intrinsic: >> >> | Benchmark | (count) | Mode | Cnt | Score | Error | Units | >> | ------------------------------------- | ----------- | ------ | --------- | ------ | --------- | ---------- | >> | Adler32.TestAdler32.testAdler32Update | 64 | thrpt | 25 | 5522.693 | 23.387 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 128 | thrpt | 25 | 3430.761 | 9.210 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 256 | thrpt | 25 | 1962.888 | 5.323 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 512 | thrpt | 25 | 1050.938 | 0.144 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 1024 | thrpt | 25 | 549.227 | 0.375 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 2048 | thrpt | 25 | 280.829 | 0.170 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 5012 | thrpt | 25 | 116.333 | 0.057 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 8192 | thrpt | 25 | 71.392 | 0.060 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 16384 | thrpt | 25 | 35.784 | 0.019 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 32768 | thrpt | 25 | 17.924 | 0.010 | ops/ms | >> | Adler32.TestAdler32.testAdler32Update | 65536 | thrpt 
| 25 | 8.940 | 0.003 | ops/ms | >> >> Disabled intrinsic: >> >> | Benchmark | (count) | Mode | Cnt | Score | Error | Units | >> | ------------------------------------- | ----------- | ------ | --------- | ------ | --------- | ---------- | >> |Adler32.TestAdler32.testAdler32Update|64|thrpt|25|655.633|5.845|ops/ms| >> |Adler32.TestAdler32.testAdler32Update|128|thrpt|25|587.418|10.062|ops/ms| >> |Adler32.TestAdler32.testAdler32Update|256|thrpt|25|546.675|11.598|ops/ms| >> |Adler32.TestAdler32.testAdler32Update|512|thrpt|25|432.328|11.517|ops/ms| >> |Adler32.TestAdler32.testAdler32Update|1024|thrpt|25|311.771|4.238|ops/ms| >> |Adler32.TestAdler32.testAdler32Update|2048|thrpt|25|202.648|2.486|ops/ms| >> |Adler32.TestAdler32.testAdler32Update|5012|thrpt|... > > ArsenyBochkarev has updated the pull request incrementally with two additional commits since the last revision: > > - Fix vrsub_vi for case of vlen > 128 > - Add process_bytes_by32 function Hi, Thanks for the update. src/hotspot/cpu/riscv/stubGenerator_riscv.cpp line 5168: > 5166: void adler32_process_bytes_by16(Register buff, Register s1, Register s2, Register right_16_bits, > 5167: VectorRegister vtable, VectorRegister vzero, VectorRegister *vbytes, VectorRegister *vs1acc, VectorRegister *vs2acc, > 5168: Register temp0, Register temp1, Register temp2, VectorRegister vtemp1, VectorRegister vtemp2, int LMUL) { Let's remove this `LMUL` param as all the callsites now passes value 1. Question: Did you consider unifying adler32_process_bytes_by16/32/64 into one function with one extra param indicating the size? Seems to me that they duplicate most of the code. And I guess there should be no big difference for the 16 variant to do vector-widening reduction sum at the end just like the other two? BTW: I can help test the performance difference as I have just added Banana-PI into my RV testing army. 
------------- PR Review: https://git.openjdk.org/jdk/pull/18382#pullrequestreview-2100636299 PR Review Comment: https://git.openjdk.org/jdk/pull/18382#discussion_r1628660207 From gcao at openjdk.org Thu Jun 6 02:51:50 2024 From: gcao at openjdk.org (Gui Cao) Date: Thu, 6 Jun 2024 02:51:50 GMT Subject: RFR: 8333652: RISC-V: compiler/vectorapi/VectorGatherMaskFoldingTest.java fails when using RVV Message-ID: Hi, We are experiencing test failures in test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java on Banana Pi BPI-F3 board (has RVV1.0), see jbs issue for exception information. related C2 instruct: https://github.com/openjdk/jdk/blob/326dbb1b139dd1ec1b8605339b91697cdf49da9a/src/hotspot/cpu/riscv/riscv_v.ad#L4805-L4811 If we use vluxei32_v, then the sew here needs to be Assembler::e32, Similarly, here if it's Assembler::e64, then we need to use vluxei64_v. ### Testing - [x] Run VectorGatherMaskFoldingTest.java on Banana Pi BPI-F3 board (has RVV1.0) - [x] test/jdk/jdk/incubator/vector on Banana Pi BPI-F3 board (has RVV1.0) - [x] Run VectorGatherMaskFoldingTest.java on SOPHON SG2042 (without RVV1.0) ------------- Commit messages: - 8333652: RISC-V: compiler/vectorapi/VectorGatherMaskFoldingTest.java fails when using RVV Changes: https://git.openjdk.org/jdk/pull/19564/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19564&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333652 Stats: 86 lines in 2 files changed: 70 ins; 2 del; 14 mod Patch: https://git.openjdk.org/jdk/pull/19564.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19564/head:pull/19564 PR: https://git.openjdk.org/jdk/pull/19564 From gcao at openjdk.org Thu Jun 6 03:47:52 2024 From: gcao at openjdk.org (Gui Cao) Date: Thu, 6 Jun 2024 03:47:52 GMT Subject: RFR: 8333248: VectorGatherMaskFoldingTest.java failed when maximum vector bits is 64 [v3] In-Reply-To: References: Message-ID: 
<4cMjqP88OTYsjdIMqNva2ZOjie86WoKEzYh1IMHRrCM=.b3021b47-e456-4767-98d8-94e993b00d9b@github.com> > Hi, VectorGatherMaskFoldingTest.java Test fails when max vector bits is 64, when max vector bits is 64, LongVector.SPECIES_MAX.length() and DoubleVector.SPECIES_MAX.length() is 1. > > We can reproduce this problem in two ways: > 1. We can use riscv without rvv1.0 board to reproduce this problem > 2. Run VectorGatherMaskFoldingTest.java on aarch64 client mode without `-XX:+IncrementalInlineForceCleanup` Option, the `-XX:+IncrementalInlineForceCleanup` is C2 Option, so we need to remove this Option from the VectorGatherMaskFoldingTest.main method. error message: > > Base Test: @Test testDoubleVectorStoreLoadMaskedVector: > compiler.lib.ir_framework.shared.TestRunException: There was an error while invoking @Test method public static void compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(). Target: null. Arguments: > at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:84) > at compiler.lib.ir_framework.test.BaseTest.invokeTest(BaseTest.java:71) > at compiler.lib.ir_framework.test.AbstractTest.run(AbstractTest.java:98) > at compiler.lib.ir_framework.test.TestVM.runTests(TestVM.java:861) > at compiler.lib.ir_framework.test.TestVM.start(TestVM.java:252) > at compiler.lib.ir_framework.test.TestVM.main(TestVM.java:165) > Caused by: java.lang.reflect.InvocationTargetException > at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:118) > at java.base/java.lang.reflect.Method.invoke(Method.java:580) > at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:80) > ... 
5 more > Caused by: java.lang.RuntimeException: assertNotEquals: expected [1.0] to not equal [1.0] > at jdk.test.lib.Asserts.fail(Asserts.java:691) > at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:451) > at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:435) > at compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(VectorGatherMaskFoldingTest.java:1089) > at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103) > ... 7 more > > > For example, the following method will be failed: > > private static final VectorSpecies L_SPECIES = LongVector.SPECIES_MAX; > private static final VectorSpecies D_SPECIES = DoubleVector.SPECIES_MAX; > ... > @Test > @IR(counts = { IRNode.STORE_VECTOR_MASKED, ">= 1", IRNode.LOAD_VECTOR_MASKED, ">= 1" }, applyIfCPUFeatureOr = {"avx512", "true", "sve", "true"}) > public static ... Gui Cao has updated the pull request incrementally with one additional commit since the last revision: Fix for some missed ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19473/files - new: https://git.openjdk.org/jdk/pull/19473/files/1235a453..42907ff3 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19473&range=02 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19473&range=01-02 Stats: 10 lines in 1 file changed: 0 ins; 0 del; 10 mod Patch: https://git.openjdk.org/jdk/pull/19473.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19473/head:pull/19473 PR: https://git.openjdk.org/jdk/pull/19473 From jwaters at openjdk.org Thu Jun 6 04:51:49 2024 From: jwaters at openjdk.org (Julian Waters) Date: Thu, 6 Jun 2024 04:51:49 GMT Subject: RFR: 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp In-Reply-To: References: Message-ID: <4d7ZF-5hU_zw0sVL8jWY6q6akdWzWL9IR2xvLzXwRNI=.57dc9e3e-700c-42ea-b3ff-7975926a771b@github.com> On Mon, 3 Jun 2024 10:43:11 GMT, kuaiwei wrote: > Some classes in nativeInst_aarch64.hpp are unused and can be 
removed. > > I checked with tier1 tests. I'll help sponsor, but I'm a little concerned about the "⚠️ Found leading lowercase letter in issue title for 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp" - Is that an issue? ------------- PR Comment: https://git.openjdk.org/jdk/pull/19518#issuecomment-2151407076 From amitkumar at openjdk.org Thu Jun 6 05:23:47 2024 From: amitkumar at openjdk.org (Amit Kumar) Date: Thu, 6 Jun 2024 05:23:47 GMT Subject: RFR: 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp In-Reply-To: <4d7ZF-5hU_zw0sVL8jWY6q6akdWzWL9IR2xvLzXwRNI=.57dc9e3e-700c-42ea-b3ff-7975926a771b@github.com> References: <4d7ZF-5hU_zw0sVL8jWY6q6akdWzWL9IR2xvLzXwRNI=.57dc9e3e-700c-42ea-b3ff-7975926a771b@github.com> Message-ID: On Thu, 6 Jun 2024 04:48:41 GMT, Julian Waters wrote: >"⚠️ Found leading lowercase letter in issue title for 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp" - Is that an issue? I think it's complaining because instead of `clean` it should have been `Clean`; That should not be a problem. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19518#issuecomment-2151432277 From cslucas at openjdk.org Thu Jun 6 05:23:50 2024 From: cslucas at openjdk.org (Cesar Soares Lucas) Date: Thu, 6 Jun 2024 05:23:50 GMT Subject: Integrated: 8331736: C2: Live Node limit exceeded limit after JDK-8316991 In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 16:32:15 GMT, Cesar Soares Lucas wrote: > Please, consider this patch to interrupt execution of `split_unique_types` (SUT) when number of live nodes reaches 3/4 of `max_live_nodes`. > > The included test case reproduces the problem. The number of live nodes before running phase 3 of SUT is ~20k, after processing about 750 mergemem nodes the number of live nodes is over 70k. This problem was first encountered when running an old `.jar` file that was created before `invokedynamic` optimizations - that's why I disable string optimizations in the test case.
The test case is strongly based on [the method that was originally triggering the problem](https://github.com/Unidata/netcdf-java/blob/c782ef80ab54a09befd6d5065c6baeed54949222/cdm/radial/src/main/java/ucar/nc2/iosp/nids/Nidsheader.java#L2174). > > Tested on Linux, Win, Mac x86_64 tier1-3 and GHA. This pull request has now been integrated. Changeset: b351b5f6 Author: Cesar Soares Lucas Committer: Tobias Hartmann URL: https://git.openjdk.org/jdk/commit/b351b5f60ed836e6e21aa4ce5681e573a6057eb6 Stats: 181 lines in 2 files changed: 181 ins; 0 del; 0 mod 8331736: C2: Live Node limit exceeded limit after JDK-8316991 Reviewed-by: thartmann, chagedorn ------------- PR: https://git.openjdk.org/jdk/pull/19545 From thartmann at openjdk.org Thu Jun 6 05:28:45 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Thu, 6 Jun 2024 05:28:45 GMT Subject: RFR: 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 22:28:39 GMT, Vladimir Kozlov wrote: > Add missing check to `pd_call_destination()` similar to check in `pd_set_call_destination()` to avoid arithmetic with `(address)(-1)`. > > Tested tier1-3,stress,xcomp Looks good to me. ------------- Marked as reviewed by thartmann (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19568#pullrequestreview-2100823865 From gcao at openjdk.org Thu Jun 6 05:31:43 2024 From: gcao at openjdk.org (Gui Cao) Date: Thu, 6 Jun 2024 05:31:43 GMT Subject: RFR: 8333248: VectorGatherMaskFoldingTest.java failed when maximum vector bits is 64 [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 14:49:30 GMT, Damon Fenacci wrote: > On the other hand you might need to add the check here: > > https://github.com/openjdk/jdk/blob/1235a453eef4a838fe07009b0c5e8a962b527bb4/test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java#L966 > > > and here: > https://github.com/openjdk/jdk/blob/1235a453eef4a838fe07009b0c5e8a962b527bb4/test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java#L1098 fixed. > test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java line 785: > >> 783: doubleVector.intoArray(res, 0, doubleIndices, 0, doubleVectorMask); >> 784: doubleVector.intoArray(res2, 0, doubleIndices, 0, doubleVectorMask2); >> 785: Asserts.assertFalse(D_SPECIES.length() != 1 && Arrays.equals(res, res2)); > > Same here. I've fixed all of the above comments. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19473#issuecomment-2151439427 PR Review Comment: https://git.openjdk.org/jdk/pull/19473#discussion_r1628794733 From fyang at openjdk.org Thu Jun 6 05:34:43 2024 From: fyang at openjdk.org (Fei Yang) Date: Thu, 6 Jun 2024 05:34:43 GMT Subject: RFR: 8333652: RISC-V: compiler/vectorapi/VectorGatherMaskFoldingTest.java fails when using RVV In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 15:26:57 GMT, Gui Cao wrote: > Hi, We are experiencing test failures in test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java on Banana Pi BPI-F3 board (has RVV1.0), see jbs issue for exception information. 
> > related C2 instruct: > https://github.com/openjdk/jdk/blob/326dbb1b139dd1ec1b8605339b91697cdf49da9a/src/hotspot/cpu/riscv/riscv_v.ad#L4805-L4811 > > As rvv1.0 manual requirements for vector indexed loads[1]: `Vector unit-stride and constant-stride use the EEW/EMUL encoded in the instruction for the data values, while vector indexed loads and stores use the EEW/EMUL encoded in the instruction for the index values and the SEW/LMUL encoded in vtype for the data values.` > So in this case where a 64-bit vector index is used, we need to use the vluxei64_v (64-bit indexed load) > > ### Testing > - [x] Run VectorGatherMaskFoldingTest.java on Banana Pi BPI-F3 board (has RVV1.0) > - [x] test/jdk/jdk/incubator/vector on Banana Pi BPI-F3 board (has RVV1.0) > - [x] Run VectorGatherMaskFoldingTest.java on SOPHON SG2042 (without RVV1.0) > > [1] https://github.com/riscv/riscv-v-spec/blob/v1.0/v-spec.adoc#sec-vector-loadstore-width-encoding Looks reasonable. ------------- Marked as reviewed by fyang (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19564#pullrequestreview-2100830297 From thartmann at openjdk.org Thu Jun 6 05:41:47 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Thu, 6 Jun 2024 05:41:47 GMT Subject: Integrated: 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 11:57:58 GMT, Tobias Hartmann wrote: > Ubsan detected undefined behavior in `ciTypeFlow::get_start_state` because an invalid value of `4294967295` is assigned to enum `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L150-L152 > > The problem is that if the C++ compiler decides to encode `Cell` with an unsigned int, casting a negative integer value will lead to an underflow and therefore a value > `Cell_max = INT_MAX`. 
Here, `state->tos()` returns a value < 0: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.cpp#L407 > > which is cast to a `Cell`: > https://github.com/openjdk/jdk/blob/ac7119f0d5319a3fb44dc67a938c3e1eb21b9202/src/hotspot/share/ci/ciTypeFlow.hpp#L211 > > I simply re-wrote the code to not require a negative `Cell` value to iterate over the locals and set them to bottom type. > > Thanks, > Tobias This pull request has now been integrated. Changeset: 6f690a5b Author: Tobias Hartmann URL: https://git.openjdk.org/jdk/commit/6f690a5b01c3d438ba0a2a848a3909e43db650d8 Stats: 5 lines in 1 file changed: 0 ins; 2 del; 3 mod 8333177: Invalid value used for enum Cell in ciTypeFlow::get_start_state Reviewed-by: kvn, chagedorn ------------- PR: https://git.openjdk.org/jdk/pull/19520 From vlivanov at openjdk.org Thu Jun 6 06:05:55 2024 From: vlivanov at openjdk.org (Vladimir Ivanov) Date: Thu, 6 Jun 2024 06:05:55 GMT Subject: RFR: 8331658: secondary_super_cache does not scale well: C1 [v2] In-Reply-To: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> References: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> Message-ID: <_Okxq_4Hxr2yG9D5tLaqCtjhSMhHNb7yiUiYwO1mRT8=.89819337-8256-49d4-b933-7f0d168e0e78@github.com> On Wed, 29 May 2024 09:32:41 GMT, Andrew Haley wrote: >> This is the C1 version of [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). >> >> The new logic in this PR is as simple as I can make it. It is a somewhat-simplified version of the C2 change in [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). In order to reduce risk I haven't touched the existing slow subtype stub. >> The register allocation logic in the existing code is pretty gnarly, and I have no desire to break anything at this point in the release cycle, so I have allocated just one register more than the existing code does.
>> >> Performance is pretty good. Before and after: >> >> x64, AMD 2950X, 8 cores: >> >> >> Benchmark Mode Cnt Score Error Units >> SecondarySuperCacheHits.test avgt 5 0.959 ± 0.091 ns/op >> SecondarySuperCacheInterContention.test avgt 5 42.931 ± 6.951 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 42.397 ± 7.708 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 43.466 ± 8.238 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 74.660 ± 0.127 ns/op >> >> SecondarySuperCacheHits.test avgt 5 1.480 ± 0.077 ns/op >> SecondarySuperCacheInterContention.test avgt 5 1.461 ± 0.063 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 1.767 ± 0.078 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 1.155 ± 0.052 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 1.421 ± 0.002 ns/op >> >> AArch64, Mac M3, 8 cores: >> >> >> Benchmark Mode Cnt Score Error Units >> SecondarySuperCacheHits.test avgt 5 0.835 ± 0.021 ns/op >> SecondarySuperCacheInterContention.test avgt 5 74.078 ± 18.095 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 81.863 ± 42.492 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 66.293 ± 11.254 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 335.563 ± 6.171 ns/op >> >> SecondarySuperCacheHits.test avgt 5 1.212 ± 0.004 ns/op >> SecondarySuperCacheInterContention.test avgt 5 0.871 ± 0.002 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 0.626 ± 0.003 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 1.115 ± 0.006 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 0.696 ± 0.001 ns/op >> >> >> >> The first test, `SecondarySuperCacheHits`, shows a small regression. It's...
> > Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: > > JDK-8331658: secondary_super_cache does not scale well: C1 Thinking more about the proposal itself (JDK-8331658) I'm curious how relevant scalability issues with SSC are for Client (C1-only) VM. I'd expect it to be deployed in constrained environments where contention has much smaller effects (if present at all). Maybe it's fine to leave SSC as is in Client VM and focus performance work on Tiered VM? > lookup_secondary_supers_table needs to use fixed registers, and quite a lot of them. This patch is a version of the table lookup that uses as few registers as possible, and none of them are fixed. The main reason why `lookup_secondary_supers_table` uses pre-defined registers is calling conventions between fast path checks and the stub on slow path. If slow path is inlined, the register set can be chosen arbitrarily. Still, I agree that table lookup needs more scratch registers to operate. FTR `MacroAssembler::check_klass_subtype_slow_path` also has some constraints (at least, on x86), but that's because it relies on `SCAS` instruction. Still, `MacroAssembler::check_klass_subtype_slow_path` is used in different contexts with wildly varying set of available registers (I tried to gather some data on that during my earlier experiments [1]). It heavily relies on spilling to shuffle values or allocate scratch registers when needed. And, speaking of C1, the arguments for subtype check slow path are also passed on stack to simplify implementation. So, performing more spills per se doesn't look like a show-stopper (when it happens outside C2-generated code). 
[1] https://github.com/iwanowww/jdk/blob/ssc.cuckoo.2seed/src/hotspot/cpu/x86/macroAssembler_x86.cpp#L4441 ------------- PR Comment: https://git.openjdk.org/jdk/pull/19426#issuecomment-2151474456 From amitkumar at openjdk.org Thu Jun 6 06:09:51 2024 From: amitkumar at openjdk.org (Amit Kumar) Date: Thu, 6 Jun 2024 06:09:51 GMT Subject: RFR: 8333412: [s390x] Add support for branch on count instruction Message-ID: Adds support for BCT, BCTG, BCTR instructions. ------------- Commit messages: - BCT, BCTG, BCTR instructions support Changes: https://git.openjdk.org/jdk/pull/19572/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19572&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333412 Stats: 28 lines in 2 files changed: 23 ins; 0 del; 5 mod Patch: https://git.openjdk.org/jdk/pull/19572.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19572/head:pull/19572 PR: https://git.openjdk.org/jdk/pull/19572 From chagedorn at openjdk.org Thu Jun 6 06:21:44 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Thu, 6 Jun 2024 06:21:44 GMT Subject: RFR: 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed In-Reply-To: References: Message-ID: <9_S_GHHcwIyh-DdvfD1yTni2uImkSSGHRkD5unsPMZg=.5b8c5c7a-ceb5-4881-91b7-0ed088b74844@github.com> On Wed, 5 Jun 2024 22:28:39 GMT, Vladimir Kozlov wrote: > Add missing check to `pd_call_destination()` similar to check in `pd_set_call_destination()` to avoid arithmetic with `(address)(-1)`. > > Tested tier1-3,stress,xcomp Looks good to me, too. ------------- Marked as reviewed by chagedorn (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19568#pullrequestreview-2100887615 From duke at openjdk.org Thu Jun 6 06:25:44 2024 From: duke at openjdk.org (kuaiwei) Date: Thu, 6 Jun 2024 06:25:44 GMT Subject: RFR: 8333410: [AArch64] Clean unused classes in nativeInst_aarch64.hpp In-Reply-To: <4d7ZF-5hU_zw0sVL8jWY6q6akdWzWL9IR2xvLzXwRNI=.57dc9e3e-700c-42ea-b3ff-7975926a771b@github.com> References: <4d7ZF-5hU_zw0sVL8jWY6q6akdWzWL9IR2xvLzXwRNI=.57dc9e3e-700c-42ea-b3ff-7975926a771b@github.com> Message-ID: On Thu, 6 Jun 2024 04:48:41 GMT, Julian Waters wrote: > I'll help sponsor, but I'm a little concerned about the "⚠️ Found leading lowercase letter in issue title for 8333410: [AArch64] clean unused classes in nativeInst_aarch64.hpp" - Is that an issue? I didn't notice the warning before. It can be fixed after changing the title. Thanks. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19518#issuecomment-2151497721 From duke at openjdk.org Thu Jun 6 06:28:53 2024 From: duke at openjdk.org (kuaiwei) Date: Thu, 6 Jun 2024 06:28:53 GMT Subject: Integrated: 8333410: [AArch64] Clean unused classes in nativeInst_aarch64.hpp In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 10:43:11 GMT, kuaiwei wrote: > Some classes in nativeInst_aarch64.hpp are unused and can be removed. > > I checked with tier1 tests. This pull request has now been integrated.
Changeset: 8f078532 Author: Kuai Wei Committer: Julian Waters URL: https://git.openjdk.org/jdk/commit/8f0785325d54fb5b68867788d1fa3b20a238eaad Stats: 265 lines in 2 files changed: 1 ins; 258 del; 6 mod 8333410: [AArch64] Clean unused classes in nativeInst_aarch64.hpp Reviewed-by: aph, chagedorn ------------- PR: https://git.openjdk.org/jdk/pull/19518 From epeter at openjdk.org Thu Jun 6 06:35:02 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 06:35:02 GMT Subject: RFR: 8333647: C2 SuperWord: some additional PopulateIndex tests [v2] In-Reply-To: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> References: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> Message-ID: > When I did the deep refactoring in https://github.com/openjdk/jdk/pull/19261, I wanted some more tests for `PopulateIndex`. I push them separately to keep the other RFE smaller. > > I filed a follow-up RFE for some cases that do not vectorize: https://bugs.openjdk.org/browse/JDK-8332878 Emanuel Peter has updated the pull request incrementally with one additional commit since the last revision: copyright ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19558/files - new: https://git.openjdk.org/jdk/pull/19558/files/a4c9f631..e73f3bb4 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19558&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19558&range=00-01 Stats: 2 lines in 2 files changed: 2 ins; 0 del; 0 mod Patch: https://git.openjdk.org/jdk/pull/19558.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19558/head:pull/19558 PR: https://git.openjdk.org/jdk/pull/19558 From stuefe at openjdk.org Thu Jun 6 06:34:46 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Thu, 6 Jun 2024 06:34:46 GMT Subject: RFR: 8327240: Obsolete Tier2CompileThreshold/Tier2BackEdgeThreshold product flags [v3] In-Reply-To: 
<9GcIqBKgoA6aBHea2WAQYfmYxA8V1hPUmGwm8GW3OWk=.7bd916e4-c8a0-4a54-a29c-d7b4b5ac6579@github.com> References: <86N-93rC4Q2Q1d_YQSARfjQAHNNCEMvCXMq0_fk5A48=.9c621bb8-b724-40fb-afd7-835773a0e942@github.com> <9GcIqBKgoA6aBHea2WAQYfmYxA8V1hPUmGwm8GW3OWk=.7bd916e4-c8a0-4a54-a29c-d7b4b5ac6579@github.com> Message-ID: On Wed, 24 Apr 2024 20:17:45 GMT, Sonia Zaldana Calles wrote: >> Hi all, >> >> This PR removes the unused options ```Tier2CompileThreshold``` and ```Tier2BackEdgeThreshold```. >> >> Testing: >> - [x] Verified warning is issued as support was removed. >> >> Thanks, >> Sonia > > Sonia Zaldana Calles has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains five additional commits since the last revision: > > - formatting > - Merge master > - Adding to obsolete list > - Deleting usage of flag in test > - 8327240: Remove unused Tier2CompileThreshold/Tier2BackEdgeThreshold product flags @SoniaZaldana I would: - first raise the question on hotspot-compiler-dev. - since this is a product flag, it needs a CSR, so you need to create one. There are many examples in JBS, e.g. https://bugs.openjdk.org/browse/JDK-8320162 - then you can go ahead with this change. ------------- PR Comment: https://git.openjdk.org/jdk/pull/18904#issuecomment-2151508706 From epeter at openjdk.org Thu Jun 6 06:42:43 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 06:42:43 GMT Subject: RFR: 8333647: C2 SuperWord: some additional PopulateIndex tests [v2] In-Reply-To: References: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> Message-ID: On Wed, 5 Jun 2024 16:45:40 GMT, Vladimir Kozlov wrote: > Add Oracle's copyright line to these files. Nice catch, fixed! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19558#issuecomment-2151519410 From duke at openjdk.org Thu Jun 6 06:42:46 2024 From: duke at openjdk.org (kuaiwei) Date: Thu, 6 Jun 2024 06:42:46 GMT Subject: RFR: 8333410: [AArch64] Clean unused classes in nativeInst_aarch64.hpp In-Reply-To: References: <4d7ZF-5hU_zw0sVL8jWY6q6akdWzWL9IR2xvLzXwRNI=.57dc9e3e-700c-42ea-b3ff-7975926a771b@github.com> Message-ID: On Thu, 6 Jun 2024 06:22:55 GMT, kuaiwei wrote: > /sponsor Thanks ------------- PR Comment: https://git.openjdk.org/jdk/pull/19518#issuecomment-2151519388 From dfenacci at openjdk.org Thu Jun 6 06:46:44 2024 From: dfenacci at openjdk.org (Damon Fenacci) Date: Thu, 6 Jun 2024 06:46:44 GMT Subject: RFR: 8333248: VectorGatherMaskFoldingTest.java failed when maximum vector bits is 64 [v3] In-Reply-To: <4cMjqP88OTYsjdIMqNva2ZOjie86WoKEzYh1IMHRrCM=.b3021b47-e456-4767-98d8-94e993b00d9b@github.com> References: <4cMjqP88OTYsjdIMqNva2ZOjie86WoKEzYh1IMHRrCM=.b3021b47-e456-4767-98d8-94e993b00d9b@github.com> Message-ID: On Thu, 6 Jun 2024 03:47:52 GMT, Gui Cao wrote: >> Hi, VectorGatherMaskFoldingTest.java Test fails when max vector bits is 64, when max vector bits is 64, LongVector.SPECIES_MAX.length() and DoubleVector.SPECIES_MAX.length() is 1. >> >> We can reproduce this problem in two ways: >> 1. We can use riscv without rvv1.0 board to reproduce this problem >> 2. Run VectorGatherMaskFoldingTest.java on aarch64 client mode without `-XX:+IncrementalInlineForceCleanup` Option, the `-XX:+IncrementalInlineForceCleanup` is C2 Option, so we need to remove this Option from the VectorGatherMaskFoldingTest.main method. error message: >> >> Base Test: @Test testDoubleVectorStoreLoadMaskedVector: >> compiler.lib.ir_framework.shared.TestRunException: There was an error while invoking @Test method public static void compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(). Target: null. 
Arguments: >> at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:84) >> at compiler.lib.ir_framework.test.BaseTest.invokeTest(BaseTest.java:71) >> at compiler.lib.ir_framework.test.AbstractTest.run(AbstractTest.java:98) >> at compiler.lib.ir_framework.test.TestVM.runTests(TestVM.java:861) >> at compiler.lib.ir_framework.test.TestVM.start(TestVM.java:252) >> at compiler.lib.ir_framework.test.TestVM.main(TestVM.java:165) >> Caused by: java.lang.reflect.InvocationTargetException >> at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:118) >> at java.base/java.lang.reflect.Method.invoke(Method.java:580) >> at compiler.lib.ir_framework.test.BaseTest.invokeTestMethod(BaseTest.java:80) >> ... 5 more >> Caused by: java.lang.RuntimeException: assertNotEquals: expected [1.0] to not equal [1.0] >> at jdk.test.lib.Asserts.fail(Asserts.java:691) >> at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:451) >> at jdk.test.lib.Asserts.assertNotEquals(Asserts.java:435) >> at compiler.vectorapi.VectorGatherMaskFoldingTest.testDoubleVectorStoreLoadMaskedVector(VectorGatherMaskFoldingTest.java:1089) >> at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103) >> ... 7 more >> >> >> For example, the following method will be failed: >> >> private static final VectorSpecies L_SPECIES = LongVector.SPECIES_MAX; >> private static final VectorSpecies D_SPECIES = DoubleVector.SPECIES_MAX; >> ... >> @Test >> @IR(counts = { IRNode.STORE_VECTOR_MASKED, ">= 1", IRNode.LOAD_VECTOR_MASKED, ">= 1" }, apply... > > Gui Cao has updated the pull request incrementally with one additional commit since the last revision: > > Fix for some missed It looks good now, thanks! Have you tried running the test again with a 64-bit vector platform? I'd also update the description of this PR. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19473#issuecomment-2151524105 From gcao at openjdk.org Thu Jun 6 06:57:43 2024 From: gcao at openjdk.org (Gui Cao) Date: Thu, 6 Jun 2024 06:57:43 GMT Subject: RFR: 8333248: VectorGatherMaskFoldingTest.java failed when maximum vector bits is 64 [v3] In-Reply-To: References: <4cMjqP88OTYsjdIMqNva2ZOjie86WoKEzYh1IMHRrCM=.b3021b47-e456-4767-98d8-94e993b00d9b@github.com> Message-ID: On Thu, 6 Jun 2024 06:43:41 GMT, Damon Fenacci wrote: > It looks good now, thanks! Have you tried running the test again with a 64-bit vector platform? I'd also update the description of this PR. Yes. This case still passes on 64-bit vector platforms, including aarch64 client/server build and linux-riscv64 (with/without vector extension). ------------- PR Comment: https://git.openjdk.org/jdk/pull/19473#issuecomment-2151539981 From chagedorn at openjdk.org Thu Jun 6 07:00:49 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Thu, 6 Jun 2024 07:00:49 GMT Subject: Integrated: 8333644: C2: assert(is_Bool()) failed: invalid node class: Phi In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 13:33:27 GMT, Christian Hagedorn wrote: > In the patch of [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386), I've added a test case that took the code path in `clone_loop_handle_data_uses()` with the new `OpaqueInitializedAssertionPredicateNode`. So, it proved that we should also add a case for `OpaqueInitializedAssertionPredicate` - yet somehow, I forgot to do that. Unfortunately, I could not come up with a failing test if we do not handle this case separately and the mistake went unnoticed. > > But the fuzzer has now found such a case where we crash when creating the Mach graph because we have an `If` with a `Phi` input instead of a `Bool`. This happens exactly because of not handling `OpaqueInitializedAssertionPredicate` in `clone_loop_handle_data_uses()`. 
> > We have an outside use of a `4843 Bool` (`4193 RangeCheck`) which is also an input into a `OpaqueInitializedAssertionPredicate` node: > > ![image](https://github.com/openjdk/jdk/assets/17833009/7f43d987-6e29-449e-8392-58e76e111e80) > > We should now clone this `Bool/Cmp` down to avoid having a `Phi` between the `OpaqueInitializedAssertionPredicate` and > the `Bool` node which currently wrongly happens: > > ![image](https://github.com/openjdk/jdk/assets/17833009/a10554a0-d48a-406c-a510-fef9666c45af) > > and we crash later. With this simple patch, this is being avoided. > > The good thing is that we now have a test that makes sure that this new condition is properly tested. > > Thanks, > Christian This pull request has now been integrated. Changeset: 7ef28312 Author: Christian Hagedorn URL: https://git.openjdk.org/jdk/commit/7ef283129388413b362942fb45af48d1f7393b67 Stats: 52 lines in 2 files changed: 44 ins; 1 del; 7 mod 8333644: C2: assert(is_Bool()) failed: invalid node class: Phi Reviewed-by: thartmann, kvn ------------- PR: https://git.openjdk.org/jdk/pull/19561 From chagedorn at openjdk.org Thu Jun 6 07:00:48 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Thu, 6 Jun 2024 07:00:48 GMT Subject: RFR: 8333644: C2: assert(is_Bool()) failed: invalid node class: Phi [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 14:48:12 GMT, Christian Hagedorn wrote: >> In the patch of [JDK-8330386](https://bugs.openjdk.org/browse/JDK-8330386), I've added a test case that took the code path in `clone_loop_handle_data_uses()` with the new `OpaqueInitializedAssertionPredicateNode`. So, it proved that we should also add a case for `OpaqueInitializedAssertionPredicate` - yet somehow, I forgot to do that. Unfortunately, I could not come up with a failing test if we do not handle this case separately and the mistake went unnoticed. 
>> >> But the fuzzer has now found such a case where we crash when creating the Mach graph because we have an `If` with a `Phi` input instead of a `Bool`. This happens exactly because of not handling `OpaqueInitializedAssertionPredicate` in `clone_loop_handle_data_uses()`. >> >> We have an outside use of a `4843 Bool` (`4193 RangeCheck`) which is also an input into a `OpaqueInitializedAssertionPredicate` node: >> >> ![image](https://github.com/openjdk/jdk/assets/17833009/7f43d987-6e29-449e-8392-58e76e111e80) >> >> We should now clone this `Bool/Cmp` down to avoid having a `Phi` between the `OpaqueInitializedAssertionPredicate` and >> the `Bool` node which currently wrongly happens: >> >> ![image](https://github.com/openjdk/jdk/assets/17833009/a10554a0-d48a-406c-a510-fef9666c45af) >> >> and we crash later. With this simple patch, this is being avoided. >> >> The good thing is that we now have a test that makes sure that this new condition is properly tested. >> >> Thanks, >> Christian > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Update test/hotspot/jtreg/compiler/predicates/assertion/TestOpaqueInitializedAssertionPredicateNode.java > > Co-authored-by: Tobias Hartmann Thanks Vladimir for your review! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19561#issuecomment-2151543429 From chagedorn at openjdk.org Thu Jun 6 07:02:49 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Thu, 6 Jun 2024 07:02:49 GMT Subject: RFR: 8333647: C2 SuperWord: some additional PopulateIndex tests [v2] In-Reply-To: References: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> Message-ID: On Thu, 6 Jun 2024 06:35:02 GMT, Emanuel Peter wrote: >> When I did the deep refactoring in https://github.com/openjdk/jdk/pull/19261, I wanted some more tests for `PopulateIndex`. I push them separately to keep the other RFE smaller. 
>> >> I filed a follow-up RFE for some cases that do not vectorize: https://bugs.openjdk.org/browse/JDK-8332878 > > Emanuel Peter has updated the pull request incrementally with one additional commit since the last revision: > > copyright Looks good! ------------- Marked as reviewed by chagedorn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19558#pullrequestreview-2100969315 From epeter at openjdk.org Thu Jun 6 07:04:55 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 07:04:55 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v4] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 15:18:12 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). >> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. 
>> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e. `limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Add randomized tests Nice work with the proofs, good work @chhagedorn ! src/hotspot/share/opto/loopopts.cpp line 3055: > 3053: // exit-region | > 3054: // | | > 3055: // dummy-if | Still, I don't see this `dummy-if` mentioned in any of the comments. Can you add a comment line about what this is, and where it comes from? ------------- Marked as reviewed by epeter (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19522#pullrequestreview-2100967423 PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1628879026 From amitkumar at openjdk.org Thu Jun 6 07:04:58 2024 From: amitkumar at openjdk.org (Amit Kumar) Date: Thu, 6 Jun 2024 07:04:58 GMT Subject: RFR: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store [v6] In-Reply-To: References: Message-ID: <2g9metV58pvf8V6QKDS22U_JScvVDILoUjpjmSkq4fQ=.a8275968-66a0-4bf6-a3d6-c38bce96091a@github.com> On Wed, 5 Jun 2024 20:07:17 GMT, Richard Reingruber wrote: >> This pr adds a few tweaks to [JDK-8318446](https://bugs.openjdk.org/browse/JDK-8318446) which allows enabling it also on big endian platforms (e.g. AIX, S390). JDK-8318446 introduced a C2 optimization to replace consecutive stores to a primitive array with just one store. >> >> By example (from `TestMergeStores.java`): >> >> >> static Object[] test2a(byte[] a, int offset, long v) { >> if (IS_BIG_ENDIAN) { >> a[offset + 0] = (byte)(v >> 56); >> a[offset + 1] = (byte)(v >> 48); >> a[offset + 2] = (byte)(v >> 40); >> a[offset + 3] = (byte)(v >> 32); >> a[offset + 4] = (byte)(v >> 24); >> a[offset + 5] = (byte)(v >> 16); >> a[offset + 6] = (byte)(v >> 8); >> a[offset + 7] = (byte)(v >> 0); >> } else { >> a[offset + 0] = (byte)(v >> 0); >> a[offset + 1] = (byte)(v >> 8); >> a[offset + 2] = (byte)(v >> 16); >> a[offset + 3] = (byte)(v >> 24); >> a[offset + 4] = (byte)(v >> 32); >> a[offset + 5] = (byte)(v >> 40); >> a[offset + 6] = (byte)(v >> 48); >> a[offset + 7] = (byte)(v >> 56); >> } >> return new Object[]{ a }; >> } >> >> >> Depending on the endianess 8 bytes are stored into an array. The order of the stores is the same as the order of an 8-byte-store therefore 8 1-byte-stores can be replaced with just one 8-byte-store (if there aren't too many range checks). >> >> Additionally I've fixed a few comments and a test bug. 
>> >> The optimization seems to be a little bit more effective on big endian platforms. >> >> Again by example: >> >> >> static Object[] test800a(byte[] a, int offset, long v) { >> if (IS_BIG_ENDIAN) { >> a[offset + 0] = (byte)(v >> 40); // Removed from candidate list >> a[offset + 1] = (byte)(v >> 32); // Removed from candidate list >> a[offset + 2] = (byte)(v >> 24); // Merged >> a[offset + 3] = (byte)(v >> 16); // Merged >> a[offset + 4] = (byte)(v >> 8); // Merged >> a[offset + 5] = (byte)(v >> 0); // Merged >> } else { >> a[offset + 0] = (byte)(v >> 0); // Removed from candidate list >> a[offset + 1] = (byte)(v >> 8); // Removed from candidate list >> a[offset + 2] = (byte)(v >> 16); // Not merged >> a[offset + 3] = (byte)(v >> 24); // Not merged >> a[offset + 4] = (byte)(v >> 32); // Not merge... > > Richard Reingruber has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains 10 additional commits since the last revision: > > - Merge branch 'master' into 8331311_merge_stores_on_big_endian > - Feedback Emanuel > - Eliminate IS_BIG_ENDIAN and always execute both variants > - test2BE: big endian version of test2 > - Improve make_merged_input_value based on Emanuel's feedback > - Improve comment > - Improve comment > - Add bug id > - Typo > - 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store I did another round of testing on s390x. looks good. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19218#issuecomment-2151550321 From duke at openjdk.org Thu Jun 6 07:32:38 2024 From: duke at openjdk.org (MaxXing) Date: Thu, 6 Jun 2024 07:32:38 GMT Subject: RFR: 8333334: C2: Make result of `Node::dominates` more precise to enhance scalar replacement [v2] In-Reply-To: References: Message-ID: > This patch changes the algorithm of `Node::dominates` to make the result more precise, and allows the iterators of `ConcurrentHashMap` to be scalar replaced. > > The previous algorithm will return a conservative result when encountering a dead control flow, and only try the first two input paths of a multi-input Region node, which may prevent the scalar replacement in some cases. > > For example, with G1 GC enabled, C2 generates GC barriers for `ConcurrentHashMap` iteration operations at some early phases, and then eliminates them in a later IGVN, but `LoadNode` is also idealized in the same IGVN. This causes `LoadNode::Ideal` to see some dead barrier control flows, and refuse to split some instance field loads through Phi due to the conservative result of `Node::dominates`, and thus the scalar replacement can not be applied to iterators in the later macro elimination phase. > > This patch allows `Node::dominates` to try other paths of the last multi-input Region node when the first path is dead, and makes `ConcurrentHashMap` iteration ~30% faster: > > > Benchmark (nkeys) Mode Cnt Score Error Units > Maps.testConcurrentHashMapIterators 10000 avgt 15 414099.085 ± 33230.945 ns/op # baseline > Maps.testConcurrentHashMapIterators 10000 avgt 15 315490.281 ± 3037.056 ns/op # patch > > > Testing: tier1-4. MaxXing has updated the pull request incrementally with one additional commit since the last revision: Revert last commit, and push the `LoadNode` back to the worklist to wait for the dead code to be removed. 
------------- Changes: - all: https://git.openjdk.org/jdk/pull/19496/files - new: https://git.openjdk.org/jdk/pull/19496/files/e3330ece..b5db38dc Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19496&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19496&range=00-01 Stats: 107 lines in 4 files changed: 39 ins; 34 del; 34 mod Patch: https://git.openjdk.org/jdk/pull/19496.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19496/head:pull/19496 PR: https://git.openjdk.org/jdk/pull/19496 From duke at openjdk.org Thu Jun 6 07:32:39 2024 From: duke at openjdk.org (MaxXing) Date: Thu, 6 Jun 2024 07:32:39 GMT Subject: RFR: 8333334: C2: Make result of `Node::dominates` more precise to enhance scalar replacement [v2] In-Reply-To: References: Message-ID: <4ch5Wd2SW4LvRD8lDW1UWuAnG0I50kwaXQB_1xW3eFQ=.1351c57b-cef7-4e2c-bf91-368dedf0a8b4@github.com> On Wed, 5 Jun 2024 05:40:12 GMT, Tobias Hartmann wrote: >> MaxXing has updated the pull request incrementally with one additional commit since the last revision: >> >> Revert last commit, and push the `LoadNode` back to the worklist to wait for the dead code to be removed. > > Impressive results! I haven't looked at the change yet but here are a few questions / requests: > - Could you add a screenshot of the IR of the case you are describing? > - Wouldn't it help to add the LoadNode back to the IGVN worklist and wait for the dead path to be removed? > - Could you add an [IR framework](https://github.com/openjdk/jdk/blob/master/test/hotspot/jtreg/compiler/lib/ir_framework/README.md) test that verifies that the optimization works as expected? > > Thanks, > Tobias @TobiHartmann Hi Tobias, thanks for your reply! > Could you add a screenshot of the IR of the case you are describing? Sure. 
Here's a simple example of iterating over the keys of `ConcurrentHashMap`: public long sumMapKeys() { long sum = 0; Enumeration it = map.keys(); while (it.hasMoreElements()) { sum += (Long) it.nextElement(); } return sum; } And here's what `-XX:+PrintEscapeAnalysis -XX:+PrintEliminateAllocations` says: JavaObject(6) NoEscape(NoEscape) [ 183F 189F 205F 196F 191F 179F 186F 202F 208F 233F 637F 1069F [ 104 109 ]] 92 Allocate === 76 6 69 8 1 (90 89 24 1 1 1 22 1 1 43 77 87 ) [[ 93 94 95 102 103 104 ]] rawptr:NotNull ( int:>=0, java/lang/Object:NotNull *, bool, top, bool ) ConcurrentHashMap::keys @ bci:16 (line 2152) MyBenchmark::sumMapKeys @ bci:6 (line 105) !jvms: ConcurrentHashMap::keys @ bci:16 (line 2152) MyBenchmark::sumMapKeys @ bci:6 (line 105) LocalVar(60) [ 92P [ 109 183b 189b 205b ]] 104 Proj === 92 [[ 105 109 183 189 205 ]] #5 !jvms: ConcurrentHashMap::keys @ bci:16 (line 2152) MyBenchmark::sumMapKeys @ bci:6 (line 105) LocalVar(103) [ 104 92P [ 196b 191b 179b 186b 202b 208b 233b 637b 1069b ]] 109 CheckCastPP === 106 104 [[ 1801 1771 1754 1584 1503 1503 1490 1490 1466 1466 1451 1451 208 1393 1381 1381 179 179 186 186 208 196 191 191 196 202 202 228 297 233 233 1363 1363 1393 1321 1321 1311 637 637 648 648 998 988 1311 1301 477 477 487 487 497 497 569 1301 672 685 767 672 978 920 539 539 1069 1069 557 557 1128 569 631 1061 685 631 ]] #java/util/concurrent/ConcurrentHashMap$KeyIterator (java/util/Iterator,java/util/Enumeration):NotNull:exact * Oop:java/util/concurrent/ConcurrentHashMap$KeyIterator (java/util/Iterator,java/util/Enumeration):NotNull:exact * !jvms: ConcurrentHashMap::keys @ bci:16 (line 2152) MyBenchmark::sumMapKeys @ bci:6 (line 105) NotScalar (Field load) 109 CheckCastPP === 106 104 [[ 1801 1771 1754 1584 1503 1503 1490 1490 1128 569 1451 1451 208 1393 1381 1381 672 685 1069 1069 208 196 191 191 196 978 685 631 297 767 672 569 1301 1393 1321 1321 1311 557 557 631 1061 998 988 1311 1301 477 477 487 487 497 497 ]] 
#java/util/concurrent/ConcurrentHashMap$KeyIterator (java/util/Iterator,java/util/Enumeration):NotNull:exact *,iid=92 Oop:java/util/concurrent/ConcurrentHashMap$KeyIterator (java/util/Iterator,java/util/Enumeration):NotNull:exact *,iid=92 !jvms: ConcurrentHashMap::keys @ bci:16 (line 2152) MyBenchmark::sumMapKeys @ bci:6 (line 105) >>>> 2186 LoadI === _ 1973 191 [[ 2185 ]] @java/util/concurrent/ConcurrentHashMap$Traverser+12 *, name=index, idx=9; #int !orig=[2178],[2171],[1373] !jvms: ConcurrentHashMap$Traverser::advance @ bci:51 (line 3369) ConcurrentHashMap$KeyIterator::next @ bci:28 (line 3468) ConcurrentHashMap$KeyIterator::nextElement @ bci:1 (line 3472) MyBenchmark::sumMapKeys @ bci:21 (line 107) It shows that scalar replacement is aborted due to field load 2186: ir-diff As we can see its memory is a Phi, and it should be split by `LoadNode::split_through_phi` if its address `109 CheckCastPP` dominates the control flow of the Phi node `1330 Region`. The control node of `CheckCastPP` is `106 Proj`: cast And it does dominate 1330, although not that obvious: cfg But `Node::dominates` think it doesn't because of the dead control flow. > Wouldn't it help to add the LoadNode back to the IGVN worklist and wait for the dead path to be removed? I tried to revert the change of main algorithm of `Node::dominates`, and just add some code to add the LoadNode back to the worklist if we met dead path. It works, still makes the iteration ~30% faster: Benchmark (nkeys) Mode Cnt Score Error Units Maps.testConcurrentHashMapIterators 10000 avgt 15 312720.415 ? 3255.500 ns/op Thanks for pointing this out. I updated this PR, and the latest commit is passing test tier1-4. > Could you add an [IR framework](https://github.com/openjdk/jdk/blob/master/test/hotspot/jtreg/compiler/lib/ir_framework/README.md) test that verifies that the optimization works as expected? I'm still trying to reproduce this case without using `ConcurrentHashMap`, but haven't found a way yet. 
Can I add an IR test that depends on `ConcurrentHashMap`? (It might need to be updated if the implementation of `ConcurrentHashMap` changes, I guess.) ------------- PR Comment: https://git.openjdk.org/jdk/pull/19496#issuecomment-2151588662 From mbaesken at openjdk.org Thu Jun 6 07:36:46 2024 From: mbaesken at openjdk.org (Matthias Baesken) Date: Thu, 6 Jun 2024 07:36:46 GMT Subject: RFR: 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 22:34:51 GMT, Vladimir Kozlov wrote: > @MBaesken, please verify the fix. Thanks! Fix looks good, ubsan-enabled linux x86_64 build with that fix included shows no issues in relocInfo_x86.cpp. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19568#issuecomment-2151598076 From chagedorn at openjdk.org Thu Jun 6 07:41:16 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Thu, 6 Jun 2024 07:41:16 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v5] In-Reply-To: References: Message-ID: > A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. > > #### Idea of Partial Peeling > > Partial Peeling rotates a loop in the hope of being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). > > #### Partial Peeling with Unsigned Test > > However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. 
The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 > > #### Requirements for Using an Unsigned Test > > The Signed and Unsigned Loop Exit Test do not need to have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: > - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. > - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. > > #### The Requirements Are Currently Broken > > This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 > > #### Why Are the Requirements Broken? > > The reason is that > > i >=u limit > > can only be converted into the two signed comparisons > > i < 0 || i >= limit > > if `limit` is non-negative (i.e. `limit >= 0`): > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 > > This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. > > #### Fixing the Broken Requirements > To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of ... 
Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: Added note about dummy-if ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19522/files - new: https://git.openjdk.org/jdk/pull/19522/files/e0a18f06..fe4b8131 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19522&range=04 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19522&range=03-04 Stats: 3 lines in 1 file changed: 3 ins; 0 del; 0 mod Patch: https://git.openjdk.org/jdk/pull/19522.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19522/head:pull/19522 PR: https://git.openjdk.org/jdk/pull/19522 From epeter at openjdk.org Thu Jun 6 07:41:16 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 07:41:16 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v5] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 07:36:56 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). >> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. 
The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. >> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e. `limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... 
> > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Added note about dummy-if Marked as reviewed by epeter (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19522#pullrequestreview-2101101030 From chagedorn at openjdk.org Thu Jun 6 07:41:16 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Thu, 6 Jun 2024 07:41:16 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v4] In-Reply-To: References: Message-ID: <2Z0Psvoy1a0nE3siLEKgiVpzYfgS1aFMFHcx9GNNQXI=.286c00fc-28dc-450e-972a-2c28f3b5ad39@github.com> On Wed, 5 Jun 2024 15:18:12 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). >> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. 
>> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e. `limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Add randomized tests Thanks Vladimir and Emanuel for your reviews! I've submitted some sanity performance testing since we could now be bailing out more. I'm not sure though how much else we could do if there are some performance regressions. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19522#issuecomment-2151599588 From chagedorn at openjdk.org Thu Jun 6 07:41:16 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Thu, 6 Jun 2024 07:41:16 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v4] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 06:59:13 GMT, Emanuel Peter wrote: >> Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: >> >> Add randomized tests > > src/hotspot/share/opto/loopopts.cpp line 3055: > >> 3053: // exit-region | >> 3054: // | | >> 3055: // dummy-if | > > Still, I don't see this `dummy-if` mentioned in any of the comments. Can you add a comment line about what this is, and where it comes from? Fair point, added a note. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19522#discussion_r1628920348 From rrich at openjdk.org Thu Jun 6 08:40:44 2024 From: rrich at openjdk.org (Richard Reingruber) Date: Thu, 6 Jun 2024 08:40:44 GMT Subject: RFR: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store [v6] In-Reply-To: <2g9metV58pvf8V6QKDS22U_JScvVDILoUjpjmSkq4fQ=.a8275968-66a0-4bf6-a3d6-c38bce96091a@github.com> References: <2g9metV58pvf8V6QKDS22U_JScvVDILoUjpjmSkq4fQ=.a8275968-66a0-4bf6-a3d6-c38bce96091a@github.com> Message-ID: On Thu, 6 Jun 2024 07:02:25 GMT, Amit Kumar wrote: > I did another round of testing on s390x. looks good. Thanks Amit. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19218#issuecomment-2151723102 From aph at openjdk.org Thu Jun 6 09:11:57 2024 From: aph at openjdk.org (Andrew Haley) Date: Thu, 6 Jun 2024 09:11:57 GMT Subject: RFR: 8331658: secondary_super_cache does not scale well: C1 [v2] In-Reply-To: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> References: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> Message-ID: On Wed, 29 May 2024 09:32:41 GMT, Andrew Haley wrote: >> This is the C1 version of [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). >> >> The new logic in this PR is as simple as I can make it. It is a somewhat-simplified version of the C2 change in [JDK-8180450](https://bugs.openjdk.org/browse/JDK-8180450). In order to reduce risk I haven't touched the existing slow subtype stub. >> The register allocation logic in the existing code is pretty gnarly, and I have no desire to break anything at this point in the release cycle, so I have allocated just one register more than the existing code does. >> >> Performance is pretty good. Before and after: >> >> x64, AMD 2950X, 8 cores: >> >> >> Benchmark Mode Cnt Score Error Units >> SecondarySuperCacheHits.test avgt 5 0.959 ± 0.091 ns/op >> SecondarySuperCacheInterContention.test avgt 5 42.931 ± 6.951 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 42.397 ± 7.708 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 43.466 ± 8.238 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 74.660 ± 0.127 ns/op >> >> SecondarySuperCacheHits.test avgt 5 1.480 ± 0.077 ns/op >> SecondarySuperCacheInterContention.test avgt 5 1.461 ± 0.063 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 1.767 ± 0.078 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 1.155 ± 0.052 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 1.421 ±
0.002 ns/op >> >> AArch64, Mac M3, 8 cores: >> >> >> Benchmark Mode Cnt Score Error Units >> SecondarySuperCacheHits.test avgt 5 0.835 ± 0.021 ns/op >> SecondarySuperCacheInterContention.test avgt 5 74.078 ± 18.095 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 81.863 ± 42.492 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 66.293 ± 11.254 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 335.563 ± 6.171 ns/op >> >> SecondarySuperCacheHits.test avgt 5 1.212 ± 0.004 ns/op >> SecondarySuperCacheInterContention.test avgt 5 0.871 ± 0.002 ns/op >> SecondarySuperCacheInterContention.test:t1 avgt 5 0.626 ± 0.003 ns/op >> SecondarySuperCacheInterContention.test:t2 avgt 5 1.115 ± 0.006 ns/op >> SecondarySuperCacheIntraContention.test avgt 5 0.696 ± 0.001 ns/op >> >> >> >> The first test, `SecondarySuperCacheHits`, shows a small regression. It's... > > Andrew Haley has updated the pull request incrementally with one additional commit since the last revision: > > JDK-8331658: secondary_super_cache does not scale well: C1 I agree with most of what you write. Some responses to points made: > Thinking more about the proposal itself (JDK-8331658) I'm curious how relevant scalability issues with SSC are for Client (C1-only) VM. I'd expect it to be deployed in constrained environments where contention has much smaller effects (if present at all). Maybe it's fine to leave SSC as is in Client VM and focus performance work on Tiered VM? Maybe, but it's hard to speculate well about a bunch of code we'll never see. The end goal, I hope, is to remove the `secondary_super_cache` field altogether because it does nothing useful. Apart from anything else, we'd remove some cruft in the VM. So, C1, interpreter, and runtime is next. > > lookup_secondary_supers_table needs to use fixed registers, and quite a lot of them. This patch is a version of the table lookup that uses as few registers as possible, and none of them are fixed.
> > The main reason why `lookup_secondary_supers_table` uses pre-defined registers is calling conventions between fast path checks and the stub on slow path. If slow path is inlined, the register set can be chosen arbitrarily. Certainly. > Still, I agree that table lookup needs more scratch registers to operate. > > FTR `MacroAssembler::check_klass_subtype_slow_path` also has some constraints (at least, on x86), but that's because it relies on `SCAS` instruction. Still, `MacroAssembler::check_klass_subtype_slow_path` is used in different contexts with wildly varying set of available registers (I tried to gather some data on that during my earlier experiments [1]). It heavily relies on spilling to shuffle values or allocate scratch registers when needed. Indeed. And when the actual work done by the table lookup takes on the order of a nanosecond, it seems to me to be rather disproportionate to surround it by shuffles. But yes, that works. I'm doing some experiments which seem to show that even with some spilling, hash table lookup is still a win in such cases. That's a bit surprising to me, but it's what the measurements seem to say. > And, speaking of C1, the arguments for subtype check slow path are also passed on stack to simplify implementation. So, performing more spills per se doesn't look like a show-stopper (when it happens outside C2-generated code). Sure, I get that, but I'm trying to make the common cases perform as well as I can. I get it, there's a balance to be struck between the complexity of the VM and the efficiency of the runtime code, but I think the best way to reduce complexity is to get rid of all uses of the `secondary_super_cache` field. I'll come back when I have more. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19426#issuecomment-2151789488 From thartmann at openjdk.org Thu Jun 6 10:45:44 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Thu, 6 Jun 2024 10:45:44 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 15:40:16 GMT, Galder Zamarreño wrote: >> Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. >> >> The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. >> >> There's no barrier added on x86 c1 macro assembler for nothing to do there. >> >> I've run the following tests: >> * tier 1 on darwin/aarch64 >> * tier 1 on linux/x86_64 >> * `hotspot_compiler` tests on darwin/aarch64 >> * `copy.clone.arrays` jcstress tests on darwin/aarch64. >> >> I tried but was unable to create a standalone test for the jdk source tree that would fail. >> >> FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. > > Galder Zamarreño has updated the pull request incrementally with one additional commit since the last revision: > > Keep storestore barrier for array allocation > > * Having c1 array clone use 2 storestore barriers > has no performance impact, so it's safer to keep it in place. This looks good to me too and passed our testing. ------------- Marked as reviewed by thartmann (Reviewer).
PR Review: https://git.openjdk.org/jdk/pull/19538#pullrequestreview-2101598129 From epeter at openjdk.org Thu Jun 6 11:31:04 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 11:31:04 GMT Subject: RFR: 8333713: C2 SuperWord: cleanup in vectornode.cpp/hpp Message-ID: Removed dead code and renamed `superword` -> `auto_vectorization`, just like I had done in [JDK-8324750](https://bugs.openjdk.org/browse/JDK-8324750). ------------- Commit messages: - 8333713 Changes: https://git.openjdk.org/jdk/pull/19575/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19575&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333713 Stats: 16 lines in 2 files changed: 0 ins; 10 del; 6 mod Patch: https://git.openjdk.org/jdk/pull/19575.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19575/head:pull/19575 PR: https://git.openjdk.org/jdk/pull/19575 From chagedorn at openjdk.org Thu Jun 6 11:31:05 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Thu, 6 Jun 2024 11:31:05 GMT Subject: RFR: 8333713: C2 SuperWord: cleanup in vectornode.cpp/hpp In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 11:22:24 GMT, Emanuel Peter wrote: > Removed dead code and renamed `superword` -> `auto_vectorization`, just like I had done in [JDK-8324750](https://bugs.openjdk.org/browse/JDK-8324750). Looks good and trivial. ------------- Marked as reviewed by chagedorn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19575#pullrequestreview-2101685871 From aph at openjdk.org Thu Jun 6 12:31:44 2024 From: aph at openjdk.org (Andrew Haley) Date: Thu, 6 Jun 2024 12:31:44 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 15:40:16 GMT, Galder Zamarre?o wrote: >> Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. 
>> >> The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. >> >> There's no barrier added on x86 c1 macro assembler for nothing to do there. >> >> I've run the following tests: >> * tier 1 on darwin/aarch64 >> * tier 1 on linux/x86_64 >> * `hotspot_compiler` tests on darwin/aarch64 >> * `copy.clone.arrays` jcstress tests on darwin/aarch64. >> >> I tried but was unable to create a standalone test for the jdk source tree that would fail. >> >> FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. > > Galder Zamarre?o has updated the pull request incrementally with one additional commit since the last revision: > > Keep storestore barrier for array allocation > > * Having c1 array clone use 2 storestore barriers > has no performance impact, so it's safer to keep it in place. Marked as reviewed by aph (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19538#pullrequestreview-2101810432 From fgao at openjdk.org Thu Jun 6 12:42:50 2024 From: fgao at openjdk.org (Fei Gao) Date: Thu, 6 Jun 2024 12:42:50 GMT Subject: RFR: 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug" [v3] In-Reply-To: <2c48X45JWEQrzKj_rwUsCeBV_c8Ee7r80urMm8haK1Q=.fcea07d4-7635-4f83-b8d6-882b210ba84d@github.com> References: <16J-lJ2AceGTVcRWBcP15yKcwO-1IA1XsngyOuNjf7k=.0776f081-ae2c-4279-87cf-d909806c2bc4@github.com> <2c48X45JWEQrzKj_rwUsCeBV_c8Ee7r80urMm8haK1Q=.fcea07d4-7635-4f83-b8d6-882b210ba84d@github.com> Message-ID: On Wed, 5 Jun 2024 13:38:37 GMT, Andrew Haley wrote: > Oh, I see. I really don't think that matters. Current AArch64 processors often add an extra cycle anyway for offsetted addresses, so all we'd add is a slight code size expansion for misaligned loads. 
Sorry, did you mean loading from base plus offset, like `ldr x0, [x6, #8]` or `ldr x0, [x6, x7]`, takes one more cycle than loading from base register only, like `ldr x0, [x6]`? Does the address addition take one cycle? Thanks. ------------- PR Comment: https://git.openjdk.org/jdk/pull/16991#issuecomment-2152307490 From roland at openjdk.org Thu Jun 6 13:31:10 2024 From: roland at openjdk.org (Roland Westrelin) Date: Thu, 6 Jun 2024 13:31:10 GMT Subject: RFR: 8333721: C2: vectorization causes incorrect execution with unsafe and negative scale Message-ID: This was initially a regression from 8324517 (C2: crash in compiled code because of dependency on removed range check CastIIs): 8332677 (jck test api/java_math/BigInteger/Bitwise.html fails (c2) on aarch64). A simplified test case for that one is: private static void test1(byte[] array, int start) { for (int i = start; i < array.length; i++) { array[array.length - i - 1] = 0x42; } } That method is vectorized but with 8324517, the resulting compiled code is incorrect. I don't think that failure can be reproduced without 8324517 other than by using unsafe which is what the included test case does (I'll include the test method above in the redo of 8324517). The bug is that `VPointer::scaled_iv_plus_offset()` computes an incorrect offset when `n` is a `Sub` node and the scaled iv is on input 2 of the `Sub` node and input 2 also includes an offset component. In that case, the offset from input 2 is added to the `VPointer` instead of being subtracted.
------------- Commit messages: - whitespace and copyright - test & fix Changes: https://git.openjdk.org/jdk/pull/19577/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19577&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333721 Stats: 151 lines in 2 files changed: 149 ins; 0 del; 2 mod Patch: https://git.openjdk.org/jdk/pull/19577.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19577/head:pull/19577 PR: https://git.openjdk.org/jdk/pull/19577 From epeter at openjdk.org Thu Jun 6 13:37:49 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 13:37:49 GMT Subject: RFR: 8320725: C2: Add "requires_strict_order" flag for floating-point add-reduction [v8] In-Reply-To: <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> References: <8-_t7nWbR9gZ2_QkfFNuf5M0Q4PMkKJKgwS3ZbHcCxI=.32dc4f11-dec5-468d-afc8-3b4dae285dcb@github.com> <2y-Ag6MxVDJfYl6kM0FYjQA-kzSCekUgAMWAZmkECyQ=.2a2a0a8e-fc67-42a4-bd67-b4ae3b60bcea@github.com> <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> Message-ID: <0ulT10xZHL0A_apbv2v7FWIekWh5e7ZvprwN2VpItYg=.7c6943f7-5ba5-49ac-801d-de141cbed033@github.com> On Mon, 3 Jun 2024 08:35:07 GMT, Bhavana Kilambi wrote: >> @Bhavana-Kilambi >> I know we have the tests in `test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java`, and some other reduction tests. But these do not do the specific think I would like to see. >> >> I would like this: >> - Add `no_strict_order` vs `requires_strict_order` or similar to `dump_spec`. >> - IR match not just that there is the correct `ReductionNode`, but also that it has the `no_strict_order` or `requires_strict_order` in its dump. You can do that by using a custom regex string, rather than `IRNode.STORE_VECTOR` or similar. >> - Then, create different tests, some where we expect ordered, some unordered vectors. Use Vector API and SuperWord examples. >> >> Does that make sense? 
> > Hi @eme64 , I have modified the tests as suggested. Please review :) @Bhavana-Kilambi The code looks good now from my side. You have a title mismatch though. I'll run testing again, please ping me again if I don't come back within 24h. ------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2152525743 From simonis at openjdk.org Thu Jun 6 13:41:58 2024 From: simonis at openjdk.org (Volker Simonis) Date: Thu, 6 Jun 2024 13:41:58 GMT Subject: RFR: 8333722: Fix CompilerDirectevies for non-compiler JVM variants Message-ID: `DirectivesStack::getMatchingDirective()` relies on the fact that the default directives set is always enabled. And that's indeed the case for normal builds with C1 and C2 compilers (see `DirectivesStack::init()` in `compilerDirectives.cpp`): // Create a new dirstack and push a default directive void DirectivesStack::init() { CompilerDirectives* _default_directives = new CompilerDirectives(); char str[] = "*.*"; const char* error_msg = nullptr; _default_directives->add_match(str, error_msg); #if defined(COMPILER1) || INCLUDE_JVMCI _default_directives->_c1_store->EnableOption = true; #endif #ifdef COMPILER2 if (CompilerConfig::is_c2_enabled()) { _default_directives->_c2_store->EnableOption = true; } #endif assert(error_msg == nullptr, "Must succeed."); push(_default_directives); } However, if we're building a JVM configuration without compilers (e.g. 
`--with-jvm-variants=core`), this is not the case and `DirectivesStack::getMatchingDirective()` will return the base directive set without incrementing the reference count of its directive: CompilerDirectives* dir = _top; assert(dir != nullptr, "Must be initialized"); while (dir != nullptr) { if (dir->is_default_directive() || dir->match(method)) { match = dir->get_for(comp); assert(match != nullptr, "Consistency"); if (match->EnableOption) { // The directiveSet for this compile is also enabled -> success dir->inc_refcount(); break; } } dir = dir->next(); } } guarantee(match != nullptr, "There should always be a default directive that matches"); // Check for legacy compile commands update, without DirectivesStack_lock return match->compilecommand_compatibility_init(method); If this directive set will be released, it will delete the corresponding base directive and subsequent usages of the base directive will lead to a segmentation fault. After [JDK-8329421: Native methods can not be selectively printed](https://bugs.openjdk.org/browse/JDK-8329421) which replaced the call to DirectiveSet* directive = DirectivesStack::getDefaultDirective(CompileBroker::compiler(CompLevel_simple)); by DirectiveSet* directive = DirectivesStack::getMatchingDirective(method, CompileBroker::compiler(CompLevel_simple)); in `sharedRuntime.cpp` this issue is now triggered at JVM startup for non-compiler configurations when native wrappers are generated (see https://github.com/openjdk/jdk/pull/18567#issuecomment-2149408243). The fix is trivial. Just increment the reference count of a compiler directive in `DirectivesStack::getMatchingDirective()` if it is the base directive, even if it is not enabled. 
------------- Commit messages: - 8333722: Fix CompilerDirectevies for non-compiler JVM variants Changes: https://git.openjdk.org/jdk/pull/19578/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19578&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333722 Stats: 1 line in 1 file changed: 0 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19578.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19578/head:pull/19578 PR: https://git.openjdk.org/jdk/pull/19578 From epeter at openjdk.org Thu Jun 6 13:44:50 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 13:44:50 GMT Subject: RFR: 8320725: C2: Add "requires_strict_order" flag for floating-point add-reduction [v8] In-Reply-To: <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> References: <8-_t7nWbR9gZ2_QkfFNuf5M0Q4PMkKJKgwS3ZbHcCxI=.32dc4f11-dec5-468d-afc8-3b4dae285dcb@github.com> <2y-Ag6MxVDJfYl6kM0FYjQA-kzSCekUgAMWAZmkECyQ=.2a2a0a8e-fc67-42a4-bd67-b4ae3b60bcea@github.com> <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> Message-ID: On Mon, 3 Jun 2024 08:35:07 GMT, Bhavana Kilambi wrote: >> @Bhavana-Kilambi >> I know we have the tests in `test/hotspot/jtreg/compiler/c2/irTests/TestDisableAutoVectOpcodes.java`, and some other reduction tests. But these do not do the specific think I would like to see. >> >> I would like this: >> - Add `no_strict_order` vs `requires_strict_order` or similar to `dump_spec`. >> - IR match not just that there is the correct `ReductionNode`, but also that it has the `no_strict_order` or `requires_strict_order` in its dump. You can do that by using a custom regex string, rather than `IRNode.STORE_VECTOR` or similar. >> - Then, create different tests, some where we expect ordered, some unordered vectors. Use Vector API and SuperWord examples. >> >> Does that make sense? > > Hi @eme64 , I have modified the tests as suggested. 
Please review :) @Bhavana-Kilambi can you merge with master? I think your last merge is a while ago. I'll run testing after. ------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2152554670 From kvn at openjdk.org Thu Jun 6 13:44:50 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 13:44:50 GMT Subject: RFR: 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 22:28:39 GMT, Vladimir Kozlov wrote: > Add missing check to `pd_call_destination()` similar to check in `pd_set_call_destination()` to avoid arithmetic with `(address)(-1)`. > > Tested tier1-3,stress,xcomp Thank you all for reviews and testing. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19568#issuecomment-2152548663 From kvn at openjdk.org Thu Jun 6 13:44:50 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 13:44:50 GMT Subject: Integrated: 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 22:28:39 GMT, Vladimir Kozlov wrote: > Add missing check to `pd_call_destination()` similar to check in `pd_set_call_destination()` to avoid arithmetic with `(address)(-1)`. > > Tested tier1-3,stress,xcomp This pull request has now been integrated. 
Changeset: 33fd6ae9 Author: Vladimir Kozlov URL: https://git.openjdk.org/jdk/commit/33fd6ae98638d2a4b33d18cc4acee4f0daaa9b35 Stats: 5 lines in 1 file changed: 4 ins; 0 del; 1 mod 8333622: ubsan: relocInfo_x86.cpp:101:56: runtime error: pointer index expression with base (-1) overflowed Reviewed-by: thartmann, chagedorn ------------- PR: https://git.openjdk.org/jdk/pull/19568 From kvn at openjdk.org Thu Jun 6 13:51:45 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 13:51:45 GMT Subject: RFR: 8333647: C2 SuperWord: some additional PopulateIndex tests [v2] In-Reply-To: References: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> Message-ID: On Thu, 6 Jun 2024 06:35:02 GMT, Emanuel Peter wrote: >> When I did the deep refactoring in https://github.com/openjdk/jdk/pull/19261, I wanted some more tests for `PopulateIndex`. I push them separately to keep the other RFE smaller. >> >> I filed a follow-up RFE for some cases that do not vectorize: https://bugs.openjdk.org/browse/JDK-8332878 > > Emanuel Peter has updated the pull request incrementally with one additional commit since the last revision: > > copyright Good. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19558#pullrequestreview-2102035925 From kvn at openjdk.org Thu Jun 6 13:55:44 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 13:55:44 GMT Subject: RFR: 8333713: C2 SuperWord: cleanup in vectornode.cpp/hpp In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 11:22:24 GMT, Emanuel Peter wrote: > Removed dead code and renamed `superword` -> `auto_vectorization`, just like I had done in [JDK-8324750](https://bugs.openjdk.org/browse/JDK-8324750). Good. ------------- Marked as reviewed by kvn (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19575#pullrequestreview-2102055077 From mdoerr at openjdk.org Thu Jun 6 14:13:45 2024 From: mdoerr at openjdk.org (Martin Doerr) Date: Thu, 6 Jun 2024 14:13:45 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers [v2] In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 15:40:16 GMT, Galder Zamarre?o wrote: >> Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. >> >> The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. >> >> There's no barrier added on x86 c1 macro assembler for nothing to do there. >> >> I've run the following tests: >> * tier 1 on darwin/aarch64 >> * tier 1 on linux/x86_64 >> * `hotspot_compiler` tests on darwin/aarch64 >> * `copy.clone.arrays` jcstress tests on darwin/aarch64. >> >> I tried but was unable to create a standalone test for the jdk source tree that would fail. >> >> FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. > > Galder Zamarre?o has updated the pull request incrementally with one additional commit since the last revision: > > Keep storestore barrier for array allocation > > * Having c1 array clone use 2 storestore barriers > has no performance impact, so it's safer to keep it in place. Seems like the `/integrate` got lost. I can see it in my email. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2152645668 From dcubed at openjdk.org Thu Jun 6 14:27:44 2024 From: dcubed at openjdk.org (Daniel D. 
Daugherty) Date: Thu, 6 Jun 2024 14:27:44 GMT Subject: RFR: 8333722: Fix CompilerDirectevies for non-compiler JVM variants In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 13:36:25 GMT, Volker Simonis wrote: > `DirectivesStack::getMatchingDirective()` relies on the fact that the default directives set is always enabled. And that's indeed the case for normal builds with C1 and C2 compilers (see `DirectivesStack::init()` in `compilerDirectives.cpp`): > > > // Create a new dirstack and push a default directive > void DirectivesStack::init() { > CompilerDirectives* _default_directives = new CompilerDirectives(); > char str[] = "*.*"; > const char* error_msg = nullptr; > _default_directives->add_match(str, error_msg); > #if defined(COMPILER1) || INCLUDE_JVMCI > _default_directives->_c1_store->EnableOption = true; > #endif > #ifdef COMPILER2 > if (CompilerConfig::is_c2_enabled()) { > _default_directives->_c2_store->EnableOption = true; > } > #endif > assert(error_msg == nullptr, "Must succeed."); > push(_default_directives); > } > > > However, if we're building a JVM configuration without compilers (e.g. 
`--with-jvm-variants=core`), this is not the case and `DirectivesStack::getMatchingDirective()` will return the base directive set without incrementing the reference count of its directive: > > > CompilerDirectives* dir = _top; > assert(dir != nullptr, "Must be initialized"); > > while (dir != nullptr) { > if (dir->is_default_directive() || dir->match(method)) { > match = dir->get_for(comp); > assert(match != nullptr, "Consistency"); > if (match->EnableOption) { > // The directiveSet for this compile is also enabled -> success > dir->inc_refcount(); > break; > } > } > dir = dir->next(); > } > } > guarantee(match != nullptr, "There should always be a default directive that matches"); > > // Check for legacy compile commands update, without DirectivesStack_lock > return match->compilecommand_compatibility_init(method); > > > If this directive set will be released, it will delete the corresponding base directive and subsequent usages of the base directive will lead to a segmentation fault. > > After [JDK-8329421: Native methods can not be selectively printed](https://bugs.openjdk.org/browse/JDK-8329421) which replaced the call to > > DirectiveSet* directive = DirectivesStack::getDefaultDirective(CompileBroker::compiler(CompLevel_simple)); > > by > > DirectiveSet* directive = DirectivesStack::getMatchingDirective(method, CompileBroker::compiler(CompLevel_simple)); > > in `sharedRuntime.cpp` this issue is now triggered at JVM startup for non-compiler configurations when native wrappers are generated (see https://github.com/openjdk/jdk/pull/18567#... I fixed the typo in the bug's synopsis. 
The easiest way to update this PR to match is: `/issue JDK-8333722` ------------- PR Comment: https://git.openjdk.org/jdk/pull/19578#issuecomment-2152676477 From galder at openjdk.org Thu Jun 6 14:31:45 2024 From: galder at openjdk.org (Galder =?UTF-8?B?WmFtYXJyZcOxbw==?=) Date: Thu, 6 Jun 2024 14:31:45 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers [v2] In-Reply-To: References: Message-ID: <79Mt49JLcBGnIEvgu8jYQw6x0ODZcMExBfkA7XNLbB4=.e8ff6a95-d1c9-4dd4-972a-f208fae5052b@github.com> On Thu, 6 Jun 2024 14:11:35 GMT, Martin Doerr wrote: >> Galder Zamarre?o has updated the pull request incrementally with one additional commit since the last revision: >> >> Keep storestore barrier for array allocation >> >> * Having c1 array clone use 2 storestore barriers >> has no performance impact, so it's safer to keep it in place. > > Seems like the `/integrate` got lost. I can see it in my email. @TheRealMDoerr I removed it because I had just noticed the test failure in CI, looking into it. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2152685855 From fgao at openjdk.org Thu Jun 6 14:35:29 2024 From: fgao at openjdk.org (Fei Gao) Date: Thu, 6 Jun 2024 14:35:29 GMT Subject: RFR: 8321308: AArch64: Fix matching predication for cbz/cbnz [v2] In-Reply-To: References: Message-ID: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> > For array length check like: > > if (a.length > 0) { > [Block 1] > } else { > [Block 2] > } > > > Since `a.length` is unsigned, it's semantically equivalent to: > > if (a.length != 0) { > [Block 1] > } else { > [Block 2] > } > > > On aarch64 port, we can do the conversion like above, during c2 compiler instruction matching, for certain unsigned integral comparisons. 
> > For example, > > cmpw w11, #0 # unsigned > bls label # unsigned > [Block 1] > > label: > [Block 2] > > > can be converted to: > > cbz w11, label > [Block 1] > > label: > [Block 2] > > > Currently, we have some matching rules to do the conversion [[1]](https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L16179). But the predicate here [[2]](https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L6140) matches wrong `BoolTest` masks, so these rules fail to convert. I guess it's a typo introduced in [JDK-8160006](https://bugs.openjdk.org/browse/JDK-8160006). The patch fixes it. Fei Gao has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains three additional commits since the last revision: - Redefine the interface for cmpOpUEqNeLeGt - Merge branch 'master' into fg8321308 - 8321308: AArch64: Fix matching predication for cbz/cbnz For array length check like: ``` if (a.length > 0) { [Block 1] } else { [Block 2] } ``` Since `a.length` is unsigned, it's semantically equivalent to: ``` if (a.length != 0) { [Block 1] } else { [Block 2] } ``` On aarch64 port, we can do the conversion like above, during c2 compiler instruction matching, for certain unsigned integral comparisons. For example, ``` cmpw w11, #0 # unsigned bls label # unsigned [Block 1] label: [Block 2] ``` can be converted to: ``` cbz w11, label [Block 1] label: [Block 2] ``` Currently, we have some matching rules to do the conversion[1]. But the predicate here[2] matches wrong `BoolTest` masks, so these rules fail to convert. I guess it's a typo introduced in JDK-8160006. The patch fixes it. 
[1] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L16179 [2] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L6140 ------------- Changes: - all: https://git.openjdk.org/jdk/pull/16989/files - new: https://git.openjdk.org/jdk/pull/16989/files/066134a2..c49553b9 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=16989&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=16989&range=00-01 Stats: 871263 lines in 11702 files changed: 263300 ins; 226217 del; 381746 mod Patch: https://git.openjdk.org/jdk/pull/16989.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/16989/head:pull/16989 PR: https://git.openjdk.org/jdk/pull/16989 From fgao at openjdk.org Thu Jun 6 14:39:51 2024 From: fgao at openjdk.org (Fei Gao) Date: Thu, 6 Jun 2024 14:39:51 GMT Subject: RFR: 8321308: AArch64: Fix matching predication for cbz/cbnz [v2] In-Reply-To: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> References: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> Message-ID: On Thu, 6 Jun 2024 14:35:29 GMT, Fei Gao wrote: >> For array length check like: >> >> if (a.length > 0) { >> [Block 1] >> } else { >> [Block 2] >> } >> >> >> Since `a.length` is unsigned, it's semantically equivalent to: >> >> if (a.length != 0) { >> [Block 1] >> } else { >> [Block 2] >> } >> >> >> On aarch64 port, we can do the conversion like above, during c2 compiler instruction matching, for certain unsigned integral comparisons. >> >> For example, >> >> cmpw w11, #0 # unsigned >> bls label # unsigned >> [Block 1] >> >> label: >> [Block 2] >> >> >> can be converted to: >> >> cbz w11, label >> [Block 1] >> >> label: >> [Block 2] >> >> >> Currently, we have some matching rules to do the conversion [[1]](https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L16179). 
But the predicate here [[2]](https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L6140) matches wrong `BoolTest` masks, so these rules fail to convert. I guess it's a typo introduced in [JDK-8160006](https://bugs.openjdk.org/browse/JDK-8160006). The patch fixes it. > > Fei Gao has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains three additional commits since the last revision: > > - Redefine the interface for cmpOpUEqNeLeGt > - Merge branch 'master' into fg8321308 > - 8321308: AArch64: Fix matching predication for cbz/cbnz > > For array length check like: > ``` > if (a.length > 0) { > [Block 1] > } else { > [Block 2] > } > ``` > > Since `a.length` is unsigned, it's semantically equivalent to: > ``` > if (a.length != 0) { > [Block 1] > } else { > [Block 2] > } > ``` > > On aarch64 port, we can do the conversion like above, during c2 > compiler instruction matching, for certain unsigned integral > comparisons. > > For example, > ``` > cmpw w11, #0 # unsigned > bls label # unsigned > [Block 1] > > label: > [Block 2] > ``` > > can be converted to: > ``` > cbz w11, label > [Block 1] > > label: > [Block 2] > ``` > > Currently, we have some matching rules to do the conversion[1]. > But the predicate here[2] matches wrong `BoolTest` masks, > so these rules fail to convert. I guess it's a typo introduced > in JDK-8160006. The patch fixes it. > > [1] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L16179 > [2] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L6140 Thanks for all your comments. > @fg1417 Regarding the rework, see my response to @dean-long which explains how the interface for `cmpOpUEqNeLeGt` should be redefined (also how the rules can be retained as currently defined). 
In the new commit, I redefined the interface for `cmpOpUEqNeLeGt` and also kept the rules besides adding assertion lines. Thanks @adinn . ------------- PR Comment: https://git.openjdk.org/jdk/pull/16989#issuecomment-2152703543 From mdoerr at openjdk.org Thu Jun 6 14:45:45 2024 From: mdoerr at openjdk.org (Martin Doerr) Date: Thu, 6 Jun 2024 14:45:45 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers [v2] In-Reply-To: References: Message-ID: <42Nk3qrMqj-R_Q2IqDEuY_kkCM0YPCj6M0qyLES3r_w=.fbbe0112-aa66-45b8-affe-fe6dbd87f548@github.com> On Thu, 6 Jun 2024 14:11:35 GMT, Martin Doerr wrote: >> Galder Zamarreño has updated the pull request incrementally with one additional commit since the last revision: >> >> Keep storestore barrier for array allocation >> >> * Having c1 array clone use 2 storestore barriers >> has no performance impact, so it's safer to keep it in place. > > Seems like the `/integrate` got lost. I can see it in my email. > @TheRealMDoerr I removed it because I had just noticed the test failure in CI, looking into it. The same test has failed in one of my PRs. I don't think it is related. Can be ignored. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2152713258 From epeter at openjdk.org Thu Jun 6 14:47:53 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 14:47:53 GMT Subject: RFR: 8332537: C2: High memory usage reported for compiler/loopopts/superword/TestAlignVectorFuzzer.java Message-ID: This bug was a regression of: [JDK-8324517](https://bugs.openjdk.org/browse/JDK-8324517); C2: crash in compiled code because of dependency on removed range check CastIIs That change was backed out with: [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs I `git revert` ed the BACKOUT change: [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs And only then this reproduced (not on master, only with Roland's CastII regression code): `~/Documents/jtreg/bin/jtreg -va -s -jdk:/oracle-work/jdk-fork2/build/linux-x64-debug/jdk -javaoptions:"-Djdk.test.lib.random.seed=3249981201344669190" -J-Djavatest.maxOutputSize=10000000 /oracle-work/jdk-fork2/open/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java` Now that we know that it is a regression of a backed-out change, we can enable the MemLimit check again. 
Also the stack-traces were related to CastII code, and looked very similar to those in another duplicate: [JDK-8332765](https://bugs.openjdk.org/browse/JDK-8332765): Test compiler/loopopts/superword/TestAlignVectorFuzzer.java still times out after JDK-8327978 ------------- Commit messages: - JDK-8332537 Changes: https://git.openjdk.org/jdk/pull/19580/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19580&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8332537 Stats: 4 lines in 1 file changed: 0 ins; 4 del; 0 mod Patch: https://git.openjdk.org/jdk/pull/19580.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19580/head:pull/19580 PR: https://git.openjdk.org/jdk/pull/19580 From kvn at openjdk.org Thu Jun 6 14:47:53 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 14:47:53 GMT Subject: RFR: 8332537: C2: High memory usage reported for compiler/loopopts/superword/TestAlignVectorFuzzer.java In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 14:22:48 GMT, Emanuel Peter wrote: > This bug was a regression of: > [JDK-8324517](https://bugs.openjdk.org/browse/JDK-8324517); C2: crash in compiled code because of dependency on removed range check CastIIs > > That change was backed out with: > [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs > > I `git revert` ed the BACKOUT change: > [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs > > And only then this reproduced (not on master, only with Roland's CastII regression code): > `~/Documents/jtreg/bin/jtreg -va -s -jdk:/oracle-work/jdk-fork2/build/linux-x64-debug/jdk -javaoptions:"-Djdk.test.lib.random.seed=3249981201344669190" -J-Djavatest.maxOutputSize=10000000 /oracle-work/jdk-fork2/open/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java` > > Now that we know that 
it is a regression of a backed-out change, we can enable the MemLimit check again. > > Also the stack-traces were related to CastII code, and looked very similar to those in another duplicate: > [JDK-8332765](https://bugs.openjdk.org/browse/JDK-8332765): Test compiler/loopopts/superword/TestAlignVectorFuzzer.java still times out after JDK-8327978 Good and trivial. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19580#pullrequestreview-2102195080 From thartmann at openjdk.org Thu Jun 6 14:52:42 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Thu, 6 Jun 2024 14:52:42 GMT Subject: RFR: 8332537: C2: High memory usage reported for compiler/loopopts/superword/TestAlignVectorFuzzer.java In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 14:22:48 GMT, Emanuel Peter wrote: > This bug was a regression of: > [JDK-8324517](https://bugs.openjdk.org/browse/JDK-8324517): C2: crash in compiled code because of dependency on removed range check CastIIs > > That change was backed out with: > [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs > > I `git revert`ed the BACKOUT change: > [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs > > And only then this reproduced (not on master, only with Roland's CastII regression code): > `~/Documents/jtreg/bin/jtreg -va -s -jdk:/oracle-work/jdk-fork2/build/linux-x64-debug/jdk -javaoptions:"-Djdk.test.lib.random.seed=3249981201344669190" -J-Djavatest.maxOutputSize=10000000 /oracle-work/jdk-fork2/open/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java` > > Now that we know that it is a regression of a backed-out change, we can enable the MemLimit check again. 
> > Also the stack-traces were related to CastII code, and looked very similar to those in another duplicate: > [JDK-8332765](https://bugs.openjdk.org/browse/JDK-8332765): Test compiler/loopopts/superword/TestAlignVectorFuzzer.java still times out after JDK-8327978 Marked as reviewed by thartmann (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19580#pullrequestreview-2102219215 From galder at openjdk.org Thu Jun 6 15:04:51 2024 From: galder at openjdk.org (Galder =?UTF-8?B?WmFtYXJyZcOxbw==?=) Date: Thu, 6 Jun 2024 15:04:51 GMT Subject: RFR: 8332670: C1 clone intrinsic needs memory barriers [v2] In-Reply-To: <42Nk3qrMqj-R_Q2IqDEuY_kkCM0YPCj6M0qyLES3r_w=.fbbe0112-aa66-45b8-affe-fe6dbd87f548@github.com> References: <42Nk3qrMqj-R_Q2IqDEuY_kkCM0YPCj6M0qyLES3r_w=.fbbe0112-aa66-45b8-affe-fe6dbd87f548@github.com> Message-ID: On Thu, 6 Jun 2024 14:41:20 GMT, Martin Doerr wrote: > > @TheRealMDoerr I removed it because I had just noticed the test failure in CI, looking into it. > > The same test has failed in one of my PRs. I don't think it is related. Can be ignored. Ah ok. I've tried to replicate it on a linux env but didn't have luck. I'll go ahead and try to integrate. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19538#issuecomment-2152758788 From galder at openjdk.org Thu Jun 6 15:04:52 2024 From: galder at openjdk.org (Galder =?UTF-8?B?WmFtYXJyZcOxbw==?=) Date: Thu, 6 Jun 2024 15:04:52 GMT Subject: Integrated: 8332670: C1 clone intrinsic needs memory barriers In-Reply-To: References: Message-ID: <8tYk9qb9sPaAsXhRUH4NhwhMAeHH27ctKU6QIyz_1Ec=.b13532be-3415-4b98-a6d9-56753ee53945@github.com> On Tue, 4 Jun 2024 08:10:59 GMT, Galder Zamarreño wrote: > Adds a storestore barrier after copying the contents in the primitive array intrinsic (credit @shipilev). The barrier is a no-op in platforms where not needed so no need for an ifdef. > > The barrier after new array creation is only added if zeroing the array on aarch64 (credit @dean-long). 
Since the primitive array clone intrinsic does not zero the array, that means there's a single barrier added for this use case. > > There's no barrier added on x86 c1 macro assembler because there is nothing to do there. > > I've run the following tests: > * tier 1 on darwin/aarch64 > * tier 1 on linux/x86_64 > * `hotspot_compiler` tests on darwin/aarch64 > * `copy.clone.arrays` jcstress tests on darwin/aarch64. > > I tried but was unable to create a standalone test for the jdk source tree that would fail. > > FYI @bulasevich @TheRealMDoerr @RealFYang @RealLucy similar platform specific c1 macro assembler changes might be required for other platforms. This pull request has now been integrated. Changeset: 606df441 Author: Galder Zamarreño Committer: Martin Doerr URL: https://git.openjdk.org/jdk/commit/606df441410a69034b4c113e85ce21937d1a0808 Stats: 1 line in 1 file changed: 1 ins; 0 del; 0 mod 8332670: C1 clone intrinsic needs memory barriers Reviewed-by: shade, thartmann, aph ------------- PR: https://git.openjdk.org/jdk/pull/19538 From mdoerr at openjdk.org Thu Jun 6 15:06:48 2024 From: mdoerr at openjdk.org (Martin Doerr) Date: Thu, 6 Jun 2024 15:06:48 GMT Subject: RFR: 8331935: Add support for primitive array C1 clone intrinsic in PPC [v5] In-Reply-To: References: Message-ID: On Mon, 3 Jun 2024 05:36:12 GMT, Varada M wrote: >> https://bugs.openjdk.org/browse/JDK-8302850 port for PPC64 >> >> JMH Benchmark Results >> >> >> Before : >> >> Benchmark (size) Mode Cnt Score Error Units >> ArrayClone.byteArraycopy 0 avgt 15 114.107 ± 1.337 ns/op >> ArrayClone.byteArraycopy 10 avgt 15 130.492 ± 0.991 ns/op >> ArrayClone.byteArraycopy 100 avgt 15 139.103 ± 1.913 ns/op >> ArrayClone.byteArraycopy 1000 avgt 15 321.688 ± 6.033 ns/op >> ArrayClone.byteClone 0 avgt 15 227.602 ± 3.393 ns/op >> ArrayClone.byteClone 10 avgt 15 237.624 ± 2.996 ns/op >> ArrayClone.byteClone 100 avgt 15 239.219 ± 2.835 ns/op >> >> ArrayClone.byteClone 1000 avgt 15 355.571 ± 
2.946 ns/op >> ArrayClone.intArraycopy 0 avgt 15 113.275 ± 1.099 ns/op >> ArrayClone.intArraycopy 10 avgt 15 129.763 ± 1.458 ns/op >> ArrayClone.intArraycopy 100 avgt 15 213.327 ± 2.524 ns/op >> ArrayClone.intArraycopy 1000 avgt 15 449.650 ± 7.338 ns/op >> ArrayClone.intClone 0 avgt 15 225.682 ± 3.048 ns/op >> ArrayClone.intClone 10 avgt 15 234.532 ± 2.817 ns/op >> ArrayClone.intClone 100 avgt 15 295.934 ± 4.925 ns/op >> ArrayClone.intClone 1000 avgt 15 573.368 ± 5.739 ns/op >> Finished running test 'micro:java.lang.ArrayClone' >> Test report is stored in build/aix-ppc64-server-release/test-results/micro_java_lang_ArrayClone >> >> ============================== >> Test summary >> ============================== >> TEST TOTAL PASS FAIL ERROR >> micro:java.lang.ArrayClone 1 1 0 0 >> ============================== >> TEST SUCCESS >> >> Finished building target 'test' in configuration 'aix-ppc64-server-release' >> >> >> >> >> After: >> >> Benchmark (size) Mode Cnt Score Error Units >> ArrayClone.byteArraycopy 0 avgt 15 113.894 ± 0.993 ns/op >> ArrayClone.byteArraycopy 10 avgt 15 131.455 ± 0.956 ns/op >> ArrayClone.byteArraycopy 100 avgt 15 139.145 ± 3.002 ns/op >> ArrayClone.byteArraycopy 1000 avgt 15 315.957 ± 14.591 ns/op >> ArrayClone.byteClone 0 avgt 15 43.753 ± 3.669 ns/op >> ArrayClone.byteClone 10 avgt 15 52.329 ± 1.041 ns/op >> ArrayClone.byteClone 100 avgt 15 127.711 ± 3.938 ns/op >> >> ArrayClone.byteClone 1000 avgt 15 225.937 ± 1.987 ns/op >> Arr... > > Varada M has updated the pull request with a new target base due to a merge or a rebase. 
The pull request now contains six commits: > > - Merge branch 'master' into arryClone > - Merge branch 'master' into arryClone > - Add support for primitive array C1 clone intrinsic > - Add support for primitive array C1 clone intrinsic > - Add support for primitive array C1 clone intrinsic > - Add support for primitive array C1 clone intrinsic https://github.com/openjdk/jdk/pull/19538 is integrated, so we can ship this one, too. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19250#issuecomment-2152767272 From simonis at openjdk.org Thu Jun 6 15:21:44 2024 From: simonis at openjdk.org (Volker Simonis) Date: Thu, 6 Jun 2024 15:21:44 GMT Subject: RFR: 8333722: Fix CompilerDirectives for non-compiler JVM variants In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 14:24:53 GMT, Daniel D. Daugherty wrote: > I fixed the typo in the bug's synopsis. The easiest way to update this PR to match is: `/issue JDK-8333722` Thanks @dcubed-ojdk ! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19578#issuecomment-2152799219 From epeter at openjdk.org Thu Jun 6 15:24:50 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 15:24:50 GMT Subject: RFR: 8333647: C2 SuperWord: some additional PopulateIndex tests [v2] In-Reply-To: References: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> Message-ID: On Thu, 6 Jun 2024 13:48:57 GMT, Vladimir Kozlov wrote: >> Emanuel Peter has updated the pull request incrementally with one additional commit since the last revision: >> >> copyright > > Good. Thanks @vnkozlov @chhagedorn for the reviews! @vnkozlov said I can integrate before the fork, since this is only test changes. I'll do that. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19558#issuecomment-2152801272 From epeter at openjdk.org Thu Jun 6 15:24:51 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Thu, 6 Jun 2024 15:24:51 GMT Subject: Integrated: 8333647: C2 SuperWord: some additional PopulateIndex tests In-Reply-To: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> References: <9YKc1ChKG2imh-5gFgL1QHhSOYO5T0BBlRCbJgAIXXU=.b2c8cd72-f1b2-43cd-99f4-9334ac1ae8d3@github.com> Message-ID: On Wed, 5 Jun 2024 13:06:10 GMT, Emanuel Peter wrote: > When I did the deep refactoring in https://github.com/openjdk/jdk/pull/19261, I wanted some more tests for `PopulateIndex`. I push them separately to keep the other RFE smaller. > > I filed a follow-up RFE for some cases that do not vectorize: https://bugs.openjdk.org/browse/JDK-8332878 This pull request has now been integrated. Changeset: 487c4771 Author: Emanuel Peter URL: https://git.openjdk.org/jdk/commit/487c4771818999749bfd507ab85777795bba0832 Stats: 99 lines in 2 files changed: 99 ins; 0 del; 0 mod 8333647: C2 SuperWord: some additional PopulateIndex tests Reviewed-by: kvn, chagedorn ------------- PR: https://git.openjdk.org/jdk/pull/19558 From stuefe at openjdk.org Thu Jun 6 15:27:45 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Thu, 6 Jun 2024 15:27:45 GMT Subject: RFR: 8332537: C2: High memory usage reported for compiler/loopopts/superword/TestAlignVectorFuzzer.java In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 14:22:48 GMT, Emanuel Peter wrote: > This bug was a regression of: > [JDK-8324517](https://bugs.openjdk.org/browse/JDK-8324517); C2: crash in compiled code because of dependency on removed range check CastIIs > > That change was backed out with: > [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs > > I `git revert` ed the BACKOUT change: > 
[JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs > > And only then this reproduced (not on master, only with Roland's CastII regression code): > `~/Documents/jtreg/bin/jtreg -va -s -jdk:/oracle-work/jdk-fork2/build/linux-x64-debug/jdk -javaoptions:"-Djdk.test.lib.random.seed=3249981201344669190" -J-Djavatest.maxOutputSize=10000000 /oracle-work/jdk-fork2/open/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java` > > Now that we know that it is a regression of a backed-out change, we can enable the MemLimit check again. > > Also the stack-traces were related to CastII code, and looked very similar to those in another duplicate: > [JDK-8332765](https://bugs.openjdk.org/browse/JDK-8332765): Test compiler/loopopts/superword/TestAlignVectorFuzzer.java still times out after JDK-8327978 Thanks for looking into this. ------------- Marked as reviewed by stuefe (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19580#pullrequestreview-2102304515 From kvn at openjdk.org Thu Jun 6 15:32:44 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 15:32:44 GMT Subject: RFR: 8333722: Fix CompilerDirectives for non-compiler JVM variants In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 13:36:25 GMT, Volker Simonis wrote: > `DirectivesStack::getMatchingDirective()` relies on the fact that the default directives set is always enabled. 
And that's indeed the case for normal builds with C1 and C2 compilers (see `DirectivesStack::init()` in `compilerDirectives.cpp`): > > > // Create a new dirstack and push a default directive > void DirectivesStack::init() { > CompilerDirectives* _default_directives = new CompilerDirectives(); > char str[] = "*.*"; > const char* error_msg = nullptr; > _default_directives->add_match(str, error_msg); > #if defined(COMPILER1) || INCLUDE_JVMCI > _default_directives->_c1_store->EnableOption = true; > #endif > #ifdef COMPILER2 > if (CompilerConfig::is_c2_enabled()) { > _default_directives->_c2_store->EnableOption = true; > } > #endif > assert(error_msg == nullptr, "Must succeed."); > push(_default_directives); > } > > > However, if we're building a JVM configuration without compilers (e.g. `--with-jvm-variants=core`), this is not the case and `DirectivesStack::getMatchingDirective()` will return the base directive set without incrementing the reference count of its directive: > > > CompilerDirectives* dir = _top; > assert(dir != nullptr, "Must be initialized"); > > while (dir != nullptr) { > if (dir->is_default_directive() || dir->match(method)) { > match = dir->get_for(comp); > assert(match != nullptr, "Consistency"); > if (match->EnableOption) { > // The directiveSet for this compile is also enabled -> success > dir->inc_refcount(); > break; > } > } > dir = dir->next(); > } > } > guarantee(match != nullptr, "There should always be a default directive that matches"); > > // Check for legacy compile commands update, without DirectivesStack_lock > return match->compilecommand_compatibility_init(method); > > > If this directive set will be released, it will delete the corresponding base directive and subsequent usages of the base directive will lead to a segmentation fault. 
> > After [JDK-8329421: Native methods can not be selectively printed](https://bugs.openjdk.org/browse/JDK-8329421) which replaced the call to > > DirectiveSet* directive = DirectivesStack::getDefaultDirective(CompileBroker::compiler(CompLevel_simple)); > > by > > DirectiveSet* directive = DirectivesStack::getMatchingDirective(method, CompileBroker::compiler(CompLevel_simple)); > > in `sharedRuntime.cpp` this issue is now triggered at JVM startup for non-compiler configurations when native wrappers are generated (see https://github.com/openjdk/jdk/pull/18567#... Okay. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19578#pullrequestreview-2102318374 From aph at openjdk.org Thu Jun 6 15:41:58 2024 From: aph at openjdk.org (Andrew Haley) Date: Thu, 6 Jun 2024 15:41:58 GMT Subject: RFR: 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug" [v3] In-Reply-To: References: <16J-lJ2AceGTVcRWBcP15yKcwO-1IA1XsngyOuNjf7k=.0776f081-ae2c-4279-87cf-d909806c2bc4@github.com> Message-ID: On Wed, 29 May 2024 08:46:51 GMT, Fei Gao wrote: >> On LP64 systems, if the heap can be moved into low virtual address space (below 4GB) and the heap size is smaller than the interesting threshold of 4 GB, we can use unscaled decoding pattern for narrow klass decoding. It means that a generic field reference can be decoded by: >> >> cast<64> (32-bit compressed reference) + field_offset >> >> >> When the `field_offset` is an immediate, on aarch64 platform, the unscaled decoding pattern can match perfectly with a direct addressing mode, i.e., `base_plus_offset`, supported by `LDR/STR` instructions. But for certain data width, not all immediates can be encoded in the instruction field of `LDR/STR` [[1]](https://github.com/openjdk/jdk/blob/8db7bad992a0f31de9c7e00c2657c18670539102/src/hotspot/cpu/aarch64/assembler_aarch64.inline.hpp#L33). The ranges are different as data widths vary. 
>> >> For example, when we try to load a value of long type at offset of `1030`, the address expression is `(AddP (DecodeN base) 1030)`. Before the patch, the expression was matching with `operand indOffIN()`. But, for 64-bit `LDR/STR`, signed immediate byte offset must be in the range -256 to 255 or positive immediate byte offset must be a multiple of 8 in the range 0 to 32760 [[2]](https://developer.arm.com/documentation/ddi0602/2023-09/Base-Instructions/LDR--immediate---Load-Register--immediate--?lang=en). `1030` can't be encoded in the instruction field. So, after matching, when we do checking for instruction encoding, the assertion would fail. >> >> In this patch, we're going to filter out invalid immediates when deciding if current addressing mode can be matched as `base_plus_offset`. We introduce `indOffIN4/indOffLN4` and `indOffIN8/indOffLN8` for 32-bit data type and 64-bit data type separately in the patch. E.g., for `memory4`, we remove the generic `indOffIN/indOffLN`, which matches wrong unscaled immediate range, and replace them with `indOffIN4/indOffLN4` instead. >> >> Since 8-bit and 16-bit `LDR/STR` instructions also support the unscaled decoding pattern, we add the addressing mode in the lists of `memory1` and `memory2` by introducing `indOffIN1/indOffLN1` and `indOffIN2/indOffLN2`. >> >> We also remove unused operands `indOffI/indOffl/indOffIN/indOffLN` to avoid misuse. >> >> Tier 1-3 passed on aarch64. > > Fei Gao has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. 
The pull request contains five additional commits since the last revision: > > - Add the assertion back and merge matchrules with a better predicate > - Merge branch 'master' into fg8319690 > - Remove unused immIOffset/immLOffset > - Merge branch 'master' into fg8319690 > - 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug" > > On LP64 systems, if the heap can be moved into low virtual > address space (below 4GB) and the heap size is smaller than the > interesting threshold of 4 GB, we can use unscaled decoding > pattern for narrow klass decoding. It means that a generic field > reference can be decoded by: > ``` > cast<64> (32-bit compressed reference) + field_offset > ``` > > When the `field_offset` is an immediate, on aarch64 platform, the > unscaled decoding pattern can match perfectly with a direct > addressing mode, i.e., `base_plus_offset`, supported by LDR/STR > instructions. But for certain data width, not all immediates can > be encoded in the instruction field of LDR/STR[1]. The ranges are > different as data widths vary. > > For example, when we try to load a value of long type at offset of > `1030`, the address expression is `(AddP (DecodeN base) 1030)`. > Before the patch, the expression was matching with > `operand indOffIN()`. But, for 64-bit LDR/STR, signed immediate > byte offset must be in the range -256 to 255 or positive immediate > byte offset must be a multiple of 8 in the range 0 to 32760[2]. > `1030` can't be encoded in the instruction field. So, after > matching, when we do checking for instruction encoding, the > assertion would fail. > > In this patch, we're going to filter out invalid immediates > when deciding if current addressing mode can be matched as > `base_plus_offset`. We introduce `indOffIN4/indOffLN4` and > `indOffIN8/indOffLN8` for 32-bit data type and 64-bit data > type separately in the patch. 
E.g., for `memory4`, we remove > the generic `indOffIN/indOffLN`, which matches wrong unscaled > immediate range, and replace them with `indOffIN4/indOffLN4` > instead. > > Since 8-bit and 16-bit LDR/STR instructions also support the > unscaled decoding pattern, we add the addressing mode in the > lists of `memory1` and `memory2` by introducing > `indOffIN1/indOffLN1` and `indOffIN2/... On 6/6/24 13:42, Fei Gao wrote: > Sorry, did you mean loading from base plus offset, like `ldr x0, [x6, > #8]` or `ldr x0, [x6, x7]`, takes one more cycle than loading from base > register only, like `ldr x0, [x6]`? Does the address addition take one > cycle? We know that, on many Arm cores, Store µOPs are split into address and data µOPs which are executed separately. That doesn't usually cause any additional delay, because cores execute many operations in parallel, so an address generation µOP for base+offset very probably will execute in parallel with some previous instructions, meaning that the target address is ready before it is needed. This split of address generation must happen regardless of whether a store (or a load) is a single instruction `str x0, [x1, #80]` or a pair of instructions `add x8, x1, #80; str x0, [x8]`. Of course, a pair of instructions occupies twice as much icache space, and you can run out of instruction decode bandwidth. However, in the case of Unsafe operations, I don't believe that an occasional unnecessary two-instruction operation will result in a performance regression. ------------- PR Comment: https://git.openjdk.org/jdk/pull/16991#issuecomment-2152841468 From simonis at openjdk.org Thu Jun 6 15:49:43 2024 From: simonis at openjdk.org (Volker Simonis) Date: Thu, 6 Jun 2024 15:49:43 GMT Subject: RFR: 8333722: Fix CompilerDirectives for non-compiler JVM variants In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 15:30:36 GMT, Vladimir Kozlov wrote: > Okay. Thanks @vnkozlov ! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19578#issuecomment-2152857816 From sgibbons at openjdk.org Thu Jun 6 17:44:05 2024 From: sgibbons at openjdk.org (Scott Gibbons) Date: Thu, 6 Jun 2024 17:44:05 GMT Subject: RFR: 8320448: Accelerate IndexOf using AVX2 [v52] In-Reply-To: References: Message-ID: On Thu, 30 May 2024 16:16:45 GMT, Scott Gibbons wrote: >> Re-write the IndexOf code without the use of the pcmpestri instruction, only using AVX2 instructions. This change accelerates String.IndexOf on average 1.3x for AVX2. The benchmark numbers: >> >> >> Benchmark Score Latest >> StringIndexOf.advancedWithMediumSub 343.573 317.934 0.925375393x >> StringIndexOf.advancedWithShortSub1 1039.081 1053.96 1.014319384x >> StringIndexOf.advancedWithShortSub2 55.828 110.541 1.980027943x >> StringIndexOf.constantPattern 9.361 11.906 1.271872663x >> StringIndexOf.searchCharLongSuccess 4.216 4.218 1.000474383x >> StringIndexOf.searchCharMediumSuccess 3.133 3.216 1.02649218x >> StringIndexOf.searchCharShortSuccess 3.76 3.761 1.000265957x >> StringIndexOf.success 9.186 9.713 1.057369911x >> StringIndexOf.successBig 14.341 46.343 3.231504079x >> StringIndexOfChar.latin1_AVX2_String 6220.918 12154.52 1.953814533x >> StringIndexOfChar.latin1_AVX2_char 5503.556 5540.044 1.006629895x >> StringIndexOfChar.latin1_SSE4_String 6978.854 6818.689 0.977049957x >> StringIndexOfChar.latin1_SSE4_char 5657.499 5474.624 0.967675646x >> StringIndexOfChar.latin1_Short_String 7132.541 6863.359 0.962260014x >> StringIndexOfChar.latin1_Short_char 16013.389 16162.437 1.009307711x >> StringIndexOfChar.latin1_mixed_String 7386.123 14771.622 1.999915517x >> StringIndexOfChar.latin1_mixed_char 9901.671 9782.245 0.987938803 > > Scott Gibbons has updated the pull request incrementally with one additional commit since the last revision: > > Fix copyright & a couple of comment typos Hi, everyone. I see that JDK 23 has now been forked, and new commits go into the JDK 24 branch. 
I would like to get this in as soon as possible to have as much time with fuzzers, etc. for everyone to be confident in the code. I have 3 positive reviews on this PR and would like to integrate. Please reply as soon as you reasonably can with objections or approval and I will integrate. Thanks. ------------- PR Comment: https://git.openjdk.org/jdk/pull/16753#issuecomment-2153072708 From liach at openjdk.org Thu Jun 6 17:51:44 2024 From: liach at openjdk.org (Chen Liang) Date: Thu, 6 Jun 2024 17:51:44 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 07:47:46 GMT, SendaoYan wrote: > Hi all, > This PR removes several extra empty spaces and extra empty lines in several Makefiles. It's a trivial fix, no risk. > > Thanks. Changes requested by liach (Author). test/jdk/java/rmi/reliability/benchmark/bench/rmi/Makefile line 1: > 1: # This file change is dubious: 1. It does not have any trailing whitespace that can fail the skara checks. 2. If the duplicate blank lines at the end of this Makefile are indeed problematic (as fixed here), please fix the only other occurrence in the JDK, which is the Makefile in the parent directory. (Checked with `\n$^\n$\Z` pattern in all Makefiles) Recommended actions: Either 1. Revert changes in this file; or 2. Also update `test/jdk/java/rmi/reliability/benchmark/bench/Makefile` to remove the trailing blank line.
------------- PR Review: https://git.openjdk.org/jdk/pull/19537#pullrequestreview-2102735910 PR Review Comment: https://git.openjdk.org/jdk/pull/19537#discussion_r1629981196 From szaldana at openjdk.org Thu Jun 6 17:58:56 2024 From: szaldana at openjdk.org (Sonia Zaldana Calles) Date: Thu, 6 Jun 2024 17:58:56 GMT Subject: RFR: 8327240: Obsolete Tier2CompileThreshold/Tier2BackEdgeThreshold product flags [v4] In-Reply-To: <86N-93rC4Q2Q1d_YQSARfjQAHNNCEMvCXMq0_fk5A48=.9c621bb8-b724-40fb-afd7-835773a0e942@github.com> References: <86N-93rC4Q2Q1d_YQSARfjQAHNNCEMvCXMq0_fk5A48=.9c621bb8-b724-40fb-afd7-835773a0e942@github.com> Message-ID: > Hi all, > > This PR removes the unused options ```Tier2CompileThreshold``` and ```Tier2BackEdgeThreshold```. > > Testing: > - [x] Verified warning is issued as support was removed. > > Thanks, > Sonia Sonia Zaldana Calles has updated the pull request with a new target base due to a merge or a rebase. The pull request now contains six commits: - Merge branch 'openjdk:master' into JDK-8327240 - formatting - Merge master - Adding to obsolete list - Deleting usage of flag in test - 8327240: Remove unused Tier2CompileThreshold/Tier2BackEdgeThreshold product flags ------------- Changes: https://git.openjdk.org/jdk/pull/18904/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=18904&range=03 Stats: 11 lines in 3 files changed: 2 ins; 8 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/18904.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/18904/head:pull/18904 PR: https://git.openjdk.org/jdk/pull/18904 From kvn at openjdk.org Thu Jun 6 18:28:07 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 18:28:07 GMT Subject: RFR: 8320448: Accelerate IndexOf using AVX2 [v52] In-Reply-To: References: Message-ID: On Thu, 30 May 2024 16:16:45 GMT, Scott Gibbons wrote: >> Re-write the IndexOf code without the use of the pcmpestri instruction, only using AVX2 instructions. 
This change accelerates String.IndexOf on average 1.3x for AVX2. The benchmark numbers: >> >> >> Benchmark Score Latest >> StringIndexOf.advancedWithMediumSub 343.573 317.934 0.925375393x >> StringIndexOf.advancedWithShortSub1 1039.081 1053.96 1.014319384x >> StringIndexOf.advancedWithShortSub2 55.828 110.541 1.980027943x >> StringIndexOf.constantPattern 9.361 11.906 1.271872663x >> StringIndexOf.searchCharLongSuccess 4.216 4.218 1.000474383x >> StringIndexOf.searchCharMediumSuccess 3.133 3.216 1.02649218x >> StringIndexOf.searchCharShortSuccess 3.76 3.761 1.000265957x >> StringIndexOf.success 9.186 9.713 1.057369911x >> StringIndexOf.successBig 14.341 46.343 3.231504079x >> StringIndexOfChar.latin1_AVX2_String 6220.918 12154.52 1.953814533x >> StringIndexOfChar.latin1_AVX2_char 5503.556 5540.044 1.006629895x >> StringIndexOfChar.latin1_SSE4_String 6978.854 6818.689 0.977049957x >> StringIndexOfChar.latin1_SSE4_char 5657.499 5474.624 0.967675646x >> StringIndexOfChar.latin1_Short_String 7132.541 6863.359 0.962260014x >> StringIndexOfChar.latin1_Short_char 16013.389 16162.437 1.009307711x >> StringIndexOfChar.latin1_mixed_String 7386.123 14771.622 1.999915517x >> StringIndexOfChar.latin1_mixed_char 9901.671 9782.245 0.987938803 > > Scott Gibbons has updated the pull request incrementally with one additional commit since the last revision: > > Fix copyright & a couple of comment typos Let me do quick testing with latest mainline (JDK 24 now). 
------------- PR Comment: https://git.openjdk.org/jdk/pull/16753#issuecomment-2153142794 From jbhateja at openjdk.org Thu Jun 6 19:31:03 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Thu, 6 Jun 2024 19:31:03 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v2] In-Reply-To: References: Message-ID: > Summary of changes include with the patch:- > > 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) > 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. > > Kindly review and share your feedback. > > Best Regards, > Jatin Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: Review comments resolutions. ------------- Changes: - all: https://git.openjdk.org/jdk/pull/18562/files - new: https://git.openjdk.org/jdk/pull/18562/files/0881e43c..b5da0938 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=18562&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=18562&range=00-01 Stats: 34 lines in 3 files changed: 14 ins; 3 del; 17 mod Patch: https://git.openjdk.org/jdk/pull/18562.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/18562/head:pull/18562 PR: https://git.openjdk.org/jdk/pull/18562 From jbhateja at openjdk.org Thu Jun 6 19:31:03 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Thu, 6 Jun 2024 19:31:03 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) In-Reply-To: References: Message-ID: <-A6ONIqT2KzcT9yycwfiA2sBVevnWknhyvIRRysV6mU=.dbc05862-d8b1-4022-93b2-99646095bd89@github.com> On Mon, 1 Apr 2024 12:01:27 GMT, Jatin Bhateja wrote: > Summary of changes include with the patch:- > > 1) 
CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) > 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. > > Kindly review and share your feedback. > > Best Regards, > Jatin Hi @vnkozlov , Please let us know if its good to land in 23. ------------- PR Comment: https://git.openjdk.org/jdk/pull/18562#issuecomment-2153258321 From jbhateja at openjdk.org Thu Jun 6 19:31:04 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Thu, 6 Jun 2024 19:31:04 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v2] In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 23:58:50 GMT, Sandhya Viswanathan wrote: >> Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: >> >> Review comments resolutions. > > src/hotspot/cpu/x86/vm_version_x86.cpp line 113: > >> 111: VM_Version_StubGenerator(CodeBuffer *c) : StubCodeGenerator(c) {} >> 112: >> 113: address clear_apx_test_state() { > > Why do we need to clear_apx_test_state? r16 onwards are not callee saved. And checking r15 save/restore is not needed so we could remove r15 changes altogether. Yes, EGPRs are call clobbered registers, but here we are trying to ascertain if their values are preserved across signal handling. Explicit clearing of r16 and r31 during signal handling guarantees that preserved register values post signal handling were re-instantiated by operating system and not because they were not modified externally. > src/hotspot/cpu/x86/vm_version_x86.cpp line 447: > >> 445: /* FIXME: Uncomment after integration of JDK-8328998 >> 446: __ mov64(rax, VM_Version::egpr_test_value()); >> 447: __ cmpq(rax, r15); > > Likewise r15 validation can be removed. 
r15 validation showed contrasting results in comparison to r16 currently, But its fair enough to remove it. DONE > src/hotspot/cpu/x86/vm_version_x86.cpp line 456: > >> 454: // Generate SEGV to signal unsuccessful save/restore. >> 455: __ bind(apx_save_restore_error); >> 456: __ lea(rax, ExternalAddress(VM_Version::_apx_state_restore_error_handler)); > > Generating an error message here won't be the right thing (especially since this is default by feature detection). It should only result in setting UseAPX feature to false. DONE ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630107399 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630107493 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630108400 From kvn at openjdk.org Thu Jun 6 19:45:45 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 19:45:45 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) In-Reply-To: <-A6ONIqT2KzcT9yycwfiA2sBVevnWknhyvIRRysV6mU=.dbc05862-d8b1-4022-93b2-99646095bd89@github.com> References: <-A6ONIqT2KzcT9yycwfiA2sBVevnWknhyvIRRysV6mU=.dbc05862-d8b1-4022-93b2-99646095bd89@github.com> Message-ID: On Thu, 6 Jun 2024 19:27:47 GMT, Jatin Bhateja wrote: > Hi @vnkozlov , Please let us know if its good to land in 23. No, I don't see the urgency. We need extensive testing that everything works with APX. It is actually good time to push it into JDK 24 to have long testing period before next release. Let us review it and test before integrating. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/18562#issuecomment-2153275647 From kvn at openjdk.org Thu Jun 6 19:45:46 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 19:45:46 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v2] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 19:31:03 GMT, Jatin Bhateja wrote: >> Summary of changes include with the patch:- >> >> 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) >> 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review comments resolutions. Actually we can't even fully test it until VM start using all registers provided by APX. And we don't have HW currently. ------------- PR Comment: https://git.openjdk.org/jdk/pull/18562#issuecomment-2153278275 PR Comment: https://git.openjdk.org/jdk/pull/18562#issuecomment-2153280371 From mli at openjdk.org Thu Jun 6 20:14:28 2024 From: mli at openjdk.org (Hamlin Li) Date: Thu, 6 Jun 2024 20:14:28 GMT Subject: RFR: 8333649: Allow different NativeCall encodings In-Reply-To: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> References: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> Message-ID: On Wed, 5 Jun 2024 12:35:47 GMT, Robbin Ehn wrote: > Hi all, please consider! > > We want to have different selectable NativeCalls. > These are not the same size, shared code should query instead of using the enum directly. 
> > Sanity build and tested RV/x86, hoping GHA will catch anything else. > > Thanks, Robbin Marked as reviewed by mli (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19556#pullrequestreview-2103118794 From mli at openjdk.org Thu Jun 6 20:22:13 2024 From: mli at openjdk.org (Hamlin Li) Date: Thu, 6 Jun 2024 20:22:13 GMT Subject: RFR: 8333649: Allow different NativeCall encodings In-Reply-To: References: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> Message-ID: On Wed, 5 Jun 2024 20:11:32 GMT, Dean Long wrote: > Do you want to make the enum private so it can't be accessed directly? I guess it's fine, as it's not changable on other platforms exception on riscv, and on riscv subsequent pr could remove the `instruction_size` definition in class NativeCall, and only have the method `byte_size`? ------------- PR Comment: https://git.openjdk.org/jdk/pull/19556#issuecomment-2153341095 From kvn at openjdk.org Thu Jun 6 20:31:19 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 20:31:19 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v2] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 19:31:03 GMT, Jatin Bhateja wrote: >> Summary of changes include with the patch:- >> >> 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) >> 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review comments resolutions. 
A few comments. src/hotspot/cpu/x86/vm_version_x86.cpp line 1052: > 1050: > 1051: // Currently APX support is only enabled for targets supporting AVX512VL feature. > 1052: if (UseAPX && (!supports_apx_f() || !supports_avx512vl())) { This code should be after UseAVX checks. src/hotspot/cpu/x86/vm_version_x86.cpp line 1062: > 1060: if (UseAVX < 2) { > 1061: _features &= ~CPU_AVX2; > 1062: _features &= ~CPU_AVX_IFMA; Since the value of UseAVX affects avx512vl it should affect UseAPX/CPU_APX_F too. src/hotspot/cpu/x86/vm_version_x86.hpp line 337: > 335: static address _cpuinfo_cont_addr; // address of instruction after the one which causes SEGV > 336: static address _cpuinfo_segv_addr_apx; // address of instruction which causes APX specific SEGV > 337: static address _cpuinfo_cont_addr_apx; // address of instruction which causes APX specific SEGV Duplicated comment. It should be the continuation address comment. ------------- PR Review: https://git.openjdk.org/jdk/pull/18562#pullrequestreview-2103120144 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630190641 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630185873 PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630187773 From kvn at openjdk.org Thu Jun 6 20:31:20 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 20:31:20 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v2] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 19:25:02 GMT, Jatin Bhateja wrote: >> src/hotspot/cpu/x86/vm_version_x86.cpp line 113: >> >>> 111: VM_Version_StubGenerator(CodeBuffer *c) : StubCodeGenerator(c) {} >>> 112: >>> 113: address clear_apx_test_state() { >> >> Why do we need to clear_apx_test_state? r16 onwards are not callee saved. And checking r15 save/restore is not needed so we could remove r15 changes altogether.
> > Yes, EGPRs are call clobbered registers, but here we are trying to ascertain if their values are preserved across signal handling. Explicit clearing of r16 and r31 during signal handling guarantees that preserved register values post signal handling were re-instantiated by operating system and not because they were not modified externally. Please, add comment about that. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630170720 From kvn at openjdk.org Thu Jun 6 20:40:28 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Thu, 6 Jun 2024 20:40:28 GMT Subject: RFR: 8320448: Accelerate IndexOf using AVX2 [v52] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 17:41:20 GMT, Scott Gibbons wrote: >> Scott Gibbons has updated the pull request incrementally with one additional commit since the last revision: >> >> Fix copyright & a couple of comment typos > > Hi, everyone. I see that JDK 23 has now been forked, and new commits go into the JDK 24 branch. I would like to get this in as soon as possible to have as much time with fuzzers, etc. for everyone to be confident in the code. > > I have 3 positive reviews on this PR and would like to integrate. Please reply as soon as you reasonably can with objections or approval and I will integrate. Thanks. @asgibbons, my testing almost finished. No new failures. I think this can be pushed now. Thank you for waiting! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/16753#issuecomment-2153366787 From sgibbons at openjdk.org Thu Jun 6 21:47:26 2024 From: sgibbons at openjdk.org (Scott Gibbons) Date: Thu, 6 Jun 2024 21:47:26 GMT Subject: RFR: 8320448: Accelerate IndexOf using AVX2 [v49] In-Reply-To: <9Gep5o1EEF96gprsHB1vDiw8KSQON-c6uh_9gBJyq9c=.43962158-2f23-4929-9e72-d4827a4fa5e8@github.com> References: <9PIuILHZnLHrZf1sz0Dsq6iup6qgyXw50mD0nGVS04c=.63bd0afd-d818-46fa-a082-a3d2066829cd@github.com> <4ZM8wZFYPZjIbjb_O6n6DNAlpYOa2EHfmhSZHVUAXNA=.b923e319-f143-4a4c-9916-face36f337db@github.com> <9Gep5o1EEF96gprsHB1vDiw8KSQON-c6uh_9gBJyq9c=.43962158-2f23-4929-9e72-d4827a4fa5e8@github.com> Message-ID: <8kmAaqEcZiqqRB0MSsNG2jbHkgQ-9p3DH_AHBZsBwr0=.be5d30cd-03b4-4446-8105-1d694cd3d7e4@github.com> On Thu, 30 May 2024 16:20:02 GMT, Emanuel Peter wrote: >> @vnkozlov OK. I'll defer to you all. I've contacted the author of the fuzzer to see what I can do to set up a local instance. Would this be sufficient to increase confidence for future submissions? We can run it perpetually on fixes (provided I can set it up). Had I done that, we could have had 6 months of fuzzing on top of our tests. Would that have alleviated this concern? > > @asgibbons I generally just stop pushing ANY RFE's a week or two before the fork. Even if you did run the fuzzer with it - there are often last-minute changes. And your code here is rather large, so even if you are confident, there must be at least one bug hiding. > > Running the fuzzer is nice as pre-integration, but it mostly only catches things post-integration. @eme64 Are you OK with me integrating? 
------------- PR Comment: https://git.openjdk.org/jdk/pull/16753#issuecomment-2153456076 From vlivanov at openjdk.org Thu Jun 6 22:55:21 2024 From: vlivanov at openjdk.org (Vladimir Ivanov) Date: Thu, 6 Jun 2024 22:55:21 GMT Subject: RFR: 8331658: secondary_super_cache does not scale well: C1 [v2] In-Reply-To: References: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> Message-ID: On Thu, 6 Jun 2024 09:09:34 GMT, Andrew Haley wrote: > The end goal, I hope, is to remove the secondary_super_cache field altogether because it does nothing useful. Apart from anything else, we'd remove some cruft in the VM. So, C1, interpreter, and runtime is next. Sure, I fully support complete removal of SSC. Thanks a lot for taking care of it! It looks like we are already on the same page. I just tried to make a point that the rest of the JVM (outside C2) seems much less bothered about performance of subtype checks. It doesn't mean there's no need to optimize those scenarios, but as part of SSC removal I'd prefer to prioritize simplicity of the implementation over peak performance. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19426#issuecomment-2153524330 From jbhateja at openjdk.org Fri Jun 7 02:16:27 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Fri, 7 Jun 2024 02:16:27 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v3] In-Reply-To: References: Message-ID: > Summary of changes include with the patch:- > > 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) > 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. > > Kindly review and share your feedback.
> > Best Regards, > Jatin Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: Review comments addressed. ------------- Changes: - all: https://git.openjdk.org/jdk/pull/18562/files - new: https://git.openjdk.org/jdk/pull/18562/files/b5da0938..68df08ce Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=18562&range=02 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=18562&range=01-02 Stats: 8 lines in 2 files changed: 6 ins; 1 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/18562.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/18562/head:pull/18562 PR: https://git.openjdk.org/jdk/pull/18562 From jbhateja at openjdk.org Fri Jun 7 02:16:27 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Fri, 7 Jun 2024 02:16:27 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v2] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 20:26:43 GMT, Vladimir Kozlov wrote: >> Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: >> >> Review comments resolutions. > > src/hotspot/cpu/x86/vm_version_x86.cpp line 1052: > >> 1050: >> 1051: // Currently APX support is only enabled for targets supporting AVX512VL feature. >> 1052: if (UseAPX && (!supports_apx_f() || !supports_avx512vl())) { > > This code should be after UseAVX checks. Its purposefully placed after modifications to CPU_* features flags if user explicitly sets UseAVX < 3. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630536165 From jbhateja at openjdk.org Fri Jun 7 02:19:12 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Fri, 7 Jun 2024 02:19:12 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v2] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 19:41:25 GMT, Vladimir Kozlov wrote: > Actually we can't even fully test it until VM start using all registers provided by APX. Hi @vnkozlov , EGPR state restoration across signal handling can only be validated after OS support. CPUID and UseAPX validation has been done using [Intel® Software Development Emulator](https://www.intel.com/content/www/us/en/download/684897/intel-software-development-emulator.html). Other comments addressed. ------------- PR Comment: https://git.openjdk.org/jdk/pull/18562#issuecomment-2153731083 From kvn at openjdk.org Fri Jun 7 02:23:16 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Fri, 7 Jun 2024 02:23:16 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v2] In-Reply-To: References: Message-ID: <-GsFrqzgkkbVyNAp7-mi4aIOYdwgehyRe8nI3xrV1tw=.4c723742-1d7f-4b98-859a-9d87230859fe@github.com> On Fri, 7 Jun 2024 02:12:31 GMT, Jatin Bhateja wrote: >> src/hotspot/cpu/x86/vm_version_x86.cpp line 1052: >> >>> 1050: >>> 1051: // Currently APX support is only enabled for targets supporting AVX512VL feature. >>> 1052: if (UseAPX && (!supports_apx_f() || !supports_avx512vl())) { >> >> This code should be after UseAVX checks. > > Its purposefully placed after modifications to CPU_* features flags if user explicitly sets UseAVX < 3. Got it. I missed that we have separate UseAVX checks for <3 and < 2 and < 1.
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630542872 From kvn at openjdk.org Fri Jun 7 02:47:13 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Fri, 7 Jun 2024 02:47:13 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v3] In-Reply-To: References: Message-ID: On Fri, 7 Jun 2024 02:16:27 GMT, Jatin Bhateja wrote: >> Summary of changes include with the patch:- >> >> 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) >> 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review comments addressed. src/hotspot/cpu/x86/vm_version_x86.cpp line 443: > 441: > 442: /* FIXME: Uncomment while integrating JDK-8329032 > 443: bool save_apx = UseAPX; What are you missing to uncomment this code? 8329032 is about `.ad` file changes. It should not affect execution of this code. You need changes in `register_x86.*` files and may be somewhere else but you don't need C2 changes for this code to work. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630559908 From jbhateja at openjdk.org Fri Jun 7 03:55:13 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Fri, 7 Jun 2024 03:55:13 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v3] In-Reply-To: References: Message-ID: On Fri, 7 Jun 2024 02:45:01 GMT, Vladimir Kozlov wrote: >> Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: >> >> Review comments addressed. > > src/hotspot/cpu/x86/vm_version_x86.cpp line 443: > >> 441: >> 442: /* FIXME: Uncomment while integrating JDK-8329032 >> 443: bool save_apx = UseAPX; > > What are you missing to uncomment this code? > 8329032 is about `.ad` file changes. It should not affect execution of this code. > You need changes in `register_x86.*` files and may be somewhere else but you don't need C2 changes for this code to work. Yes, we already have that in place with https://github.com/openjdk/jdk/pull/19042, which will be open for review after this patch. I added it in comments since this piece of logic is centered around CPUID feature check and pertinent to this patch. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1630611657 From epeter at openjdk.org Fri Jun 7 05:04:47 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Fri, 7 Jun 2024 05:04:47 GMT Subject: RFR: 8325155: C2 SuperWord: remove alignment boundaries [v6] In-Reply-To: <_usou5aJb--Azf87Jzu9pATR_JDc7JeSe4PhlwnVtHw=.e6d64f25-0e81-4d75-9351-813c0dc9c647@github.com> References: <_usou5aJb--Azf87Jzu9pATR_JDc7JeSe4PhlwnVtHw=.e6d64f25-0e81-4d75-9351-813c0dc9c647@github.com> Message-ID: On Wed, 29 May 2024 09:17:55 GMT, Christian Hagedorn wrote: >> Emanuel Peter has updated the pull request incrementally with one additional commit since the last revision: >> >> Update src/hotspot/share/opto/superword.cpp >> >> Co-authored-by: Christian Hagedorn > > Marked as reviewed by chagedorn (Reviewer). Thanks @chhagedorn @vnkozlov for the reviews! ------------- PR Comment: https://git.openjdk.org/jdk/pull/18822#issuecomment-2154000732 From epeter at openjdk.org Fri Jun 7 05:04:48 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Fri, 7 Jun 2024 05:04:48 GMT Subject: Integrated: 8325155: C2 SuperWord: remove alignment boundaries In-Reply-To: References: Message-ID: On Wed, 17 Apr 2024 17:58:53 GMT, Emanuel Peter wrote: > I have tried for a very long time to get rid of all the `alignment(n)` code that is all over the SuperWord code. With lots of previous work, I am now finally ready to remove it. > > I was able to remove lots of VM code, about 300 lines. And the removed code is I think much more complicated than the new code. > > This is what I did in this PR: > - Removal of `_node_info`: used to have many fields, which I refactored out to the `VLoopAnalyzer` modules. `alignment` is the last component, which I now remove. 
> - Changed the implementation of `SuperWord::find_adjacent_refs`, now `SuperWord::find_adjacent_memop_pairs`, completely: > - It used to be an algorithm that would scan over all `memops` repeatedly, try to find some `mem_ref` and see which other memops were comparable, and then pack pairs for all of those, by comparing all-vs-all memops. This algorithm is at least quadratic, if not much worse. > - I now add all `memops` into a single array, sort them by groups (those that are comparable with each other and could be packed into vectors), and inside the groups by ascending offset. This allows me to split off the groups much more efficiently, and also the sorting by offset allows me finding adjacent pairs much more efficiently. In the most cases this reduces the cost to `O(n log n)` for sort, and a linear scan for finding adjacent memops. > - I removed the "alignment boundaries" created in `SuperWord::memory_alignment` by `int off_rem = offset % vw;`. > - This used to have the effect that all offsets were computed modulo the vector width. Hence, pairs could not be packed across this boundary (e.g. we have nodes with offsets `31, 32`, which are adjacent in theory, but if we have a `vw = 32`, then the modulo-offsets are `31, 0`, and they are not detected as adjacent). > - These "alignment boundaries" used to be required for correctness about a year ago, before I fixed and relaxed much of the alignment code. > - The `alignment` used to have another important task: Ensuring compatibility of the input-size of a use node, with the output-size of the def-node. > - This was done by giving all nodes an `alignment`, even the non-memop nodes. This `alignment` was then scaled up and down at type casts (e.g. int `0, 4, 8, 12` -> long `0, 8, 16, 24`). If the output-size of the def-node did not match the input-size of the use-node, then the `alignment` would not match up, and we would not pack. 
> - This is why we used to have checks like `alignment(s1) + data_size(s1) == alignment(s2)` ... This pull request has now been integrated. Changeset: 944aeb81 Author: Emanuel Peter URL: https://git.openjdk.org/jdk/commit/944aeb81b16e3e7a3019cafdefe67b797fa6be96 Stats: 1073 lines in 7 files changed: 609 ins; 357 del; 107 mod 8325155: C2 SuperWord: remove alignment boundaries Reviewed-by: chagedorn, kvn ------------- PR: https://git.openjdk.org/jdk/pull/18822 From epeter at openjdk.org Fri Jun 7 05:06:29 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Fri, 7 Jun 2024 05:06:29 GMT Subject: RFR: 8333713: C2 SuperWord: cleanup in vectornode.cpp/hpp In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 13:52:56 GMT, Vladimir Kozlov wrote: >> Removed dead code and renamed `superword` -> `auto_vectorization`, just like I had done in [JDK-8324750](https://bugs.openjdk.org/browse/JDK-8324750). > > Good. Thanks @vnkozlov @chhagedorn for the reviews! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19575#issuecomment-2154010958 From epeter at openjdk.org Fri Jun 7 05:06:30 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Fri, 7 Jun 2024 05:06:30 GMT Subject: Integrated: 8333713: C2 SuperWord: cleanup in vectornode.cpp/hpp In-Reply-To: References: Message-ID: <4HrLZDxGOO4ujvK0yT0jhovQq_W-dnu4wiypDrRjXHk=.3dc1cb9e-613d-4bd9-97d1-dddcbb505f66@github.com> On Thu, 6 Jun 2024 11:22:24 GMT, Emanuel Peter wrote: > Removed dead code and renamed `superword` -> `auto_vectorization`, just like I had done in [JDK-8324750](https://bugs.openjdk.org/browse/JDK-8324750). This pull request has now been integrated. 
Changeset: e5383d71 Author: Emanuel Peter URL: https://git.openjdk.org/jdk/commit/e5383d710c0727181a2f0b569a881de2492e3683 Stats: 16 lines in 2 files changed: 0 ins; 10 del; 6 mod 8333713: C2 SuperWord: cleanup in vectornode.cpp/hpp Reviewed-by: chagedorn, kvn ------------- PR: https://git.openjdk.org/jdk/pull/19575 From epeter at openjdk.org Fri Jun 7 05:08:26 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Fri, 7 Jun 2024 05:08:26 GMT Subject: RFR: 8320448: Accelerate IndexOf using AVX2 [v49] In-Reply-To: <8kmAaqEcZiqqRB0MSsNG2jbHkgQ-9p3DH_AHBZsBwr0=.be5d30cd-03b4-4446-8105-1d694cd3d7e4@github.com> References: <9PIuILHZnLHrZf1sz0Dsq6iup6qgyXw50mD0nGVS04c=.63bd0afd-d818-46fa-a082-a3d2066829cd@github.com> <4ZM8wZFYPZjIbjb_O6n6DNAlpYOa2EHfmhSZHVUAXNA=.b923e319-f143-4a4c-9916-face36f337db@github.com> <9Gep5o1EEF96gprsHB1vDiw8KSQON-c6uh_9gBJyq9c=.43962158-2f23-4929-9e72-d4827a4fa5e8@github.com> <8kmAaqEcZiqqRB0MSsNG2jbHkgQ-9p3DH_AHBZsBwr0=.be5d30cd-03b4-4446-8105-1d694cd3d7e4@github.com> Message-ID: <0XVuJ7gECpxt76s5lju6aMOqcZK9MJ07dtlumvonwZw=.3f199acf-047c-4aee-bcc6-e3fa9f4f4bf5@github.com> On Thu, 6 Jun 2024 21:44:44 GMT, Scott Gibbons wrote: >> @asgibbons I generally just stop pushing ANY RFE's a week or two before the fork. Even if you did run the fuzzer with it - there are often last-minute changes. And your code here is rather large, so even if you are confident, there must be at least one bug hiding. >> >> Running the fuzzer is nice as pre-integration, but it mostly only catches things post-integration. > > @eme64 Are you OK with me integrating? @asgibbons yes, ship it! ? Thanks for waiting! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/16753#issuecomment-2154015592 From epeter at openjdk.org Fri Jun 7 05:09:17 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Fri, 7 Jun 2024 05:09:17 GMT Subject: RFR: 8332537: C2: High memory usage reported for compiler/loopopts/superword/TestAlignVectorFuzzer.java In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 14:50:26 GMT, Tobias Hartmann wrote: >> This bug was a regression of: >> [JDK-8324517](https://bugs.openjdk.org/browse/JDK-8324517); C2: crash in compiled code because of dependency on removed range check CastIIs >> >> That change was backed out with: >> [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs >> >> I `git revert` ed the BACKOUT change: >> [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs >> >> And only then this reproduced (not on master, only with Roland's CastII regression code): >> `~/Documents/jtreg/bin/jtreg -va -s -jdk:/oracle-work/jdk-fork2/build/linux-x64-debug/jdk -javaoptions:"-Djdk.test.lib.random.seed=3249981201344669190" -J-Djavatest.maxOutputSize=10000000 /oracle-work/jdk-fork2/open/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java` >> >> Now that we know that it is a regression of a backed-out change, we can enable the MemLimit check again. >> >> Also the stack-traces were related to CastII code, and looked very similar to those in another duplicate: >> [JDK-8332765](https://bugs.openjdk.org/browse/JDK-8332765): Test compiler/loopopts/superword/TestAlignVectorFuzzer.java still times out after JDK-8327978 > > Marked as reviewed by thartmann (Reviewer). Thanks @TobiHartmann @tstuefe @vnkozlov for the reviews! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19580#issuecomment-2154017910 From epeter at openjdk.org Fri Jun 7 05:09:17 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Fri, 7 Jun 2024 05:09:17 GMT Subject: Integrated: 8332537: C2: High memory usage reported for compiler/loopopts/superword/TestAlignVectorFuzzer.java In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 14:22:48 GMT, Emanuel Peter wrote: > This bug was a regression of: > [JDK-8324517](https://bugs.openjdk.org/browse/JDK-8324517); C2: crash in compiled code because of dependency on removed range check CastIIs > > That change was backed out with: > [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs > > I `git revert` ed the BACKOUT change: > [JDK-8332829](https://bugs.openjdk.org/browse/JDK-8332829): [BACKOUT] C2: crash in compiled code because of dependency on removed range check CastIIs > > And only then this reproduced (not on master, only with Roland's CastII regression code): > `~/Documents/jtreg/bin/jtreg -va -s -jdk:/oracle-work/jdk-fork2/build/linux-x64-debug/jdk -javaoptions:"-Djdk.test.lib.random.seed=3249981201344669190" -J-Djavatest.maxOutputSize=10000000 /oracle-work/jdk-fork2/open/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVectorFuzzer.java` > > Now that we know that it is a regression of a backed-out change, we can enable the MemLimit check again. > > Also the stack-traces were related to CastII code, and looked very similar to those in another duplicate: > [JDK-8332765](https://bugs.openjdk.org/browse/JDK-8332765): Test compiler/loopopts/superword/TestAlignVectorFuzzer.java still times out after JDK-8327978 This pull request has now been integrated. 
Changeset: b4beda21 Author: Emanuel Peter URL: https://git.openjdk.org/jdk/commit/b4beda21b487886b022e04766e140e6d1df1038a Stats: 4 lines in 1 file changed: 0 ins; 4 del; 0 mod 8332537: C2: High memory usage reported for compiler/loopopts/superword/TestAlignVectorFuzzer.java Reviewed-by: kvn, thartmann, stuefe ------------- PR: https://git.openjdk.org/jdk/pull/19580 From rrich at openjdk.org Fri Jun 7 06:19:17 2024 From: rrich at openjdk.org (Richard Reingruber) Date: Fri, 7 Jun 2024 06:19:17 GMT Subject: Integrated: 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store In-Reply-To: References: Message-ID: On Mon, 13 May 2024 15:53:52 GMT, Richard Reingruber wrote: > This pr adds a few tweaks to [JDK-8318446](https://bugs.openjdk.org/browse/JDK-8318446) which allows enabling it also on big endian platforms (e.g. AIX, S390). JDK-8318446 introduced a C2 optimization to replace consecutive stores to a primitive array with just one store. > > By example (from `TestMergeStores.java`): > > > static Object[] test2a(byte[] a, int offset, long v) { > if (IS_BIG_ENDIAN) { > a[offset + 0] = (byte)(v >> 56); > a[offset + 1] = (byte)(v >> 48); > a[offset + 2] = (byte)(v >> 40); > a[offset + 3] = (byte)(v >> 32); > a[offset + 4] = (byte)(v >> 24); > a[offset + 5] = (byte)(v >> 16); > a[offset + 6] = (byte)(v >> 8); > a[offset + 7] = (byte)(v >> 0); > } else { > a[offset + 0] = (byte)(v >> 0); > a[offset + 1] = (byte)(v >> 8); > a[offset + 2] = (byte)(v >> 16); > a[offset + 3] = (byte)(v >> 24); > a[offset + 4] = (byte)(v >> 32); > a[offset + 5] = (byte)(v >> 40); > a[offset + 6] = (byte)(v >> 48); > a[offset + 7] = (byte)(v >> 56); > } > return new Object[]{ a }; > } > > > Depending on the endianess 8 bytes are stored into an array. The order of the stores is the same as the order of an 8-byte-store therefore 8 1-byte-stores can be replaced with just one 8-byte-store (if there aren't too many range checks). 
> > Additionally I've fixed a few comments and a test bug. > > The optimization seems to be a little bit more effective on big endian platforms. > > Again by example: > > > static Object[] test800a(byte[] a, int offset, long v) { > if (IS_BIG_ENDIAN) { > a[offset + 0] = (byte)(v >> 40); // Removed from candidate list > a[offset + 1] = (byte)(v >> 32); // Removed from candidate list > a[offset + 2] = (byte)(v >> 24); // Merged > a[offset + 3] = (byte)(v >> 16); // Merged > a[offset + 4] = (byte)(v >> 8); // Merged > a[offset + 5] = (byte)(v >> 0); // Merged > } else { > a[offset + 0] = (byte)(v >> 0); // Removed from candidate list > a[offset + 1] = (byte)(v >> 8); // Removed from candidate list > a[offset + 2] = (byte)(v >> 16); // Not merged > a[offset + 3] = (byte)(v >> 24); // Not merged > a[offset + 4] = (byte)(v >> 32); // Not merged > a[offset + 5] = (byte)(v >> 40); // Not merged > } > return new Object[]{ a };... This pull request has now been integrated. Changeset: f7862bd6 Author: Richard Reingruber URL: https://git.openjdk.org/jdk/commit/f7862bd6b9994814c6dfd43d471122408601f288 Stats: 695 lines in 3 files changed: 666 ins; 2 del; 27 mod 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store Reviewed-by: epeter, kvn ------------- PR: https://git.openjdk.org/jdk/pull/19218 From rehn at openjdk.org Fri Jun 7 06:42:11 2024 From: rehn at openjdk.org (Robbin Ehn) Date: Fri, 7 Jun 2024 06:42:11 GMT Subject: RFR: 8333649: Allow different NativeCall encodings In-Reply-To: References: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> Message-ID: On Thu, 6 Jun 2024 20:19:37 GMT, Hamlin Li wrote: > Do you want to make the enum private so it can't be accessed directly? I would want that yes, and probably do so for all NativeXX::instruction_size => ::byte_size(). But I don't want cause a lot of churn in the CPU code for all platforms. Hope you are fine with this. 
Thanks @vnkozlov @Hamlin-Li ! ------------- PR Comment: https://git.openjdk.org/jdk/pull/19556#issuecomment-2154190762 From syan at openjdk.org Fri Jun 7 07:29:39 2024 From: syan at openjdk.org (SendaoYan) Date: Fri, 7 Jun 2024 07:29:39 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: References: Message-ID: > Hi all, > This PR deletes several extra empty spaces and extra empty lines in several Makefiles. It's a trivial fix, no risk. > > Thanks. SendaoYan has updated the pull request incrementally with one additional commit since the last revision: delete extra empty trailing blank line in test/jdk/java/rmi/reliability/benchmark/bench/Makefile ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19537/files - new: https://git.openjdk.org/jdk/pull/19537/files/0d2be363..e80b98da Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19537&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19537&range=00-01 Stats: 2 lines in 1 file changed: 0 ins; 1 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19537.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19537/head:pull/19537 PR: https://git.openjdk.org/jdk/pull/19537 From syan at openjdk.org Fri Jun 7 07:29:39 2024 From: syan at openjdk.org (SendaoYan) Date: Fri, 7 Jun 2024 07:29:39 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: References: Message-ID: <9rsVuFT4B5tg9CFepE9m-CsdBHMsfEPu4pWsWxZhkCk=.f3db4eff-1335-4c4f-a7d6-5970c4a544f7@github.com> On Thu, 6 Jun 2024 17:49:08 GMT, Chen Liang wrote: >> SendaoYan has updated the pull request incrementally with one additional commit since the last revision: >> >> delete extra empty trailing blank line in test/jdk/java/rmi/reliability/benchmark/bench/Makefile > > test/jdk/java/rmi/reliability/benchmark/bench/rmi/Makefile line 1: > >> 1: # > > This file change is dubious: > 1. It does not have any trailing whitespace that can fail the skara checks. > 2.
If the duplicate blank lines in the end of this Makefile is indeed problematic (as fixed here), please fix the only other occasion in the JDK, which is the Makefile in the parent directory. (Checked with `\n$^\n$\Z` pattern in all Makefiles) > > Recommended actions: Either > 1. Revert changes in this file; > 2. Also update `test/jdk/java/rmi/reliability/benchmark/bench/Makefile` to remove the trailing blank line. Thanks for the suggestion, the trailing blank line of `test/jdk/java/rmi/reliability/benchmark/bench/Makefile` has been removed. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19537#discussion_r1630767547 From varadam at openjdk.org Fri Jun 7 08:27:18 2024 From: varadam at openjdk.org (Varada M) Date: Fri, 7 Jun 2024 08:27:18 GMT Subject: Integrated: 8331935: Add support for primitive array C1 clone intrinsic in PPC In-Reply-To: References: Message-ID: On Wed, 15 May 2024 13:50:27 GMT, Varada M wrote: > https://bugs.openjdk.org/browse/JDK-8302850 port for PPC64 > > JMH Benchmark Results > > > Before : > > Benchmark (size) Mode Cnt Score Error Units > ArrayClone.byteArraycopy 0 avgt 15 114.107 ± 1.337 ns/op > ArrayClone.byteArraycopy 10 avgt 15 130.492 ± 0.991 ns/op > ArrayClone.byteArraycopy 100 avgt 15 139.103 ± 1.913 ns/op > ArrayClone.byteArraycopy 1000 avgt 15 321.688 ± 6.033 ns/op > ArrayClone.byteClone 0 avgt 15 227.602 ± 3.393 ns/op > ArrayClone.byteClone 10 avgt 15 237.624 ± 2.996 ns/op > ArrayClone.byteClone 100 avgt 15 239.219 ± 2.835 ns/op > > ArrayClone.byteClone 1000 avgt 15 355.571 ± 2.946 ns/op > ArrayClone.intArraycopy 0 avgt 15 113.275 ± 1.099 ns/op > ArrayClone.intArraycopy 10 avgt 15 129.763 ± 1.458 ns/op > ArrayClone.intArraycopy 100 avgt 15 213.327 ± 2.524 ns/op > ArrayClone.intArraycopy 1000 avgt 15 449.650 ± 7.338 ns/op > ArrayClone.intClone 0 avgt 15 225.682 ± 3.048 ns/op > ArrayClone.intClone 10 avgt 15 234.532 ± 2.817 ns/op > ArrayClone.intClone 100 avgt 15 295.934 ±
4.925 ns/op > ArrayClone.intClone 1000 avgt 15 573.368 ± 5.739 ns/op > Finished running test 'micro:java.lang.ArrayClone' > Test report is stored in build/aix-ppc64-server-release/test-results/micro_java_lang_ArrayClone > > ============================== > Test summary > ============================== > TEST TOTAL PASS FAIL ERROR > micro:java.lang.ArrayClone 1 1 0 0 > ============================== > TEST SUCCESS > > Finished building target 'test' in configuration 'aix-ppc64-server-release' > > > > > After: > > Benchmark (size) Mode Cnt Score Error Units > ArrayClone.byteArraycopy 0 avgt 15 113.894 ± 0.993 ns/op > ArrayClone.byteArraycopy 10 avgt 15 131.455 ± 0.956 ns/op > ArrayClone.byteArraycopy 100 avgt 15 139.145 ± 3.002 ns/op > ArrayClone.byteArraycopy 1000 avgt 15 315.957 ± 14.591 ns/op > ArrayClone.byteClone 0 avgt 15 43.753 ± 3.669 ns/op > ArrayClone.byteClone 10 avgt 15 52.329 ± 1.041 ns/op > ArrayClone.byteClone 100 avgt 15 127.711 ± 3.938 ns/op > > ArrayClone.byteClone 1000 avgt 15 225.937 ± 1.987 ns/op > ArrayClone.intArraycopy 0 avgt 15 113.788 ± 0.770 ns/op > ArrayClone.intArraycopy 10 avgt 1... This pull request has now been integrated.
Changeset: 6968770b Author: Varada M Committer: Amit Kumar URL: https://git.openjdk.org/jdk/commit/6968770b1e918c74fc009e3562a827bb4acbe2d7 Stats: 64 lines in 6 files changed: 27 ins; 3 del; 34 mod 8331935: Add support for primitive array C1 clone intrinsic in PPC Reviewed-by: mdoerr, amitkumar ------------- PR: https://git.openjdk.org/jdk/pull/19250 From bkilambi at openjdk.org Fri Jun 7 08:47:22 2024 From: bkilambi at openjdk.org (Bhavana Kilambi) Date: Fri, 7 Jun 2024 08:47:22 GMT Subject: RFR: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction [v11] In-Reply-To: References: Message-ID: <6N3LjYxVGIWUeEzKsDAUAuNJxsBB2hs44w3p7cb0k0Y=.24b83484-f361-48ac-8a9e-f46c6ffd96cb@github.com> On Mon, 3 Jun 2024 08:35:44 GMT, Bhavana Kilambi wrote: >> Floating-point addition is non-associative, that is adding floating-point elements in arbitrary order may get different value. Specially, Vector API does not define the order of reduction intentionally, which allows platforms to generate more efficient codes [1]. So that needs a node to represent non strictly-ordered add-reduction for floating-point type in C2. >> >> To avoid introducing new nodes, this patch adds a bool field in `AddReductionVF/D` to distinguish whether they require strict order. It also removes `UnorderedReductionNode` and adds a virtual function `bool requires_strict_order()` in `ReductionNode`. Besides `AddReductionVF/D`, other reduction nodes' `requires_strict_order()` have a fixed value. >> >> With this patch, Vector API would always generate non strictly-ordered `AddReductionVF/D' on SVE machines with vector length <= 16B as it is more beneficial to generate non-strictly ordered instructions on such machines compared to strictly ordered ones. >> >> [AArch64] >> On Neon, non strictly-ordered `AddReductionVF/D` cannot be generated. Auto-vectorization has already banned these nodes in JDK-8275275 [2]. 
>> >> This patch adds matching rules for non strictly-ordered `AddReductionVF/D`. >> >> No effects on other platforms. >> >> [Performance] >> FloatMaxVector.ADDLanes [3] measures the performance of add reduction for floating-point type. With this patch, it improves ~3x on my SVE machine (128-bit). >> >> ADDLanes >> >> Benchmark Before After Unit >> FloatMaxVector.ADDLanes 1789.513 5264.226 ops/ms >> >> >> Final code is as below: >> >> Before: >> ` fadda z17.s, p7/m, z17.s, z16.s >> ` >> After: >> >> faddp v17.4s, v21.4s, v21.4s >> faddp s18, v17.2s >> fadd s18, s18, s19 >> >> >> >> >> [Test] >> Full jtreg passed on AArch64 and x86. >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2529 >> [2] https://bugs.openjdk.org/browse/JDK-8275275 >> [3] https://github.com/openjdk/panama-vector/blob/vectorIntrinsics/test/micro/org/openjdk/bench/jdk/incubator/vector/operation/FloatMaxVector.java#L316 > > Bhavana Kilambi has updated the pull request incrementally with one additional commit since the last revision: > > Make changes in IR rules for JTREG tests Thanks for pointing it out. I've changed the title and will soon update patch with a merge with master. ------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2154377906 From mli at openjdk.org Fri Jun 7 09:36:17 2024 From: mli at openjdk.org (Hamlin Li) Date: Fri, 7 Jun 2024 09:36:17 GMT Subject: RFR: 8321010: RISC-V: C2 RoundVF [v6] In-Reply-To: References: Message-ID: On Fri, 19 Apr 2024 12:09:13 GMT, Hamlin Li wrote: >> Hi, >> Can you have a review on this patch to add RoundVF/RoundDF intrinsics? >> Thanks! 
>> >> ## Tests >> >> test/hotspot/jtreg/compiler/vectorization/TestRoundVectRiscv64.java test/hotspot/jtreg/compiler/c2/cr6340864/TestFloatVect.java test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java test/hotspot/jtreg/compiler/floatingpoint/TestRound.java >> >> test/jdk/java/lang/Math/RoundTests.java > > Hamlin Li has updated the pull request with a new target base due to a merge or a rebase. The pull request now contains 14 commits: > > - Merge branch 'master' into round-F+D-v > - Merge branch 'master' into round-F+D-v > - restore round mode back to rne > - Merge branch 'master' into round-F+D-v > - fix minors > - merge master > - fix space > - add tests > - add test cases > - v2: (src + 0.5) + rdn > - ... and 4 more: https://git.openjdk.org/jdk/compare/177092b9...2b57205f in progress... ------------- PR Comment: https://git.openjdk.org/jdk/pull/17745#issuecomment-2154466857 From mli at openjdk.org Fri Jun 7 09:37:18 2024 From: mli at openjdk.org (Hamlin Li) Date: Fri, 7 Jun 2024 09:37:18 GMT Subject: RFR: 8321003: RISC-V: C2 MulReductionVI In-Reply-To: References: Message-ID: On Tue, 30 Apr 2024 09:48:11 GMT, Hamlin Li wrote: > Hi, > Can you help to review this patch to implement MulReductionVI/MulReductionVL/MulReductionVF/MulReductionVD? > On riscv, there are no straightforward instructions to do it, but we can do it with a reduction tree, which could reduce the time complexity to lg(N). > Thanks > > ## Performance > TBD in progress... ------------- PR Comment: https://git.openjdk.org/jdk/pull/19015#issuecomment-2154467169 From mli at openjdk.org Fri Jun 7 09:37:20 2024 From: mli at openjdk.org (Hamlin Li) Date: Fri, 7 Jun 2024 09:37:20 GMT Subject: RFR: 8321008: RISC-V: C2 MulAddVS2VI [v2] In-Reply-To: References: Message-ID: On Mon, 29 Apr 2024 14:21:20 GMT, Hamlin Li wrote: >> Hi, >> Can you help to review the patch? >> >> The motivation is to implement `MulAddVS2VI`.
>> But to enable `MulAddVS2VI`, `MulAddS2I` is a prerequisite, although `MulAddS2I` does not bring extra benefit on riscv as we don't have a specific instruction for muladd on riscv. >> So, this patch implements both `MulAddVS2VI` and `MulAddS2I`. >> >> Thanks > > Hamlin Li has updated the pull request incrementally with one additional commit since the last revision: > > fix t1 usage in progress... ------------- PR Comment: https://git.openjdk.org/jdk/pull/18919#issuecomment-2154467503 From lucy at openjdk.org Fri Jun 7 10:48:11 2024 From: lucy at openjdk.org (Lutz Schmidt) Date: Fri, 7 Jun 2024 10:48:11 GMT Subject: RFR: 8333412: [s390x] Add support for branch on count instruction In-Reply-To: References: Message-ID: <88c3aszf4XJI_OtNmoVaWxMu0-dSYzci9jiukOrJtHA=.83280665-df50-4b09-8860-d9f242fc241f@github.com> On Thu, 6 Jun 2024 06:04:10 GMT, Amit Kumar wrote: > Adds support for BCT, BCTG, BCTR instructions. LGTM. ------------- Marked as reviewed by lucy (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19572#pullrequestreview-2104225114 From liach at openjdk.org Fri Jun 7 10:58:13 2024 From: liach at openjdk.org (Chen Liang) Date: Fri, 7 Jun 2024 10:58:13 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: References: Message-ID: On Fri, 7 Jun 2024 07:29:39 GMT, SendaoYan wrote: >> Hi all, >> This PR several extra empty spaces and extra empty lines in several Makefiles. It's trivial fix, no risk. >> >> Thanks. > > SendaoYan has updated the pull request incrementally with one additional commit since the last revision: > > delete extra empty trailing blank line in test/jdk/java/rmi/reliability/benchmark/bench/Makefile Thank you for the fix. ------------- Marked as reviewed by liach (Author).
PR Review: https://git.openjdk.org/jdk/pull/19537#pullrequestreview-2104241367 From syan at openjdk.org Fri Jun 7 12:31:13 2024 From: syan at openjdk.org (SendaoYan) Date: Fri, 7 Jun 2024 12:31:13 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: References: Message-ID: On Fri, 7 Jun 2024 07:29:39 GMT, SendaoYan wrote: >> Hi all, >> This PR several extra empty spaces and extra empty lines in several Makefiles. It's trivial fix, no risk. >> >> Thanks. > > SendaoYan has updated the pull request incrementally with one additional commit since the last revision: > > delete extra empty trailing blank line in test/jdk/java/rmi/reliability/benchmark/bench/Makefile Thanks all for the review. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19537#issuecomment-2154735598 From jwaters at openjdk.org Fri Jun 7 12:40:14 2024 From: jwaters at openjdk.org (Julian Waters) Date: Fri, 7 Jun 2024 12:40:14 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: <9rsVuFT4B5tg9CFepE9m-CsdBHMsfEPu4pWsWxZhkCk=.f3db4eff-1335-4c4f-a7d6-5970c4a544f7@github.com> References: <9rsVuFT4B5tg9CFepE9m-CsdBHMsfEPu4pWsWxZhkCk=.f3db4eff-1335-4c4f-a7d6-5970c4a544f7@github.com> Message-ID: On Fri, 7 Jun 2024 07:26:39 GMT, SendaoYan wrote: >> test/jdk/java/rmi/reliability/benchmark/bench/rmi/Makefile line 1: >> >>> 1: # >> >> This file change is dubious: >> 1. It does not have any trailing whitespace that can fail the skara checks. >> 2. If the duplicate blank lines in the end of this Makefile is indeed problematic (as fixed here), please fix the only other occasion in the JDK, which is the Makefile in the parent directory. (Checked with `\n$^\n$\Z` pattern in all Makefiles) >> >> Recommended actions: Either >> 1. Revert changes in this file; >> 2. Also update `test/jdk/java/rmi/reliability/benchmark/bench/Makefile` to remove the trailing blank line. 
> > Thanks for the suggestion, the trailing blank line of `test/jdk/java/rmi/reliability/benchmark/bench/Makefile` has been removed. Hmm, I'm inclined to keep the newlines at the EOF for both, what do the rest of you think? ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19537#discussion_r1631140457 From jwaters at openjdk.org Fri Jun 7 12:40:13 2024 From: jwaters at openjdk.org (Julian Waters) Date: Fri, 7 Jun 2024 12:40:13 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: References: Message-ID: <4cHhBehMcwLWREUH2qT-iK-uUHwJ0x5bMhyUnwL2gq8=.b9b607ff-f8e3-4e40-8ed7-4096d7e5ce50@github.com> On Fri, 7 Jun 2024 07:29:39 GMT, SendaoYan wrote: >> Hi all, >> This PR several extra empty spaces and extra empty lines in several Makefiles. It's trivial fix, no risk. >> >> Thanks. > > SendaoYan has updated the pull request incrementally with one additional commit since the last revision: > > delete extra empty trailing blank line in test/jdk/java/rmi/reliability/benchmark/bench/Makefile Marked as reviewed by jwaters (Committer). test/jdk/java/rmi/reliability/benchmark/bench/Makefile line 50: > 48: clean: > 49: rm -f *.class .classes > 50: Hmm, shouldn't this newline at EOF be kept? 
Asking @ all the people who've reviewed this so far, no need to change it just yet ------------- PR Review: https://git.openjdk.org/jdk/pull/19537#pullrequestreview-2104439647 PR Review Comment: https://git.openjdk.org/jdk/pull/19537#discussion_r1631140418 From erikj at openjdk.org Fri Jun 7 12:51:17 2024 From: erikj at openjdk.org (Erik Joelsson) Date: Fri, 7 Jun 2024 12:51:17 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: <4cHhBehMcwLWREUH2qT-iK-uUHwJ0x5bMhyUnwL2gq8=.b9b607ff-f8e3-4e40-8ed7-4096d7e5ce50@github.com> References: <4cHhBehMcwLWREUH2qT-iK-uUHwJ0x5bMhyUnwL2gq8=.b9b607ff-f8e3-4e40-8ed7-4096d7e5ce50@github.com> Message-ID: <5YlIH2IloSdbb0dSta1qr9sb2e5Uyred24oKrMTvZFE=.b99a6c68-32f1-43d7-89f7-5205129f8e1f@github.com> On Fri, 7 Jun 2024 12:37:48 GMT, Julian Waters wrote: >> SendaoYan has updated the pull request incrementally with one additional commit since the last revision: >> >> delete extra empty trailing blank line in test/jdk/java/rmi/reliability/benchmark/bench/Makefile > > test/jdk/java/rmi/reliability/benchmark/bench/Makefile line 50: > >> 48: clean: >> 49: rm -f *.class .classes >> 50: > > Hmm, shouldn't this newline at EOF be kept? Asking @ all the people who've reviewed this so far, no need to change it just yet No, it's an extra newline. A file should end with a newline but one is enough. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19537#discussion_r1631152127 From erikj at openjdk.org Fri Jun 7 12:51:16 2024 From: erikj at openjdk.org (Erik Joelsson) Date: Fri, 7 Jun 2024 12:51:16 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: References: Message-ID: <_nZVUUX4_1rgtwNp8seD0YoqxnZLtORfjVz9m0VFNrQ=.315a1692-b84c-4eed-95db-3843f438431a@github.com> On Fri, 7 Jun 2024 07:29:39 GMT, SendaoYan wrote: >> Hi all, >> This PR several extra empty spaces and extra empty lines in several Makefiles. It's trivial fix, no risk. >> >> Thanks. 
> > SendaoYan has updated the pull request incrementally with one additional commit since the last revision: > > delete extra empty trailing blank line in test/jdk/java/rmi/reliability/benchmark/bench/Makefile Marked as reviewed by erikj (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/19537#pullrequestreview-2104463763 From liach at openjdk.org Fri Jun 7 12:56:19 2024 From: liach at openjdk.org (Chen Liang) Date: Fri, 7 Jun 2024 12:56:19 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: <5YlIH2IloSdbb0dSta1qr9sb2e5Uyred24oKrMTvZFE=.b99a6c68-32f1-43d7-89f7-5205129f8e1f@github.com> References: <4cHhBehMcwLWREUH2qT-iK-uUHwJ0x5bMhyUnwL2gq8=.b9b607ff-f8e3-4e40-8ed7-4096d7e5ce50@github.com> <5YlIH2IloSdbb0dSta1qr9sb2e5Uyred24oKrMTvZFE=.b99a6c68-32f1-43d7-89f7-5205129f8e1f@github.com> Message-ID: On Fri, 7 Jun 2024 12:47:39 GMT, Erik Joelsson wrote: >> test/jdk/java/rmi/reliability/benchmark/bench/Makefile line 50: >> >>> 48: clean: >>> 49: rm -f *.class .classes >>> 50: >> >> Hmm, shouldn't this newline at EOF be kept? Asking @ all the people who've reviewed this so far, no need to change it just yet > > No, it's an extra newline. A file should end with a newline but one is enough. As confusing as they are, unfortunately GitHub UI does not render extra trailing newlines. This is the only one I could find with grepWin. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19537#discussion_r1631159094 From syan at openjdk.org Fri Jun 7 13:04:18 2024 From: syan at openjdk.org (SendaoYan) Date: Fri, 7 Jun 2024 13:04:18 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: References: <4cHhBehMcwLWREUH2qT-iK-uUHwJ0x5bMhyUnwL2gq8=.b9b607ff-f8e3-4e40-8ed7-4096d7e5ce50@github.com> <5YlIH2IloSdbb0dSta1qr9sb2e5Uyred24oKrMTvZFE=.b99a6c68-32f1-43d7-89f7-5205129f8e1f@github.com> Message-ID: On Fri, 7 Jun 2024 12:53:46 GMT, Chen Liang wrote: >> No, it's an extra newline. 
A file should end with a newline but one is enough. > > As confusing as they are, unfortunately GitHub UI does not render extra trailing newlines. This is the only one I could find with grepWin. I found the extra trailing newlines with the shell command below: for i in `find . -iname "Makefile*" | sed "/./build/d"` ; do tail -n 2 $i | grep -c "^$" | grep -q "^1$" ; if [[ 0 -eq $? ]] ; then echo $i ; fi ; done Only two files were found: ./test/jdk/java/rmi/reliability/benchmark/bench/rmi/Makefile ./test/jdk/java/rmi/reliability/benchmark/bench/Makefile ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19537#discussion_r1631168243 From bkilambi at openjdk.org Fri Jun 7 13:30:35 2024 From: bkilambi at openjdk.org (Bhavana Kilambi) Date: Fri, 7 Jun 2024 13:30:35 GMT Subject: RFR: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction [v12] In-Reply-To: References: Message-ID: > Floating-point addition is non-associative, that is adding floating-point elements in arbitrary order may get different value. Specially, Vector API does not define the order of reduction intentionally, which allows platforms to generate more efficient codes [1]. So that needs a node to represent non strictly-ordered add-reduction for floating-point type in C2. > > To avoid introducing new nodes, this patch adds a bool field in `AddReductionVF/D` to distinguish whether they require strict order. It also removes `UnorderedReductionNode` and adds a virtual function `bool requires_strict_order()` in `ReductionNode`. Besides `AddReductionVF/D`, other reduction nodes' `requires_strict_order()` have a fixed value. > > With this patch, Vector API would always generate non strictly-ordered `AddReductionVF/D` on SVE machines with vector length <= 16B as it is more beneficial to generate non-strictly ordered instructions on such machines compared to strictly ordered ones.
> > [AArch64] > On Neon, non strictly-ordered `AddReductionVF/D` cannot be generated. Auto-vectorization has already banned these nodes in JDK-8275275 [2]. > > This patch adds matching rules for non strictly-ordered `AddReductionVF/D`. > > No effects on other platforms. > > [Performance] > FloatMaxVector.ADDLanes [3] measures the performance of add reduction for floating-point type. With this patch, it improves ~3x on my SVE machine (128-bit). > > ADDLanes > > Benchmark Before After Unit > FloatMaxVector.ADDLanes 1789.513 5264.226 ops/ms > > > Final code is as below: > > Before: > ` fadda z17.s, p7/m, z17.s, z16.s > ` > After: > > faddp v17.4s, v21.4s, v21.4s > faddp s18, v17.2s > fadd s18, s18, s19 > > > > > [Test] > Full jtreg passed on AArch64 and x86. > > [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2529 > [2] https://bugs.openjdk.org/browse/JDK-8275275 > [3] https://github.com/openjdk/panama-vector/blob/vectorIntrinsics/test/micro/org/openjdk/bench/jdk/incubator/vector/operation/FloatMaxVector.java#L316 Bhavana Kilambi has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains 12 additional commits since the last revision: - Merge with master - Make changes in IR rules for JTREG tests - Modify JTREG IR rules and some style/format changes - Add dump_spec and JTREG IR tests for Add/Mul Reduction Nodes - Merge master - Adjust format for the backend rules changed in previous commit - Address some more review comments - Revert to previous indentation - Add comments, revert to requires_strict_order and other minor changes - Naming changes: replace strict/non-strict with more technical terms - ... 
and 2 more: https://git.openjdk.org/jdk/compare/64d68556...35e6258d ------------- Changes: - all: https://git.openjdk.org/jdk/pull/18034/files - new: https://git.openjdk.org/jdk/pull/18034/files/db88e3c9..35e6258d Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=18034&range=11 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=18034&range=10-11 Stats: 147395 lines in 3613 files changed: 93425 ins; 36681 del; 17289 mod Patch: https://git.openjdk.org/jdk/pull/18034.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/18034/head:pull/18034 PR: https://git.openjdk.org/jdk/pull/18034 From bkilambi at openjdk.org Fri Jun 7 13:30:35 2024 From: bkilambi at openjdk.org (Bhavana Kilambi) Date: Fri, 7 Jun 2024 13:30:35 GMT Subject: RFR: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction [v8] In-Reply-To: References: <8-_t7nWbR9gZ2_QkfFNuf5M0Q4PMkKJKgwS3ZbHcCxI=.32dc4f11-dec5-468d-afc8-3b4dae285dcb@github.com> <2y-Ag6MxVDJfYl6kM0FYjQA-kzSCekUgAMWAZmkECyQ=.2a2a0a8e-fc67-42a4-bd67-b4ae3b60bcea@github.com> <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> Message-ID: On Thu, 6 Jun 2024 13:42:36 GMT, Emanuel Peter wrote: >> Hi @eme64 , I have modified the tests as suggested. Please review :) > > @Bhavana-Kilambi can you merge with master? I think your last merge is a while ago. I'll run testing after. Hi @eme64 , have updated patch with a merge with master. Can you please run testing? Thank you! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2154837245 From epeter at openjdk.org Fri Jun 7 13:30:35 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Fri, 7 Jun 2024 13:30:35 GMT Subject: RFR: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction [v8] In-Reply-To: References: <8-_t7nWbR9gZ2_QkfFNuf5M0Q4PMkKJKgwS3ZbHcCxI=.32dc4f11-dec5-468d-afc8-3b4dae285dcb@github.com> <2y-Ag6MxVDJfYl6kM0FYjQA-kzSCekUgAMWAZmkECyQ=.2a2a0a8e-fc67-42a4-bd67-b4ae3b60bcea@github.com> <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> Message-ID: On Fri, 7 Jun 2024 13:26:01 GMT, Bhavana Kilambi wrote: >> @Bhavana-Kilambi can you merge with master? I think your last merge is a while ago. I'll run testing after. > > Hi @eme64 , have updated patch with a merge with master. Can you please run testing? Thank you! @Bhavana-Kilambi thanks for the merge, just launched it before you pinged me ;) Please ping me again after the weekend for the results! ------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2154840211 From jwaters at openjdk.org Fri Jun 7 13:39:19 2024 From: jwaters at openjdk.org (Julian Waters) Date: Fri, 7 Jun 2024 13:39:19 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: References: <4cHhBehMcwLWREUH2qT-iK-uUHwJ0x5bMhyUnwL2gq8=.b9b607ff-f8e3-4e40-8ed7-4096d7e5ce50@github.com> <5YlIH2IloSdbb0dSta1qr9sb2e5Uyred24oKrMTvZFE=.b99a6c68-32f1-43d7-89f7-5205129f8e1f@github.com> Message-ID: On Fri, 7 Jun 2024 13:01:15 GMT, SendaoYan wrote: >> As confusing as they are, unfortunately GitHub UI does not render extra trailing newlines. This is the only one I could find with grepWin. > > I find the extra trailing newlines through below shell command: > > for i in `find . -iname "Makefile*" | sed "/./build/d"` ; do tail -n 2 $i | grep -c "^$" | grep -q "^1$" ; if [[ 0 -eq $? 
]] ; then echo $i ; fi ; done > > > Only two files were found: > > ./test/jdk/java/rmi/reliability/benchmark/bench/rmi/Makefile > ./test/jdk/java/rmi/reliability/benchmark/bench/Makefile Ah, I had not realized that there was more than 1 newline. GitHub's UI confused me here, so we're good to go ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19537#discussion_r1631213656 From syan at openjdk.org Fri Jun 7 13:39:20 2024 From: syan at openjdk.org (SendaoYan) Date: Fri, 7 Jun 2024 13:39:20 GMT Subject: Integrated: 8333477: Delete extra empty spaces in Makefiles In-Reply-To: References: Message-ID: On Tue, 4 Jun 2024 07:47:46 GMT, SendaoYan wrote: > Hi all, > This PR deletes several extra empty spaces and extra empty lines in several Makefiles. It's a trivial fix, no risk. > > Thanks. This pull request has now been integrated. Changeset: d130d2f4 Author: SendaoYan Committer: Julian Waters URL: https://git.openjdk.org/jdk/commit/d130d2f4f46d37a2b924343de19d012c129b0a55 Stats: 11 lines in 5 files changed: 0 ins; 2 del; 9 mod 8333477: Delete extra empty spaces in Makefiles Reviewed-by: erikj, chagedorn, liach, jwaters ------------- PR: https://git.openjdk.org/jdk/pull/19537 From syan at openjdk.org Fri Jun 7 14:17:19 2024 From: syan at openjdk.org (SendaoYan) Date: Fri, 7 Jun 2024 14:17:19 GMT Subject: RFR: 8333477: Delete extra empty spaces in Makefiles [v2] In-Reply-To: References: Message-ID: <5PwXrAonDGIth_VkfWPZMDl-XFCPTLAIRunMUPsp_4g=.45fa1560-bcfb-4ac0-9bd7-03734547b09d@github.com> On Fri, 7 Jun 2024 07:29:39 GMT, SendaoYan wrote: >> Hi all, >> This PR deletes several extra empty spaces and extra empty lines in several Makefiles. It's a trivial fix, no risk. >> >> Thanks. > > SendaoYan has updated the pull request incrementally with one additional commit since the last revision: > > delete extra empty trailing blank line in test/jdk/java/rmi/reliability/benchmark/bench/Makefile Thanks all for the review and sponsor.
------------- PR Comment: https://git.openjdk.org/jdk/pull/19537#issuecomment-2154937099 From duke at openjdk.org Fri Jun 7 14:36:16 2024 From: duke at openjdk.org (Yuri Gaevsky) Date: Fri, 7 Jun 2024 14:36:16 GMT Subject: RFR: 8322174: RISC-V: C2 VectorizedHashCode RVV Version [v2] In-Reply-To: References: Message-ID: On Thu, 25 Jan 2024 14:47:47 GMT, Yuri Gaevsky wrote: >> The patch adds possibility to use RVV instructions for faster vectorizedHashCode calculations on RVV v1.0.0 capable hardware. >> >> Testing: hotspot/jtreg/compiler/ under QEMU-8.1 with RVV v1.0.0. > > Yuri Gaevsky has updated the pull request incrementally with two additional commits since the last revision: > > - num_8b_elems_in_vec --> nof_vec_elems > - Removed checks for (MaxVectorSize >= 16) per @RealFYang suggestion. . ------------- PR Comment: https://git.openjdk.org/jdk/pull/17413#issuecomment-2154974186 From amitkumar at openjdk.org Fri Jun 7 14:41:12 2024 From: amitkumar at openjdk.org (Amit Kumar) Date: Fri, 7 Jun 2024 14:41:12 GMT Subject: RFR: 8331117: [PPC64] secondary_super_cache does not scale well [v2] In-Reply-To: References: Message-ID: <4oTnnVeBbxCTfBDoQnldpIyHh8GlPcjXwVlmaPQPrrw=.5243b504-e336-4ff2-bb59-525766d78a34@github.com> On Tue, 28 May 2024 14:04:13 GMT, Martin Doerr wrote: >> Martin Doerr has updated the pull request incrementally with one additional commit since the last revision: >> >> Fix bit test and add assertion for array lenght. > > Performance seems to be not affected by that bug. Note that I have used https://github.com/openjdk/jdk/pull/19427 to run TypePollution micro benchmarks. 
@TheRealMDoerr I got one test failure on PPC with these changes: diff --git a/src/hotspot/share/runtime/globals.hpp b/src/hotspot/share/runtime/globals.hpp index 6bfb260606b..70897a1066e 100644 --- a/src/hotspot/share/runtime/globals.hpp +++ b/src/hotspot/share/runtime/globals.hpp @@ -1988,13 +1988,13 @@ const int ObjectAlignmentInBytes = 8; "rewriting/transformation independently of the JVMTI " \ "can_{retransform/redefine}_classes capabilities.") \ \ - product(bool, UseSecondarySupersCache, true, DIAGNOSTIC, \ + product(bool, UseSecondarySupersCache, false, DIAGNOSTIC, \ "Use secondary supers cache during subtype checks.") \ \ - product(bool, UseSecondarySupersTable, false, DIAGNOSTIC, \ + product(bool, UseSecondarySupersTable, true, DIAGNOSTIC, \ "Use hash table to lookup secondary supers.") \ \ - product(bool, VerifySecondarySupers, false, DIAGNOSTIC, \ + product(bool, VerifySecondarySupers, true, DIAGNOSTIC, \ "Check that linear and hashed secondary lookups return the same result.") \ \ product(bool, StressSecondarySupers, false, DIAGNOSTIC, \ ============================== Test summary ============================== TEST TOTAL PASS FAIL ERROR jtreg:./test/hotspot/jtreg/compiler/c2/irTests/ProfileAtTypeCheck.java >> 1 0 1 0 << ============================== TEST FAILURE But if I revert the changes I had done, then it passes. Same situation I'm facing on s390x. Is this expected ? failure log: [type_profile_failure.log](https://github.com/user-attachments/files/15741205/type_profile_failure.log) ------------- PR Comment: https://git.openjdk.org/jdk/pull/19368#issuecomment-2154983693 From amitkumar at openjdk.org Fri Jun 7 15:01:14 2024 From: amitkumar at openjdk.org (Amit Kumar) Date: Fri, 7 Jun 2024 15:01:14 GMT Subject: RFR: 8333412: [s390x] Add support for branch on count instruction In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 06:04:10 GMT, Amit Kumar wrote: > Adds support for BCT, BCTG, BCTR instructions. 
@TheRealMDoerr will you take a look at it? ------------- PR Comment: https://git.openjdk.org/jdk/pull/19572#issuecomment-2155016865 From kvn at openjdk.org Fri Jun 7 15:01:17 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Fri, 7 Jun 2024 15:01:17 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions (Intel® APX) [v3] In-Reply-To: References: Message-ID: <8U7x8J0qcT9KdK3Tah56KLzt-zi5NRuQDpJjsoKOpeE=.b43eac87-2ab5-404e-90ee-d04a6104a22c@github.com> On Fri, 7 Jun 2024 03:50:50 GMT, Jatin Bhateja wrote: >> src/hotspot/cpu/x86/vm_version_x86.cpp line 443: >> >>> 441: >>> 442: /* FIXME: Uncomment while integrating JDK-8329032 >>> 443: bool save_apx = UseAPX; >> >> What are you missing to uncomment this code? >> 8329032 is about `.ad` file changes. It should not affect execution of this code. >> You need changes in `register_x86.*` files and maybe somewhere else but you don't need C2 changes for this code to work. > > Yes, we already have that in place with https://github.com/openjdk/jdk/pull/19042, which will be open for review after this patch. I added it in comments since this piece of logic is centered around CPUID feature check and pertinent to this patch. Okay.
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1631347341 From kvn at openjdk.org Fri Jun 7 15:13:14 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Fri, 7 Jun 2024 15:13:14 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions (Intel® APX) [v3] In-Reply-To: References: Message-ID: <4X90ounLkky3ETOC0PMSFTfaqGWIPATNVAGr03Ig4OY=.42c3d5ee-8c06-4e95-9392-b19d021f1621@github.com> On Fri, 7 Jun 2024 02:16:27 GMT, Jatin Bhateja wrote: >> Summary of changes included with the patch: >> >> 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) >> 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Review comments addressed. src/hotspot/cpu/x86/vm_version_x86.cpp line 882: > 880: > 881: void VM_Version::report_apx_state_restore_warning() { > 882: tty->print("warning: Unsuccessful EGPRs state restoration across signal handling, setting UseAPX to false.\n"); This print is fine during development but I would instead save some value in memory to indicate that the OS does not save/restore APX. And then check it after we execute this assembler code. Similar to how we do that for AVX. You would not need to do a runtime call and this method then. Note: `tty->print()` can do "nasty"/unexpected things which you want to avoid.
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1631362108 From mdoerr at openjdk.org Fri Jun 7 15:18:13 2024 From: mdoerr at openjdk.org (Martin Doerr) Date: Fri, 7 Jun 2024 15:18:13 GMT Subject: RFR: 8333412: [s390x] Add support for branch on count instruction In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 06:04:10 GMT, Amit Kumar wrote: > Adds support for BCT, BCTG, BCTR instructions. I assume the opcodes were already compared with Principles Of Operation. I didn't do that. In general, LGTM. ------------- Marked as reviewed by mdoerr (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19572#pullrequestreview-2104835910 From mdoerr at openjdk.org Fri Jun 7 15:20:13 2024 From: mdoerr at openjdk.org (Martin Doerr) Date: Fri, 7 Jun 2024 15:20:13 GMT Subject: RFR: 8331117: [PPC64] secondary_super_cache does not scale well [v2] In-Reply-To: References: Message-ID: On Wed, 29 May 2024 07:36:52 GMT, Andrew Haley wrote: >> Performance seems to be not affected by that bug. Note that I have used https://github.com/openjdk/jdk/pull/19427 to run TypePollution micro benchmarks. > >> Performance seems to be not affected by that bug. > > That is extremely suspicious. That doesn't look like a platform specific thing. I'm getting the same result on x86_64. @theRealAph: Is that a known limitation or is it worth a new JBS issue? ------------- PR Comment: https://git.openjdk.org/jdk/pull/19368#issuecomment-2155051645 From roland at openjdk.org Fri Jun 7 15:30:40 2024 From: roland at openjdk.org (Roland Westrelin) Date: Fri, 7 Jun 2024 15:30:40 GMT Subject: RFR: 8333805: Replaying compilation with null static final fields results in a crash Message-ID: <5y7lfA7aRBJ9lgqp6Z-OfXnh-OP8oQGyhzyhVbY2ytU=.92e5a27a-bd1c-4d68-aef6-4b1cc218f450@github.com> When dumping a replay file, if a static field is null the "null" string is appended to the line for the field. 
When replaying the compilation, this breaks for: - string fields: instead of null, the field is initialized with a "null" string - object fields: the jvm crashes because it treats "null" as a class name. - array fields: the jvm crashes because it expects an array length where the "null" string is. This patch fixes it by: - leaving out the "null" string when the field is null. When the compilation is replayed, the missing field value is taken as meaning the field is null. - setting the length to -1 for a null array which on the replay side is used as an indication that the field is null. I also noticed, for object arrays, that the actual type of the array (for a non null field) is included in the replay file (given it can differ from the field type) but not used when replaying. I changed that. ------------- Commit messages: - test & fix Changes: https://git.openjdk.org/jdk/pull/19601/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19601&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333805 Stats: 156 lines in 3 files changed: 112 ins; 21 del; 23 mod Patch: https://git.openjdk.org/jdk/pull/19601.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19601/head:pull/19601 PR: https://git.openjdk.org/jdk/pull/19601 From sgibbons at openjdk.org Fri Jun 7 17:05:33 2024 From: sgibbons at openjdk.org (Scott Gibbons) Date: Fri, 7 Jun 2024 17:05:33 GMT Subject: Integrated: 8320448: Accelerate IndexOf using AVX2 In-Reply-To: References: Message-ID: On Tue, 21 Nov 2023 00:06:19 GMT, Scott Gibbons wrote: > Re-write the IndexOf code without the use of the pcmpestri instruction, only using AVX2 instructions. This change accelerates String.IndexOf on average 1.3x for AVX2. 
The benchmark numbers: > > > Benchmark Score Latest > StringIndexOf.advancedWithMediumSub 343.573 317.934 0.925375393x > StringIndexOf.advancedWithShortSub1 1039.081 1053.96 1.014319384x > StringIndexOf.advancedWithShortSub2 55.828 110.541 1.980027943x > StringIndexOf.constantPattern 9.361 11.906 1.271872663x > StringIndexOf.searchCharLongSuccess 4.216 4.218 1.000474383x > StringIndexOf.searchCharMediumSuccess 3.133 3.216 1.02649218x > StringIndexOf.searchCharShortSuccess 3.76 3.761 1.000265957x > StringIndexOf.success 9.186 9.713 1.057369911x > StringIndexOf.successBig 14.341 46.343 3.231504079x > StringIndexOfChar.latin1_AVX2_String 6220.918 12154.52 1.953814533x > StringIndexOfChar.latin1_AVX2_char 5503.556 5540.044 1.006629895x > StringIndexOfChar.latin1_SSE4_String 6978.854 6818.689 0.977049957x > StringIndexOfChar.latin1_SSE4_char 5657.499 5474.624 0.967675646x > StringIndexOfChar.latin1_Short_String 7132.541 6863.359 0.962260014x > StringIndexOfChar.latin1_Short_char 16013.389 16162.437 1.009307711x > StringIndexOfChar.latin1_mixed_String 7386.123 14771.622 1.999915517x > StringIndexOfChar.latin1_mixed_char 9901.671 9782.245 0.987938803 This pull request has now been integrated. 
Changeset: 8e72d7cf Author: Scott Gibbons Committer: Jatin Bhateja URL: https://git.openjdk.org/jdk/commit/8e72d7cf8e7dfc7eb9e66bc562f125f947e37f49 Stats: 3906 lines in 16 files changed: 3876 ins; 0 del; 30 mod 8320448: Accelerate IndexOf using AVX2 Reviewed-by: epeter, kvn, sviswanathan ------------- PR: https://git.openjdk.org/jdk/pull/16753 From jbhateja at openjdk.org Sat Jun 8 04:16:24 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Sat, 8 Jun 2024 04:16:24 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions =?UTF-8?B?KEludGVswq4=?= APX) [v4] In-Reply-To: References: Message-ID: <7FpzzRiVoeGjqOjIxTSfxdudzZcx20q7DcKTTVSWhQA=.60cb64ba-3806-4e51-8296-696bad19720d@github.com> > Summary of changes include with the patch:- > > 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) > 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. > > Kindly review and share your feedback. > > Best Regards, > Jatin Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: Lazy restored state comparison after OS signal handling. 
------------- Changes: - all: https://git.openjdk.org/jdk/pull/18562/files - new: https://git.openjdk.org/jdk/pull/18562/files/68df08ce..d8fcde93 Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=18562&range=03 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=18562&range=02-03 Stats: 40 lines in 2 files changed: 17 ins; 16 del; 7 mod Patch: https://git.openjdk.org/jdk/pull/18562.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/18562/head:pull/18562 PR: https://git.openjdk.org/jdk/pull/18562 From jbhateja at openjdk.org Sat Jun 8 04:16:24 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Sat, 8 Jun 2024 04:16:24 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions (Intel® APX) [v3] In-Reply-To: <4X90ounLkky3ETOC0PMSFTfaqGWIPATNVAGr03Ig4OY=.42c3d5ee-8c06-4e95-9392-b19d021f1621@github.com> References: <4X90ounLkky3ETOC0PMSFTfaqGWIPATNVAGr03Ig4OY=.42c3d5ee-8c06-4e95-9392-b19d021f1621@github.com> Message-ID: On Fri, 7 Jun 2024 15:10:57 GMT, Vladimir Kozlov wrote: >> Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: >> >> Review comments addressed. > > src/hotspot/cpu/x86/vm_version_x86.cpp line 882: > >> 880: >> 881: void VM_Version::report_apx_state_restore_warning() { >> 882: tty->print("warning: Unsuccessful EGPRs state restoration across signal handling, setting UseAPX to false.\n"); > > This print is fine during development but I would instead save some value in memory to indicate that the OS does not save/restore APX. And then check it after we execute this assembler code. Similar to how we do that for AVX. > You would not need to do a runtime call and this method then. > Note: `tty->print()` can do "nasty"/unexpected things which you want to avoid. Hi @vnkozlov, doing a lazy restored state comparison now to align with existing AVX handling.
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/18562#discussion_r1631865373 From amitkumar at openjdk.org Sat Jun 8 04:44:20 2024 From: amitkumar at openjdk.org (Amit Kumar) Date: Sat, 8 Jun 2024 04:44:20 GMT Subject: RFR: 8333412: [s390x] Add support for branch on count instruction In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 06:04:10 GMT, Amit Kumar wrote: > Adds support for BCT, BCTG, BCTR instructions. I did a recheck, performed builds. Also it felt irrelevant to run tier1 as we are not using these instruction, but performed anyway. Things look fine. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19572#issuecomment-2155805832 From amitkumar at openjdk.org Sat Jun 8 04:44:21 2024 From: amitkumar at openjdk.org (Amit Kumar) Date: Sat, 8 Jun 2024 04:44:21 GMT Subject: Integrated: 8333412: [s390x] Add support for branch on count instruction In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 06:04:10 GMT, Amit Kumar wrote: > Adds support for BCT, BCTG, BCTR instructions. This pull request has now been integrated. Changeset: a6fc2f83 Author: Amit Kumar URL: https://git.openjdk.org/jdk/commit/a6fc2f839a5e494b940ee473cbd942ec5f884324 Stats: 28 lines in 2 files changed: 23 ins; 0 del; 5 mod 8333412: [s390x] Add support for branch on count instruction Reviewed-by: lucy, mdoerr ------------- PR: https://git.openjdk.org/jdk/pull/19572 From dlong at openjdk.org Sat Jun 8 08:37:10 2024 From: dlong at openjdk.org (Dean Long) Date: Sat, 8 Jun 2024 08:37:10 GMT Subject: RFR: 8333649: Allow different NativeCall encodings In-Reply-To: References: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> Message-ID: On Fri, 7 Jun 2024 06:39:52 GMT, Robbin Ehn wrote: > Hope you are fine with this. OK. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19556#issuecomment-2155874697 From aph at openjdk.org Sat Jun 8 09:44:23 2024 From: aph at openjdk.org (Andrew Haley) Date: Sat, 8 Jun 2024 09:44:23 GMT Subject: RFR: 8331658: secondary_super_cache does not scale well: C1 [v2] In-Reply-To: References: <0rowz1jcBwDwG5peFhEj6CFKFUiPZcCgV3MGAEKH55Q=.35e88694-e170-4bf2-8cd7-1f309d0ab156@github.com> Message-ID: On Thu, 6 Jun 2024 22:52:39 GMT, Vladimir Ivanov wrote: > I just tried to make a point that the rest of the JVM (outside C2) seems much less bothered about performance of subtype checks. It doesn't mean there's no need to optimize those scenarios, but as part of SSC removal I'd prefer to prioritize simplicity of the implementation over peak performance. OK! Got that loud and clear. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19426#issuecomment-2155898814 From epeter at openjdk.org Sat Jun 8 16:06:40 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Sat, 8 Jun 2024 16:06:40 GMT Subject: RFR: 8333684: C2 SuperWord: multiple smaller refactorings in preparation for JDK-8332163 Message-ID: In preparation for https://github.com/openjdk/jdk/pull/19261, I made some smaller refactorings / moving code around: - `SuperWord::same_input` -> `PackSet::isa_unique_input_or_null` - Rename print-method tags: `SUPERWORD1_BEFORE_SCHEDULE` -> `AUTO_VECTORIZATION1_BEFORE_APPLY` etc. - Refactored `SuperWord::schedule / output` -> `SuperWord::schedule_and_apply`: - Reorganize so that we can separate out all methods that change the C2 graph into `SuperWord::apply`. - Move all `phase()->C->print_method` to `SuperWord::apply`. - Rename `SuperWord::schedule_reorder_memops` -> `SuperWord::apply_memops_reordering_with_schedule`. - Rename `SuperWord::output` -> `SuperWord::apply_vectorization`. - Move `SuperWord::vectors_should_be_aligned` -> `VLoop::vectors_should_be_aligned`.
- Move `SuperWord::requires_long_to_int_conversion` -> `VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long`, and move comments. - Move `VectorNode::can_transform_shift_op` -> `VectorNode::can_use_RShiftI_instead_of_URShiftI`, and move comments. - Extract out `PackSet::get_bool_test` from `SuperWord::output / apply_vectorization`. - Extract opcode check to `VectorNode::is_scalar_unary_op_with_equal_input_and_output_types`. ------------- Commit messages: - rm a TODO - fix bad merge - manual merge - improve variable name - style fix - PackSet::get_bool_test - rename and move comments for can_use_RShiftI_instead_of_URShiftI - move SuperWord::requires_long_to_int_conversion -> VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long - rm code left from last commit - moved vectors_should_be_aligned - ... and 5 more: https://git.openjdk.org/jdk/compare/b4beda21...f1ec0fcc Changes: https://git.openjdk.org/jdk/pull/19573/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19573&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333684 Stats: 325 lines in 6 files changed: 145 ins; 107 del; 73 mod Patch: https://git.openjdk.org/jdk/pull/19573.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19573/head:pull/19573 PR: https://git.openjdk.org/jdk/pull/19573 From epeter at openjdk.org Sat Jun 8 16:06:42 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Sat, 8 Jun 2024 16:06:42 GMT Subject: RFR: 8333684: C2 SuperWord: multiple smaller refactorings in preparation for JDK-8332163 In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 07:31:53 GMT, Emanuel Peter wrote: > In preparation for https://github.com/openjdk/jdk/pull/19261, I made some smaller refactorings / moving code around: > > - `SuperWord::same_input` -> `PackSet::isa_unique_input_or_null` > - Rename print-method tags: `SUPERWORD1_BEFORE_SCHEDULE` -> `AUTO_VECTORIZATION1_BEFORE_APPLY` etc. 
> - Refactored `SuperWord::schedule / output` -> `SuperWord::schedule_and_apply`: > - Reorganize so that we can separate out all methods that change the C2 graph into `SuperWord::apply`. > - Move all `phase()->C->print_method` to `SuperWord::apply`. > - Rename `SuperWord::schedule_reorder_memops` -> `SuperWord::apply_memops_reordering_with_schedule`. > - Rename `SuperWord::output` -> `SuperWord::apply_vectorization`. > - Move `SuperWord::vectors_should_be_aligned` -> `VLoop::vectors_should_be_aligned`. > - Move `SuperWord::requires_long_to_int_conversion` -> `VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long`, and move comments. > - Move `VectorNode::can_transform_shift_op` -> `VectorNode::can_use_RShiftI_instead_of_URShiftI`, and move comments. > - Extract out `PackSet::get_bool_test` from `SuperWord::output / apply_vectorization`. > - Extract opcode check to `VectorNode::is_scalar_unary_op_with_equal_input_and_output_types`. src/hotspot/share/opto/phasetype.hpp line 74: > 72: flags(AUTO_VECTORIZATION2_AFTER_REORDER, "AutoVectorization 2, After Apply Memop Reordering") \ > 73: flags(AUTO_VECTORIZATION3_AFTER_ADJUST_LIMIT, "AutoVectorization 3, After Adjusting Pre-Loop Limit") \ > 74: flags(AUTO_VECTORIZATION4_AFTER_APPLY, "AutoVectorization 4, After Apply") \ Note: Renamed and added a 4th tag. src/hotspot/share/opto/superword.cpp line 488: > 486: > 487: DEBUG_ONLY(verify_packs();) > 488: DEBUG_ONLY(verify_no_extract()); Note: moved from `SuperWord::output` src/hotspot/share/opto/superword.cpp line 490: > 488: DEBUG_ONLY(verify_no_extract()); > 489: > 490: return schedule_and_apply(); Note: refactored `schedule` and `output` -> `schedule_and_apply`. 
src/hotspot/share/opto/superword.cpp line 1645: > 1643: return true; > 1644: default: > 1645: return false; Note: moved to `VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long` src/hotspot/share/opto/superword.cpp line 1659: > 1657: Node* pi_def = pi->in(idx); > 1658: if (p0_def != pi_def) { > 1659: return false; Note: refactored `SuperWord::same_inputs` -> `PackSet::isa_unique_input_or_null`. src/hotspot/share/opto/superword.cpp line 1779: > 1777: retValue = UseVectorCmov; > 1778: } else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) { > 1779: // Requires extra vector long -> int conversion. Note: moved comments to method definition. Chose better name. src/hotspot/share/opto/superword.cpp line 1783: > 1781: VectorCastNode::implemented(Op_ConvL2I, size, T_LONG, T_INT); > 1782: } else { > 1783: if (VectorNode::can_use_RShiftI_instead_of_URShiftI(p0, velt_basic_type(p0))) { Note: moved comments to method definition. Chose better name. src/hotspot/share/opto/superword.cpp line 1885: > 1883: // <==> VectorBlend( VectorMaskCmp(GT_O, in1_cmp, in2_cmp), in2_blend, in1_blend) > 1884: mask = bol0->_test.negate(); > 1885: is_negated = true; Note: extracted from `CMove` code in `SuperWord::output` src/hotspot/share/opto/superword.cpp line 2090: > 2088: memops_schedule.dump(); > 2089: } > 2090: #endif Note: debug printing moved to `SuperWord::apply_memops_reordering_with_schedule` src/hotspot/share/opto/superword.cpp line 2208: > 2206: adjust_pre_loop_limit_to_align_main_loop_vectors(); > 2207: > 2208: DEBUG_ONLY(verify_no_extract()); Note: moved these out earlier. src/hotspot/share/opto/superword.cpp line 2316: > 2314: // (4) Apply the vectorization, including re-ordering the memops. > 2315: return apply(memops_schedule); > 2316: } Note: instead of just `schedule_reorder_memops`, we refactor and call `apply`, which does all the C2 graph hacking, including the `schedule_reorder_memops` code. 
src/hotspot/share/opto/superword.cpp line 2332: > 2330: > 2331: return is_success; > 2332: } Note: refactored out all the "apply" code into a nice list, and do the `print_method` code right there too. src/hotspot/share/opto/superword.cpp line 2434: > 2432: assert(cl->is_main_loop(), "SLP should only work on main loops"); > 2433: Compile* C = phase()->C; > 2434: assert(!_packset.is_empty(), "vectorization requires non-empty packset"); Note: runtime check already happens earlier/further up, in `SuperWord::schedule_and_apply`. src/hotspot/share/opto/superword.cpp line 2529: > 2527: > 2528: phase()->C->print_method(PHASE_SUPERWORD3_AFTER_OUTPUT, 4, cl); > 2529: Note: moved to `SuperWord::apply` src/hotspot/share/opto/superword.cpp line 2548: > 2546: if (bool_test._is_negated) { > 2547: // We can cancle out the negation by swapping the blend inputs. > 2548: swap(blend_in1, blend_in2); Note: refactored out to `_packset.get_bool_test`. This will make future refactorings around `SuperWord::apply_vectorization` easier. src/hotspot/share/opto/superword.cpp line 2597: > 2595: } > 2596: } else { > 2597: if (VectorNode::can_use_RShiftI_instead_of_URShiftI(n, velt_basic_type(n))) { Note: moved comments to method definition, and gave it a better name. src/hotspot/share/opto/superword.cpp line 2603: > 2601: vlen_in_bytes = vn->as_Vector()->length_in_bytes(); > 2602: } > 2603: } else if (VectorNode::is_scalar_unary_op_with_equal_input_and_output_types(opc)) { Note: captured opcode check to a dedicated method. src/hotspot/share/opto/superword.cpp line 2608: > 2606: vn = VectorNode::make(opc, in, nullptr, vlen, velt_basic_type(n)); > 2607: vlen_in_bytes = vn->as_Vector()->length_in_bytes(); > 2608: } else if (VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(opc)) { Note: moved comment to definition, gave it better name. 
src/hotspot/share/opto/vectorization.hpp line 1326: > 1324: VTransformBoolTest(const BoolTest::mask mask, bool is_negated) : > 1325: _mask(mask), _is_negated(is_negated) {} > 1326: }; Note: used by `PackSet::get_bool_test`. I gave it a `VTransform` name in anticipation of the `VTransform` refactoring in https://github.com/openjdk/jdk/pull/19261 ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629251270 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629252612 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629252165 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629255418 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629256254 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629253375 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629253937 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629256842 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629258095 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629263409 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629259838 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629261329 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629262955 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629267393 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629265060 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629265742 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629266323 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629266859 PR Review Comment: https://git.openjdk.org/jdk/pull/19573#discussion_r1629268921 From kvn at openjdk.org Sat Jun 8 16:17:14 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: 
Sat, 8 Jun 2024 16:17:14 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions (Intel® APX) [v4] In-Reply-To: <7FpzzRiVoeGjqOjIxTSfxdudzZcx20q7DcKTTVSWhQA=.60cb64ba-3806-4e51-8296-696bad19720d@github.com> References: <7FpzzRiVoeGjqOjIxTSfxdudzZcx20q7DcKTTVSWhQA=.60cb64ba-3806-4e51-8296-696bad19720d@github.com> Message-ID: <1TZX__fVTCvGNWxwyFfucdxunEKOlYbJJ7Bvw46XDTQ=.5cc7ca7c-4bea-4cf9-b6ef-9c79dff84ef4@github.com> On Sat, 8 Jun 2024 04:16:24 GMT, Jatin Bhateja wrote: >> Summary of changes include with the patch:- >> >> 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) >> 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Lazy restored state comparison after OS signal handling. Good. Let me test it. 
------------- PR Review: https://git.openjdk.org/jdk/pull/18562#pullrequestreview-2105876232 From kvn at openjdk.org Sat Jun 8 20:46:24 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Sat, 8 Jun 2024 20:46:24 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions (Intel® APX) [v4] In-Reply-To: <7FpzzRiVoeGjqOjIxTSfxdudzZcx20q7DcKTTVSWhQA=.60cb64ba-3806-4e51-8296-696bad19720d@github.com> References: <7FpzzRiVoeGjqOjIxTSfxdudzZcx20q7DcKTTVSWhQA=.60cb64ba-3806-4e51-8296-696bad19720d@github.com> Message-ID: On Sat, 8 Jun 2024 04:16:24 GMT, Jatin Bhateja wrote: >> Summary of changes include with the patch:- >> >> 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) >> 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. >> >> Kindly review and share your feedback. >> >> Best Regards, >> Jatin > > Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: > > Lazy restored state comparison after OS signal handling. My testing passed. ------------- Marked as reviewed by kvn (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/18562#pullrequestreview-2105937938 From jbhateja at openjdk.org Sun Jun 9 00:50:21 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Sun, 9 Jun 2024 00:50:21 GMT Subject: RFR: 8329031: CPUID feature detection for Advanced Performance Extensions (Intel® APX) [v2] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 19:42:52 GMT, Vladimir Kozlov wrote: >> Jatin Bhateja has updated the pull request incrementally with one additional commit since the last revision: >> >> Review comments resolutions. > > And we don't have HW currently. 
Thanks @vnkozlov and @sviswa7 ------------- PR Comment: https://git.openjdk.org/jdk/pull/18562#issuecomment-2156247123 From jbhateja at openjdk.org Sun Jun 9 00:50:21 2024 From: jbhateja at openjdk.org (Jatin Bhateja) Date: Sun, 9 Jun 2024 00:50:21 GMT Subject: Integrated: 8329031: CPUID feature detection for Advanced Performance Extensions (Intel® APX) In-Reply-To: References: Message-ID: On Mon, 1 Apr 2024 12:01:27 GMT, Jatin Bhateja wrote: > Summary of changes include with the patch:- > > 1) CPUID based feature detection check for Intel APX extension (https://www.intel.com/content/www/us/en/developer/articles/technical/advanced-performance-extensions-apx.html) > 2) Validation during VM initialization for extended GPRs state save / restoration by OS across context switches of java application threads executing JIT compiled code with new APX ISA. > > Kindly review and share your feedback. > > Best Regards, > Jatin This pull request has now been integrated. Changeset: a9413973 Author: Jatin Bhateja URL: https://git.openjdk.org/jdk/commit/a941397327972f130e683167a1b429f17603df46 Stats: 195 lines in 8 files changed: 169 ins; 10 del; 16 mod 8329031: CPUID feature detection for Advanced Performance Extensions (Intel® APX) Reviewed-by: sviswanathan, kvn ------------- PR: https://git.openjdk.org/jdk/pull/18562 From gcao at openjdk.org Mon Jun 10 08:07:12 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 10 Jun 2024 08:07:12 GMT Subject: RFR: 8333652: RISC-V: compiler/vectorapi/VectorGatherMaskFoldingTest.java fails when using RVV In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 05:31:57 GMT, Fei Yang wrote: >> Hi, We are experiencing test failures in test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java on Banana Pi BPI-F3 board (has RVV1.0), see jbs issue for exception information. 
>> >> related C2 instruct: >> https://github.com/openjdk/jdk/blob/326dbb1b139dd1ec1b8605339b91697cdf49da9a/src/hotspot/cpu/riscv/riscv_v.ad#L4805-L4811 >> >> As rvv1.0 manual requirements for vector indexed loads[1]: `Vector unit-stride and constant-stride use the EEW/EMUL encoded in the instruction for the data values, while vector indexed loads and stores use the EEW/EMUL encoded in the instruction for the index values and the SEW/LMUL encoded in vtype for the data values.` >> So in this case where a 64-bit vector index is used, we need to use the vluxei64_v (64-bit indexed load) >> >> ### Testing >> - [x] Run VectorGatherMaskFoldingTest.java on Banana Pi BPI-F3 board (with RVV1.0) >> - [x] test/jdk/jdk/incubator/vector on Banana Pi BPI-F3 board (with RVV1.0) >> - [x] Run VectorGatherMaskFoldingTest.java on SOPHON SG2042 (without RVV1.0) >> >> [1] https://github.com/riscv/riscv-v-spec/blob/v1.0/v-spec.adoc#sec-vector-loadstore-width-encoding > > Looks reasonable. @RealFYang : Thanks for the review. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19564#issuecomment-2157613226 From bkilambi at openjdk.org Mon Jun 10 09:00:21 2024 From: bkilambi at openjdk.org (Bhavana Kilambi) Date: Mon, 10 Jun 2024 09:00:21 GMT Subject: RFR: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction [v8] In-Reply-To: References: <8-_t7nWbR9gZ2_QkfFNuf5M0Q4PMkKJKgwS3ZbHcCxI=.32dc4f11-dec5-468d-afc8-3b4dae285dcb@github.com> <2y-Ag6MxVDJfYl6kM0FYjQA-kzSCekUgAMWAZmkECyQ=.2a2a0a8e-fc67-42a4-bd67-b4ae3b60bcea@github.com> <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> Message-ID: On Fri, 7 Jun 2024 13:27:28 GMT, Emanuel Peter wrote: >> Hi @eme64 , have updated patch with a merge with master. Can you please run testing? Thank you! > > @Bhavana-Kilambi thanks for the merge, just launched it before you pinged me ;) > Please ping me again after the weekend for the results! 
Hi @eme64 , is the testing completed? ------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2157757757 From epeter at openjdk.org Mon Jun 10 09:10:22 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Mon, 10 Jun 2024 09:10:22 GMT Subject: RFR: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction [v12] In-Reply-To: References: Message-ID: <7Bp-_IAqbN_1dVYE5RNGRszPxechqxT4kAboH9zgoTg=.debe658c-f75a-4cb4-a5a5-2cc07c24b19e@github.com> On Fri, 7 Jun 2024 13:30:35 GMT, Bhavana Kilambi wrote: >> Floating-point addition is non-associative, that is adding floating-point elements in arbitrary order may get different value. Specially, Vector API does not define the order of reduction intentionally, which allows platforms to generate more efficient codes [1]. So that needs a node to represent non strictly-ordered add-reduction for floating-point type in C2. >> >> To avoid introducing new nodes, this patch adds a bool field in `AddReductionVF/D` to distinguish whether they require strict order. It also removes `UnorderedReductionNode` and adds a virtual function `bool requires_strict_order()` in `ReductionNode`. Besides `AddReductionVF/D`, other reduction nodes' `requires_strict_order()` have a fixed value. >> >> With this patch, Vector API would always generate non strictly-ordered `AddReductionVF/D' on SVE machines with vector length <= 16B as it is more beneficial to generate non-strictly ordered instructions on such machines compared to strictly ordered ones. >> >> [AArch64] >> On Neon, non strictly-ordered `AddReductionVF/D` cannot be generated. Auto-vectorization has already banned these nodes in JDK-8275275 [2]. >> >> This patch adds matching rules for non strictly-ordered `AddReductionVF/D`. >> >> No effects on other platforms. >> >> [Performance] >> FloatMaxVector.ADDLanes [3] measures the performance of add reduction for floating-point type. 
With this patch, it improves ~3x on my SVE machine (128-bit). >> >> ADDLanes >> >> Benchmark Before After Unit >> FloatMaxVector.ADDLanes 1789.513 5264.226 ops/ms >> >> >> Final code is as below: >> >> Before: >> ` fadda z17.s, p7/m, z17.s, z16.s >> ` >> After: >> >> faddp v17.4s, v21.4s, v21.4s >> faddp s18, v17.2s >> fadd s18, s18, s19 >> >> >> >> >> [Test] >> Full jtreg passed on AArch64 and x86. >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2529 >> [2] https://bugs.openjdk.org/browse/JDK-8275275 >> [3] https://github.com/openjdk/panama-vector/blob/vectorIntrinsics/test/micro/org/openjdk/bench/jdk/incubator/vector/operation/FloatMaxVector.java#L316 > > Bhavana Kilambi has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains 12 additional commits since the last revision: > > - Merge with master > - Make changes in IR rules for JTREG tests > - Modify JTREG IR rules and some style/format changes > - Add dump_spec and JTREG IR tests for Add/Mul Reduction Nodes > - Merge master > - Adjust format for the backend rules changed in previous commit > - Address some more review comments > - Revert to previous indentation > - Add comments, revert to requires_strict_order and other minor changes > - Naming changes: replace strict/non-strict with more technical terms > - ... and 2 more: https://git.openjdk.org/jdk/compare/d923efaf...35e6258d Testing is good, thanks for bearing with all the comments and suggestions. Thanks for the work! ------------- Marked as reviewed by epeter (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/18034#pullrequestreview-2107169313 From bkilambi at openjdk.org Mon Jun 10 09:22:17 2024 From: bkilambi at openjdk.org (Bhavana Kilambi) Date: Mon, 10 Jun 2024 09:22:17 GMT Subject: RFR: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction [v8] In-Reply-To: References: <8-_t7nWbR9gZ2_QkfFNuf5M0Q4PMkKJKgwS3ZbHcCxI=.32dc4f11-dec5-468d-afc8-3b4dae285dcb@github.com> <2y-Ag6MxVDJfYl6kM0FYjQA-kzSCekUgAMWAZmkECyQ=.2a2a0a8e-fc67-42a4-bd67-b4ae3b60bcea@github.com> <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> Message-ID: On Fri, 7 Jun 2024 13:27:28 GMT, Emanuel Peter wrote: >> Hi @eme64 , have updated patch with a merge with master. Can you please run testing? Thank you! > > @Bhavana-Kilambi thanks for the merge, just launched it before you pinged me ;) > Please ping me again after the weekend for the results! Thank you very much for taking time to review @eme64. I have learnt a lot from the comments :) ------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2157812793 From simonis at openjdk.org Mon Jun 10 09:40:16 2024 From: simonis at openjdk.org (Volker Simonis) Date: Mon, 10 Jun 2024 09:40:16 GMT Subject: Integrated: 8333722: Fix CompilerDirectives for non-compiler JVM variants In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 13:36:25 GMT, Volker Simonis wrote: > `DirectivesStack::getMatchingDirective()` relies on the fact that the default directives set is always enabled. 
And that's indeed the case for normal builds with C1 and C2 compilers (see `DirectivesStack::init()` in `compilerDirectives.cpp`): > > > // Create a new dirstack and push a default directive > void DirectivesStack::init() { > CompilerDirectives* _default_directives = new CompilerDirectives(); > char str[] = "*.*"; > const char* error_msg = nullptr; > _default_directives->add_match(str, error_msg); > #if defined(COMPILER1) || INCLUDE_JVMCI > _default_directives->_c1_store->EnableOption = true; > #endif > #ifdef COMPILER2 > if (CompilerConfig::is_c2_enabled()) { > _default_directives->_c2_store->EnableOption = true; > } > #endif > assert(error_msg == nullptr, "Must succeed."); > push(_default_directives); > } > > > However, if we're building a JVM configuration without compilers (e.g. `--with-jvm-variants=core`), this is not the case and `DirectivesStack::getMatchingDirective()` will return the base directive set without incrementing the reference count of its directive: > > > CompilerDirectives* dir = _top; > assert(dir != nullptr, "Must be initialized"); > > while (dir != nullptr) { > if (dir->is_default_directive() || dir->match(method)) { > match = dir->get_for(comp); > assert(match != nullptr, "Consistency"); > if (match->EnableOption) { > // The directiveSet for this compile is also enabled -> success > dir->inc_refcount(); > break; > } > } > dir = dir->next(); > } > } > guarantee(match != nullptr, "There should always be a default directive that matches"); > > // Check for legacy compile commands update, without DirectivesStack_lock > return match->compilecommand_compatibility_init(method); > > > If this directive set will be released, it will delete the corresponding base directive and subsequent usages of the base directive will lead to a segmentation fault. 
> > After [JDK-8329421: Native methods can not be selectively printed](https://bugs.openjdk.org/browse/JDK-8329421) which replaced the call to > > DirectiveSet* directive = DirectivesStack::getDefaultDirective(CompileBroker::compiler(CompLevel_simple)); > > by > > DirectiveSet* directive = DirectivesStack::getMatchingDirective(method, CompileBroker::compiler(CompLevel_simple)); > > in `sharedRuntime.cpp` this issue is now triggered at JVM startup for non-compiler configurations when native wrappers are generated (see https://github.com/openjdk/jdk/pull/18567#... This pull request has now been integrated. Changeset: 5f9d3e3a Author: Volker Simonis URL: https://git.openjdk.org/jdk/commit/5f9d3e3af8342592242cb304b2c219508d56ed3a Stats: 1 line in 1 file changed: 0 ins; 0 del; 1 mod 8333722: Fix CompilerDirectives for non-compiler JVM variants Reviewed-by: kvn ------------- PR: https://git.openjdk.org/jdk/pull/19578 From simonis at openjdk.org Mon Jun 10 10:17:33 2024 From: simonis at openjdk.org (Volker Simonis) Date: Mon, 10 Jun 2024 10:17:33 GMT Subject: [jdk23] RFR: 8333722: Fix CompilerDirectives for non-compiler JVM variants Message-ID: Hi all, This pull request contains a backport of commit 5f9d3e3a from the openjdk/jdk repository. The commit being backported was authored by Volker Simonis on 10 Jun 2024 and was reviewed by Vladimir Kozlov. Thanks! 
------------- Commit messages: - Backport 5f9d3e3af8342592242cb304b2c219508d56ed3a Changes: https://git.openjdk.org/jdk/pull/19622/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19622&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333722 Stats: 1 line in 1 file changed: 0 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19622.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19622/head:pull/19622 PR: https://git.openjdk.org/jdk/pull/19622 From adinn at openjdk.org Mon Jun 10 10:31:20 2024 From: adinn at openjdk.org (Andrew Dinn) Date: Mon, 10 Jun 2024 10:31:20 GMT Subject: RFR: 8321308: AArch64: Fix matching predication for cbz/cbnz [v2] In-Reply-To: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> References: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> Message-ID: On Thu, 6 Jun 2024 14:35:29 GMT, Fei Gao wrote: >> For array length check like: >> >> if (a.length > 0) { >> [Block 1] >> } else { >> [Block 2] >> } >> >> >> Since `a.length` is unsigned, it's semantically equivalent to: >> >> if (a.length != 0) { >> [Block 1] >> } else { >> [Block 2] >> } >> >> >> On aarch64 port, we can do the conversion like above, during c2 compiler instruction matching, for certain unsigned integral comparisons. >> >> For example, >> >> cmpw w11, #0 # unsigned >> bls label # unsigned >> [Block 1] >> >> label: >> [Block 2] >> >> >> can be converted to: >> >> cbz w11, label >> [Block 1] >> >> label: >> [Block 2] >> >> >> Currently, we have some matching rules to do the conversion [[1]](https://github.com/openjdk/jdk/blob/4f1a10f84bcfadef263a0890b6834ccd3d5bb52f/src/hotspot/cpu/aarch64/aarch64.ad#L15688). But the predicate here [[2]](https://github.com/openjdk/jdk/blob/4f1a10f84bcfadef263a0890b6834ccd3d5bb52f/src/hotspot/cpu/aarch64/aarch64.ad#L5631) matches wrong `BoolTest` masks, so these rules fail to convert. 
I guess it's a typo introduced in [JDK-8160006](https://bugs.openjdk.org/browse/JDK-8160006). The patch fixes it. > > Fei Gao has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains three additional commits since the last revision: > > - Redefine the interface for cmpOpUEqNeLeGt > - Merge branch 'master' into fg8321308 > - 8321308: AArch64: Fix matching predication for cbz/cbnz > > For array length check like: > ``` > if (a.length > 0) { > [Block 1] > } else { > [Block 2] > } > ``` > > Since `a.length` is unsigned, it's semantically equivalent to: > ``` > if (a.length != 0) { > [Block 1] > } else { > [Block 2] > } > ``` > > On aarch64 port, we can do the conversion like above, during c2 > compiler instruction matching, for certain unsigned integral > comparisons. > > For example, > ``` > cmpw w11, #0 # unsigned > bls label # unsigned > [Block 1] > > label: > [Block 2] > ``` > > can be converted to: > ``` > cbz w11, label > [Block 1] > > label: > [Block 2] > ``` > > Currently, we have some matching rules to do the conversion[1]. > But the predicate here[2] matches wrong `BoolTest` masks, > so these rules fail to convert. I guess it's a typo introduced > in JDK-8160006. The patch fixes it. > > [1] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L16179 > [2] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L6140 Thanks, Fei. Looks good. ------------- Marked as reviewed by adinn (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/16989#pullrequestreview-2107413052 From pminborg at openjdk.org Mon Jun 10 11:55:29 2024 From: pminborg at openjdk.org (Per Minborg) Date: Mon, 10 Jun 2024 11:55:29 GMT Subject: RFR: 8330465: Stable Values and Collections (Internal) [v20] In-Reply-To: References: <-KSimQo5kkmCzzMShqGe5QZ9yCSzpWL98gN13v4wP0k=.11dd8d06-18a6-4577-8342-66632cea0b6e@github.com> Message-ID: On Fri, 17 May 2024 09:31:33 GMT, Per Minborg wrote: >> # Stable Values & Collections (Internal) >> >> ## Summary >> This PR proposes to introduce an internal _Stable Values & Collections_ API, which provides immutable value holders where elements are initialized _at most once_. Stable Values & Collections offer the performance and safety benefits of final fields while offering greater flexibility as to the timing of initialization. >> >> ## Goals >> * Provide an easy and intuitive API to describe value holders that can change at most once. >> * Decouple declaration from initialization without significant footprint or performance penalties. >> * Reduce the amount of static initializer and/or field initialization code. >> * Uphold integrity and consistency, even in a multi-threaded environment. >> >> For more details, see the draft JEP: https://openjdk.org/jeps/8312611 >> >> ## Performance >> Performance compared to instance variables using two `AtomicReference` and two protected by double-checked locking under concurrent access by all threads: >> >> >> Benchmark Mode Cnt Score Error Units >> StableBenchmark.atomic thrpt 10 259.478 ± 36.809 ops/us >> StableBenchmark.dcl thrpt 10 225.710 ± 26.638 ops/us >> StableBenchmark.stable thrpt 10 4382.478 ± 1151.472 ops/us <- StableValue significantly faster >> >> >> Performance compared to static variables protected by `AtomicReference`, class-holder idiom holder, and double-checked locking (all threads): >> >> >> Benchmark Mode Cnt Score Error Units >> StableStaticBenchmark.atomic thrpt 10 6487.835 ± 
385.639 ops/us >> StableStaticBenchmark.dcl thrpt 10 6605.239 ± 210.610 ops/us >> StableStaticBenchmark.stable thrpt 10 14338.239 ± 1426.874 ops/us >> StableStaticBenchmark.staticCHI thrpt 10 13780.341 ± 1839.651 ops/us >> >> >> Performance for stable lists (thread safe) in both instance and static contexts whereby we access a single value compared to `ArrayList` instances (which are not thread-safe) (all threads): >> >> >> Benchmark Mode Cnt Score Error Units >> StableListElementBenchmark.instanceArrayList thrpt 10 5812.992 ± 1169.730 ops/us >> StableListElementBenchmark.instanceList thrpt 10 4818.643 ± 704.893 ops/us >> StableListElementBenchmark... > > Per Minborg has updated the pull request incrementally with two additional commits since the last revision: > > - Add benchmarks for memoized IntFunction and Function > - Add benchmark for memoized supplier A new PR will be made available shortly. ------------- PR Comment: https://git.openjdk.org/jdk/pull/18794#issuecomment-2158140833 From pminborg at openjdk.org Mon Jun 10 11:55:30 2024 From: pminborg at openjdk.org (Per Minborg) Date: Mon, 10 Jun 2024 11:55:30 GMT Subject: Withdrawn: 8330465: Stable Values and Collections (Internal) In-Reply-To: <-KSimQo5kkmCzzMShqGe5QZ9yCSzpWL98gN13v4wP0k=.11dd8d06-18a6-4577-8342-66632cea0b6e@github.com> References: <-KSimQo5kkmCzzMShqGe5QZ9yCSzpWL98gN13v4wP0k=.11dd8d06-18a6-4577-8342-66632cea0b6e@github.com> Message-ID: On Tue, 16 Apr 2024 11:47:23 GMT, Per Minborg wrote: > # Stable Values & Collections (Internal) > > ## Summary > This PR proposes to introduce an internal _Stable Values & Collections_ API, which provides immutable value holders where elements are initialized _at most once_. Stable Values & Collections offer the performance and safety benefits of final fields while offering greater flexibility as to the timing of initialization. > > ## Goals > * Provide an easy and intuitive API to describe value holders that can change at most once. 
> * Decouple declaration from initialization without significant footprint or performance penalties. > * Reduce the amount of static initializer and/or field initialization code. > * Uphold integrity and consistency, even in a multi-threaded environment. > > For more details, see the draft JEP: https://openjdk.org/jeps/8312611 > > ## Performance > Performance compared to instance variables using two `AtomicReference` and two protected by double-checked locking under concurrent access by all threads: > > > Benchmark Mode Cnt Score Error Units > StableBenchmark.atomic thrpt 10 259.478 ± 36.809 ops/us > StableBenchmark.dcl thrpt 10 225.710 ± 26.638 ops/us > StableBenchmark.stable thrpt 10 4382.478 ± 1151.472 ops/us <- StableValue significantly faster > > > Performance compared to static variables protected by `AtomicReference`, class-holder idiom holder, and double-checked locking (all threads): > > > Benchmark Mode Cnt Score Error Units > StableStaticBenchmark.atomic thrpt 10 6487.835 ± 385.639 ops/us > StableStaticBenchmark.dcl thrpt 10 6605.239 ± 210.610 ops/us > StableStaticBenchmark.stable thrpt 10 14338.239 ± 1426.874 ops/us > StableStaticBenchmark.staticCHI thrpt 10 13780.341 ± 1839.651 ops/us > > > Performance for stable lists (thread safe) in both instance and static contexts whereby we access a single value compared to `ArrayList` instances (which are not thread-safe) (all threads): > > > Benchmark Mode Cnt Score Error Units > StableListElementBenchmark.instanceArrayList thrpt 10 5812.992 ± 1169.730 ops/us > StableListElementBenchmark.instanceList thrpt 10 4818.643 ± 704.893 ops/us > StableListElementBenchmark.staticArrayList thrpt 10 7614.741 ± 564.777 ops/us > StableListElementBe... This pull request has been closed without being integrated. 
------------- PR: https://git.openjdk.org/jdk/pull/18794 From thartmann at openjdk.org Mon Jun 10 12:02:11 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Mon, 10 Jun 2024 12:02:11 GMT Subject: [jdk23] RFR: 8333722: Fix CompilerDirectives for non-compiler JVM variants In-Reply-To: References: Message-ID: <6bSk0gmZnIh_XWbur2xHucbVpacNAQquBknL7VWH9NM=.246183f0-be9d-40dd-900c-3711598e2179@github.com> On Mon, 10 Jun 2024 10:12:04 GMT, Volker Simonis wrote: > Hi all, > > This pull request contains a backport of commit 5f9d3e3a from the openjdk/jdk repository. > > The commit being backported was authored by Volker Simonis on 10 Jun 2024 and was reviewed by Vladimir Kozlov. > > Thanks! Looks good and trivial. ------------- Marked as reviewed by thartmann (Reviewer). PR Review: https://git.openjdk.org/jdk/pull/19622#pullrequestreview-2107591154 From simonis at openjdk.org Mon Jun 10 12:40:11 2024 From: simonis at openjdk.org (Volker Simonis) Date: Mon, 10 Jun 2024 12:40:11 GMT Subject: [jdk23] RFR: 8333722: Fix CompilerDirectives for non-compiler JVM variants In-Reply-To: <6bSk0gmZnIh_XWbur2xHucbVpacNAQquBknL7VWH9NM=.246183f0-be9d-40dd-900c-3711598e2179@github.com> References: <6bSk0gmZnIh_XWbur2xHucbVpacNAQquBknL7VWH9NM=.246183f0-be9d-40dd-900c-3711598e2179@github.com> Message-ID: On Mon, 10 Jun 2024 11:59:53 GMT, Tobias Hartmann wrote: >> Hi all, >> >> This pull request contains a backport of commit 5f9d3e3a from the openjdk/jdk repository. >> >> The commit being backported was authored by Volker Simonis on 10 Jun 2024 and was reviewed by Vladimir Kozlov. >> >> Thanks! > > Looks good and trivial. Thanks @TobiHartmann! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19622#issuecomment-2158233536 From gcao at openjdk.org Mon Jun 10 13:47:17 2024 From: gcao at openjdk.org (Gui Cao) Date: Mon, 10 Jun 2024 13:47:17 GMT Subject: Integrated: 8333652: RISC-V: compiler/vectorapi/VectorGatherMaskFoldingTest.java fails when using RVV In-Reply-To: References: Message-ID: On Wed, 5 Jun 2024 15:26:57 GMT, Gui Cao wrote: > Hi, We are experiencing test failures in test/hotspot/jtreg/compiler/vectorapi/VectorGatherMaskFoldingTest.java on Banana Pi BPI-F3 board (has RVV1.0), see jbs issue for exception information. > > related C2 instruct: > https://github.com/openjdk/jdk/blob/326dbb1b139dd1ec1b8605339b91697cdf49da9a/src/hotspot/cpu/riscv/riscv_v.ad#L4805-L4811 > > As rvv1.0 manual requirements for vector indexed loads[1]: `Vector unit-stride and constant-stride use the EEW/EMUL encoded in the instruction for the data values, while vector indexed loads and stores use the EEW/EMUL encoded in the instruction for the index values and the SEW/LMUL encoded in vtype for the data values.` > So in this case where a 64-bit vector index is used, we need to use the vluxei64_v (64-bit indexed load) > > ### Testing > - [x] Run VectorGatherMaskFoldingTest.java on Banana Pi BPI-F3 board (with RVV1.0) > - [x] test/jdk/jdk/incubator/vector on Banana Pi BPI-F3 board (with RVV1.0) > - [x] Run VectorGatherMaskFoldingTest.java on SOPHON SG2042 (without RVV1.0) > > [1] https://github.com/riscv/riscv-v-spec/blob/v1.0/v-spec.adoc#sec-vector-loadstore-width-encoding This pull request has now been integrated. 
Changeset: ce5727df Author: Gui Cao Committer: Fei Yang URL: https://git.openjdk.org/jdk/commit/ce5727df4436425b24b89f24c6e6b708575ec7c6 Stats: 86 lines in 2 files changed: 70 ins; 2 del; 14 mod 8333652: RISC-V: compiler/vectorapi/VectorGatherMaskFoldingTest.java fails when using RVV Reviewed-by: fyang ------------- PR: https://git.openjdk.org/jdk/pull/19564 From duke at openjdk.org Mon Jun 10 13:47:22 2024 From: duke at openjdk.org (Shaojin Wen) Date: Mon, 10 Jun 2024 13:47:22 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null Message-ID: After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. ------------- Commit messages: - Optimization for StringBuilder append(boolean) & appendNull Changes: https://git.openjdk.org/jdk/pull/19626/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19626&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333893 Stats: 137 lines in 5 files changed: 54 ins; 41 del; 42 mod Patch: https://git.openjdk.org/jdk/pull/19626.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19626/head:pull/19626 PR: https://git.openjdk.org/jdk/pull/19626 From duke at openjdk.org Mon Jun 10 13:47:22 2024 From: duke at openjdk.org (Shaojin Wen) Date: Mon, 10 Jun 2024 13:47:22 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 12:12:58 GMT, Shaojin Wen wrote: > After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. > > This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. # 1. Compare with the master branch 1. master (`a6fc2f8`) https://github.com/wenshao/jdk/tree/upstream_master_a6fc2f8 2. 
current (`5e815`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406 # 2. Benchmark Commands make test TEST="micro:java.lang.StringBuilders.toStringCharWithBool8" make test TEST="micro:java.lang.StringBuilders.toStringCharWithNull8" # 3. Benchmark Numbers The performance numbers under MacBookPro M1 Pro are as follows: -Benchmark Mode Cnt Score Error Units #master (a6fc2f8) -StringBuilders.toStringCharWithBool8Latin1 avgt 15 7.371 ± 0.003 ns/op -StringBuilders.toStringCharWithBool8Utf16 avgt 15 9.613 ± 0.018 ns/op -StringBuilders.toStringCharWithNull8Latin1 avgt 15 7.071 ± 0.003 ns/op -StringBuilders.toStringCharWithNull8Utf16 avgt 15 9.296 ± 0.016 ns/op +Benchmark Mode Cnt Score Error Units #current (5e815) +StringBuilders.toStringCharWithBool8Latin1 avgt 15 6.515 ± 0.121 ns/op +11.61% +StringBuilders.toStringCharWithBool8Utf16 avgt 15 8.654 ± 0.035 ns/op +9.97% +StringBuilders.toStringCharWithNull8Latin1 avgt 15 5.550 ± 0.010 ns/op +21.51% +StringBuilders.toStringCharWithNull8Utf16 avgt 15 8.108 ± 0.041 ns/op +12.77% # 1. Compare with unsafe branch 1. current (`5e815`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406 2. unsafe (`adc220`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406_unsafe I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it? # 2. Benchmark Commands make test TEST="micro:java.lang.StringBuilders.toStringCharWithBool8" make test TEST="micro:java.lang.StringBuilders.toStringCharWithNull8" # 3. 
Implementation of Unsafe Branch class AbstractStringBuilder { static final Unsafe UNSAFE = Unsafe.getUnsafe(); static final int NULL_LATIN1; static final int TRUE_LATIN1; static final int FALS_LATIN1; static final long NULL_UTF16; static final long TRUE_UTF16; static final long FALS_UTF16; static { byte[] bytes4 = new byte[4]; byte[] bytes8 = new byte[8]; bytes4[0] = 'n'; bytes4[1] = 'u'; bytes4[2] = 'l'; bytes4[3] = 'l'; NULL_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); NULL_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); bytes4[0] = 't'; bytes4[1] = 'r'; bytes4[2] = 'u'; bytes4[3] = 'e'; TRUE_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); TRUE_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); bytes4[0] = 'f'; bytes4[1] = 'a'; bytes4[2] = 'l'; bytes4[3] = 's'; FALS_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); FALS_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); } private AbstractStringBuilder appendNull() { ensureCapacityInternal(count + 4); int count = this.count; byte[] val = this.value; if (isLatin1()) { UNSAFE.putInt( val, Unsafe.ARRAY_BYTE_BASE_OFFSET + count, NULL_LATIN1); } else { UNSAFE.putLong( val, Unsafe.ARRAY_BYTE_BASE_OFFSET + (count << 1), NULL_UTF16); } this.count = count + 4; return this; } public AbstractStringBuilder append(boolean b) { int count = this.count; int spaceNeeded = count + (b ? 4 : 5); ensureCapacityInternal(spaceNeeded); byte[] val = this.value; if (isLatin1()) { UNSAFE.putInt( val, Unsafe.ARRAY_BYTE_BASE_OFFSET + count, b ? TRUE_LATIN1 : FALS_LATIN1); if (!b) { val[count + 4] = 'e'; } } else { UNSAFE.putLong( val, Unsafe.ARRAY_BYTE_BASE_OFFSET + (count << 1), b ? TRUE_UTF16 : FALS_UTF16); if (!b) { StringUTF16.putChar(val, count + 4, 'e'); } } this.count = spaceNeeded; return this; } } # 4. 
Benchmark Numbers The performance numbers under MacBookPro M1 Pro are as follows: -Benchmark Mode Cnt Score Error Units # unsafe (adc220) -StringBuilders.toStringCharWithBool8Latin1 avgt 15 6.415 ± 0.061 ns/op -StringBuilders.toStringCharWithBool8Utf16 avgt 15 7.307 ± 0.013 ns/op -StringBuilders.toStringCharWithNull8Latin1 avgt 15 5.443 ± 0.011 ns/op -StringBuilders.toStringCharWithNull8Utf16 avgt 15 6.944 ± 0.102 ns/op +Benchmark Mode Cnt Score Error Units #current (5e815) +StringBuilders.toStringCharWithBool8Latin1 avgt 15 6.515 ± 0.121 ns/op -1.55% +StringBuilders.toStringCharWithBool8Utf16 avgt 15 8.654 ± 0.035 ns/op -18.44% +StringBuilders.toStringCharWithNull8Latin1 avgt 15 5.550 ± 0.010 ns/op -1.96% +StringBuilders.toStringCharWithNull8Utf16 avgt 15 8.108 ± 0.041 ns/op -16.76% ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2158201904 PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2158296234 From liach at openjdk.org Mon Jun 10 14:22:11 2024 From: liach at openjdk.org (Chen Liang) Date: Mon, 10 Jun 2024 14:22:11 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 12:12:58 GMT, Shaojin Wen wrote: > After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. > > This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. I think for that C2 JIT to work, we need to merge the `'t' 'r' 'u' 'e'` ascii bytes into an int constant. Same for `false`. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2158504397 From simonis at openjdk.org Mon Jun 10 14:34:15 2024 From: simonis at openjdk.org (Volker Simonis) Date: Mon, 10 Jun 2024 14:34:15 GMT Subject: [jdk23] Integrated: 8333722: Fix CompilerDirectives for non-compiler JVM variants In-Reply-To: References: Message-ID: <4bxvLRQqYaDWAmWN_tLC1pLOkqTfjsGmO4pPvNr_FDM=.eb2923ff-64ff-4b10-aa29-e8eb60fc5cfb@github.com> On Mon, 10 Jun 2024 10:12:04 GMT, Volker Simonis wrote: > Hi all, > > This pull request contains a backport of commit 5f9d3e3a from the openjdk/jdk repository. > > The commit being backported was authored by Volker Simonis on 10 Jun 2024 and was reviewed by Vladimir Kozlov. > > Thanks! This pull request has now been integrated. Changeset: fdbc2b24 Author: Volker Simonis URL: https://git.openjdk.org/jdk/commit/fdbc2b24d33bd650f01ea4edfa716838b28d3612 Stats: 1 line in 1 file changed: 0 ins; 0 del; 1 mod 8333722: Fix CompilerDirectives for non-compiler JVM variants Reviewed-by: thartmann Backport-of: 5f9d3e3af8342592242cb304b2c219508d56ed3a ------------- PR: https://git.openjdk.org/jdk/pull/19622 From epeter at openjdk.org Mon Jun 10 14:35:13 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Mon, 10 Jun 2024 14:35:13 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: Message-ID: <9_OMKYvdx-xW6KC3sKovIYawgaRZfkKyvU97jcuskzM=.484f62c9-521c-47b5-a68a-6abc89c0e768@github.com> On Mon, 10 Jun 2024 13:04:08 GMT, Shaojin Wen wrote: >> After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values ??into larger stores. >> >> This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. > > # 1. Compare with unsafe branch > > 1. current (`5e815`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406 > 2. 
unsafe (`adc220`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406_unsafe > > I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it? > > # 2. Benchmark Commands > > make test TEST="micro:java.lang.StringBuilders.toStringCharWithBool8" > make test TEST="micro:java.lang.StringBuilders.toStringCharWithNull8" > > > # 3. Implementation of Unsafe Branch > > class AbstractStringBuilder { > static final Unsafe UNSAFE = Unsafe.getUnsafe(); > > static final int NULL_LATIN1; > static final int TRUE_LATIN1; > static final int FALS_LATIN1; > > static final long NULL_UTF16; > static final long TRUE_UTF16; > static final long FALS_UTF16; > > static { > byte[] bytes4 = new byte[4]; > byte[] bytes8 = new byte[8]; > > bytes4[0] = 'n'; > bytes4[1] = 'u'; > bytes4[2] = 'l'; > bytes4[3] = 'l'; > NULL_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); > StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); > NULL_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); > > bytes4[0] = 't'; > bytes4[1] = 'r'; > bytes4[2] = 'u'; > bytes4[3] = 'e'; > TRUE_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); > StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); > TRUE_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); > > bytes4[0] = 'f'; > bytes4[1] = 'a'; > bytes4[2] = 'l'; > bytes4[3] = 's'; > FALS_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); > StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); > FALS_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); > } > > private AbstractStringBuilder appendNull() { > ensureCapacityInternal(count + 4); > int count = this.count; > byte[] val = this.value; > if (isLatin1()) { > UNSAFE.putInt( > val, > Unsafe.ARRAY_BYTE_BASE_OFFSET + count, > NULL_LATIN1); > } else { > UNSAFE.putLong( > val, > Unsafe.ARRAY_BYTE_BASE_OFFSET + (count << 1), > NULL_UTF16); > } > this.count = count + 4; > return this; > } > > public 
AbstractStringBuilder append(boolean b) { > int count = th... @wenshao > I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it? Have you tried to see if the optimization actually was done/taken? You can use the `TraceMergeStores` flag. Can you present the generated assembly code of the benchmarks, and explain the difference based on the generated assembly code? You can run JMH benchmarks with `perf`. These two blogs may help you: http://psy-lob-saw.blogspot.com/2015/07/jmh-perfasm.html https://shipilev.net/blog/2016/arrays-wisdom-ancients/#_meet_jmh_prof_perfasm @liach I don't think it makes a difference if it is `int` or `byte` constants. Or what exactly is the code change you are proposing? ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2158533469 From duke at openjdk.org Mon Jun 10 15:06:20 2024 From: duke at openjdk.org (Ferenc Rakoczi) Date: Mon, 10 Jun 2024 15:06:20 GMT Subject: RFR: 8333867: SHA3 performance can be improved Message-ID: This PR removes some unnecessary conversions between byte arrays and long arrays during SHA3 digest computations. 
------------- Commit messages: - 8333867: SHA3 performance can be improved Changes: https://git.openjdk.org/jdk/pull/19632/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19632&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333867 Stats: 63 lines in 3 files changed: 8 ins; 32 del; 23 mod Patch: https://git.openjdk.org/jdk/pull/19632.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19632/head:pull/19632 PR: https://git.openjdk.org/jdk/pull/19632 From liach at openjdk.org Mon Jun 10 15:59:11 2024 From: liach at openjdk.org (Chen Liang) Date: Mon, 10 Jun 2024 15:59:11 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 12:12:58 GMT, Shaojin Wen wrote: > After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values ??into larger stores. > > This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. You are right, I was thinking about the case where you have 2 short variables, you should combine them into a long explicitly for C2 to generate a 4-byte write, otherwise it would be 2 2-bytes. Omitted the constant part which already eliminates this restriction. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2158743190 From kvn at openjdk.org Mon Jun 10 17:28:14 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Mon, 10 Jun 2024 17:28:14 GMT Subject: RFR: 8333867: SHA3 performance can be improved In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 15:01:55 GMT, Ferenc Rakoczi wrote: > This PR removes some unnecessary conversions between byte arrays and long arrays during SHA3 digest computations. src/java.base/share/classes/sun/security/provider/SHA3.java line 98: > 96: @IntrinsicCandidate > 97: private void implCompress0(byte[] b, int ofs) { > 98: b2lLittle(b, ofs, longBuf, 0, blockSize); What about BigEndian machines? 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19632#discussion_r1633597451 From duke at openjdk.org Mon Jun 10 17:33:11 2024 From: duke at openjdk.org (Ferenc Rakoczi) Date: Mon, 10 Jun 2024 17:33:11 GMT Subject: RFR: 8333867: SHA3 performance can be improved In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 17:25:21 GMT, Vladimir Kozlov wrote: >> This PR removes some unnecessary conversions between byte arrays and long arrays during SHA3 digest computations. > > src/java.base/share/classes/sun/security/provider/SHA3.java line 98: > >> 96: @IntrinsicCandidate >> 97: private void implCompress0(byte[] b, int ofs) { >> 98: b2lLittle(b, ofs, longBuf, 0, blockSize); > > What about BigEndian machines? According to the SHA3 algorithm specification, this byte array should be interpreted as a little endian long array. b2lLittle() does just that on both little and big endian machines. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19632#discussion_r1633606439 From kvn at openjdk.org Mon Jun 10 17:51:12 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Mon, 10 Jun 2024 17:51:12 GMT Subject: RFR: 8333867: SHA3 performance can be improved In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 17:29:31 GMT, Ferenc Rakoczi wrote: >> src/java.base/share/classes/sun/security/provider/SHA3.java line 98: >> >>> 96: @IntrinsicCandidate >>> 97: private void implCompress0(byte[] b, int ofs) { >>> 98: b2lLittle(b, ofs, longBuf, 0, blockSize); >> >> What about BigEndian machines? > > According to the SHA3 algorithm specification, this byte array should be interpreted as a little endian long array. b2lLittle() does just that on both little and big endian machines. Okay. 
------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19632#discussion_r1633627850 From kvn at openjdk.org Mon Jun 10 18:46:13 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Mon, 10 Jun 2024 18:46:13 GMT Subject: RFR: 8333867: SHA3 performance can be improved In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 15:01:55 GMT, Ferenc Rakoczi wrote: > This PR removes some unnecessary conversions between byte arrays and long arrays during SHA3 digest computations. src/java.base/share/classes/sun/security/provider/SHA3.java line 100: > 98: b2lLittle(b, ofs, longBuf, 0, blockSize); > 99: for (int i = 0; i < blockSize / 8; i++) { > 100: state[i] ^= longBuf[i]; Clever. So the intrinsic (C2 code) still generates code corresponding to the original loop with `byte b[]` array. This will be confusing. It will also slow down execution in the Interpreter, since there is an additional array copy. New code also assumes that `buffer.length == blockSize` and `(buffer.length % 8) == 0`. I hope there are some assertions/checks in the Java code to verify that. Someone from core-libs has to review this. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19632#discussion_r1633688295 From eastigeevich at openjdk.org Mon Jun 10 20:10:27 2024 From: eastigeevich at openjdk.org (Evgeny Astigeevich) Date: Mon, 10 Jun 2024 20:10:27 GMT Subject: RFR: 8333891: Method excluded with directive is not compiled after removal of directive Message-ID: <2xstE3V0PD8FGcijx_THSX1YgIJ7fZLponoL7b96TiY=.04ecae5f-9e3a-4c26-9893-72822f31c753@github.com> We can exclude Java methods from compilation with compiler directives. Later we can remove those directives. This PR fixes a bug where, after removal of those directives, the Java methods do not become compilable again. A regression test is added. Tested fastdebug build with a new test and tier1 tests. 
------------- Commit messages: - 8333891: Method excluded with directive is not compiled after removal of directive Changes: https://git.openjdk.org/jdk/pull/19637/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19637&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8333891 Stats: 122 lines in 2 files changed: 122 ins; 0 del; 0 mod Patch: https://git.openjdk.org/jdk/pull/19637.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19637/head:pull/19637 PR: https://git.openjdk.org/jdk/pull/19637 From duke at openjdk.org Mon Jun 10 21:12:13 2024 From: duke at openjdk.org (Ferenc Rakoczi) Date: Mon, 10 Jun 2024 21:12:13 GMT Subject: RFR: 8333867: SHA3 performance can be improved In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 18:43:49 GMT, Vladimir Kozlov wrote: >> This PR removes some unnecessary conversions between byte arrays and long arrays during SHA3 digest computations. > > src/java.base/share/classes/sun/security/provider/SHA3.java line 100: > >> 98: b2lLittle(b, ofs, longBuf, 0, blockSize); >> 99: for (int i = 0; i < blockSize / 8; i++) { >> 100: state[i] ^= longBuf[i]; > > Clever. So the intrinsic (C2 code) still generates code corresponding original loop with `byte b[]` array. This will be confusing. It will also slowdown execution in Interpreter so - additional array copy. > > New code also assumes that `buffer.length == blockSize` and `(buffer.length % 8) == 0`. I hope there is some assertions/checks in java code to verify that. > > Some one from core-libs have to review this. Well, the intrinsic function treats the input and state as long arrays anyways, and so it only works on little endian architectures, where the conversion is a no-op. 
There is no additional array copy, this b2lLittle() call used to be in the keccak() method (along with the conversion back to byte array), the point of this whole change is that only one of these conversions should be done with every keccak() call (an additional benefit is that the xor and the corresponding loads+store is done on longs, not on bytes). ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19632#discussion_r1633830998 From duke at openjdk.org Mon Jun 10 23:13:13 2024 From: duke at openjdk.org (Shaojin Wen) Date: Mon, 10 Jun 2024 23:13:13 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: <9_OMKYvdx-xW6KC3sKovIYawgaRZfkKyvU97jcuskzM=.484f62c9-521c-47b5-a68a-6abc89c0e768@github.com> References: <9_OMKYvdx-xW6KC3sKovIYawgaRZfkKyvU97jcuskzM=.484f62c9-521c-47b5-a68a-6abc89c0e768@github.com> Message-ID: On Mon, 10 Jun 2024 14:32:46 GMT, Emanuel Peter wrote: >> # 1. Compare with unsafe branch >> >> 1. current (`5e815`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406 >> 2. unsafe (`adc220`) https://github.com/wenshao/jdk/tree/optim_str_builder_append_202406_unsafe >> >> I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it? >> >> # 2. Benchmark Commands >> >> make test TEST="micro:java.lang.StringBuilders.toStringCharWithBool8" >> make test TEST="micro:java.lang.StringBuilders.toStringCharWithNull8" >> >> >> # 3. 
Implementation of Unsafe Branch >> >> class AbstractStringBuilder { >> static final Unsafe UNSAFE = Unsafe.getUnsafe(); >> >> static final int NULL_LATIN1; >> static final int TRUE_LATIN1; >> static final int FALS_LATIN1; >> >> static final long NULL_UTF16; >> static final long TRUE_UTF16; >> static final long FALS_UTF16; >> >> static { >> byte[] bytes4 = new byte[4]; >> byte[] bytes8 = new byte[8]; >> >> bytes4[0] = 'n'; >> bytes4[1] = 'u'; >> bytes4[2] = 'l'; >> bytes4[3] = 'l'; >> NULL_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); >> NULL_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> >> bytes4[0] = 't'; >> bytes4[1] = 'r'; >> bytes4[2] = 'u'; >> bytes4[3] = 'e'; >> TRUE_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); >> TRUE_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> >> bytes4[0] = 'f'; >> bytes4[1] = 'a'; >> bytes4[2] = 'l'; >> bytes4[3] = 's'; >> FALS_LATIN1 = UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> StringUTF16.inflate(bytes4, 0, bytes8, 0, 4); >> FALS_UTF16 = UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET); >> } >> >> private AbstractStringBuilder appendNull() { >> ensureCapacityInternal(count + 4); >> int count = this.count; >> byte[] val = this.value; >> if (isLatin1()) { >> UNSAFE.putInt( >> val, >> Unsafe.ARRAY_BYTE_BASE_OFFSET + count, >> NULL_LATIN1); >> } else { >> UNSAFE.putLong( >> val, >> Unsafe.ARRAY_BYTE_BASE_OFFSET + (count << 1), >> NULL_UTF16); >> ... > > @wenshao >> I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it? > > Have you tried to see if the optimization actually was done/taken? You can use the `TraceMergeStores,` flag. Can you present the generated assembly code of the benchmarks, and explain the difference based on the generated assembly code? 
You can run JMH penchmarks with `perf`. These two blogs may help you: > > http://psy-lob-saw.blogspot.com/2015/07/jmh-perfasm.html > https://shipilev.net/blog/2016/arrays-wisdom-ancients/#_meet_jmh_prof_perfasm > > @liach I don't think it makes a difference if it is `int` or `byte` constants. Or what exactly is the code change you are proposing? @eme64 It seems that when the following code uses StringUTF16.putChar, C2's optimization is not as good as the manual merging and storage effect. class AbstractStringBuilder { private AbstractStringBuilder appendNull() { // ... StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l'); // ... } public AbstractStringBuilder append(boolean b) { // ... StringUTF16.putCharsAt(val, count, 't', 'r', 'u', 'e'); // ... StringUTF16.putCharsAt(val, count, 'f', 'a', 'l', 's', 'e'); // ... } } class StringUTF16 { public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) { putChar(value, i , c1); putChar(value, i + 1, c2); putChar(value, i + 2, c3); putChar(value, i + 3, c4); } @IntrinsicCandidate // intrinsic performs no bounds checks static void putChar(byte[] val, int index, int c) { assert index >= 0 && index < length(val) : "Trusted caller missed bounds check"; index <<= 1; val[index++] = (byte)(c >> HI_BYTE_SHIFT); val[index] = (byte)(c >> LO_BYTE_SHIFT); } } The code for manually merging storage is as follows, without using Unsafe: class AbstractStringBuilder { static final long NULL_UTF16; static final long TRUE_UTF16; static final long FALS_UTF16; static { byte[] bytes = new byte[8]; StringUTF16.putCharsAt(bytes, 0, 'n', 'u', 'l', 'l'); NULL_UTF16 = getLong(bytes, 0); StringUTF16.putCharsAt(bytes, 0, 't', 'r', 'u', 'e'); TRUE_UTF16 = getLong(bytes, 0); StringUTF16.putCharsAt(bytes, 0, 'f', 'a', 'l', 's'); FALS_UTF16 = getLong(bytes, 0); } private static long getLong(byte[] bytes, int offset) { return (((long)bytes[offset ] & 0xff) ) | (((long)bytes[offset + 1] & 0xff) << 8) | (((long)bytes[offset + 2] 
& 0xff) << 16) | (((long)bytes[offset + 3] & 0xff) << 24) | (((long)bytes[offset + 4] & 0xff) << 32) | (((long)bytes[offset + 5] & 0xff) << 40) | (((long)bytes[offset + 6] & 0xff) << 48) | (((long)bytes[offset + 7] & 0xff) << 56); } private static void setLong(byte[] array, int offset, long value) { array[offset] = (byte) value; array[offset + 1] = (byte) (value >> 8); array[offset + 2] = (byte) (value >> 16); array[offset + 3] = (byte) (value >> 24); array[offset + 4] = (byte) (value >> 32); array[offset + 5] = (byte) (value >> 40); array[offset + 6] = (byte) (value >> 48); array[offset + 7] = (byte) (value >> 56); } private AbstractStringBuilder appendNull() { int count = this.count; ensureCapacityInternal(count + 4); byte[] val = this.value; if (isLatin1()) { val[count ] = 'n'; val[count + 1] = 'u'; val[count + 2] = 'l'; val[count + 3] = 'l'; } else { setLong(val, count, NULL_UTF16); } this.count = count + 4; return this; } public AbstractStringBuilder append(boolean b) { int count = this.count; int spaceNeeded = count + (b ? 4 : 5); ensureCapacityInternal(spaceNeeded); byte[] val = this.value; if (isLatin1()) { if (b) { val[count ] = 't'; val[count + 1] = 'r'; val[count + 2] = 'u'; val[count + 3] = 'e'; } else { val[count ] = 'f'; val[count + 1] = 'a'; val[count + 2] = 'l'; val[count + 3] = 's'; val[count + 4] = 'e'; } } else { setLong(val, count, b ? TRUE_UTF16 : FALS_UTF16); if (!b) { StringUTF16.putChar(val, count + 4, 'e'); } } this.count = spaceNeeded; return this; } } The getLong/setLong methods here can be optimized and merged by C2. Maybe we need a public class that does not use Unsafe to implement these methods. They are needed in many places. Maybe it is appropriate to improve them based on ByteArray/ByteArrayLittleEndian. 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2159458425 From dlong at openjdk.org Mon Jun 10 23:40:12 2024 From: dlong at openjdk.org (Dean Long) Date: Mon, 10 Jun 2024 23:40:12 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 12:12:58 GMT, Shaojin Wen wrote: > After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. > > This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. From a compiler point of view, it might be better to enhance the 8318446 optimization so that it recognizes code patterns that use increment. From a libraries point of view, this proposed change probably needs some comments, so it doesn't get accidentally changed in the future in a way that breaks the optimization. How would we prevent that? The situation seems fragile to me. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2159497549 From redestad at openjdk.org Mon Jun 10 23:52:20 2024 From: redestad at openjdk.org (Claes Redestad) Date: Mon, 10 Jun 2024 23:52:20 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: Message-ID: <_I5gARYyMcHy0DQVI8GnU9teGHji53iythE75m1XCmw=.d16ab969-2a5a-4e38-b628-0218db353950@github.com> On Mon, 10 Jun 2024 12:12:58 GMT, Shaojin Wen wrote: > After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. > > This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. Whether increments are handled or not, I think the `StringUTF16.putChar` intrinsic might be getting in the way here and prevent merging consecutive "char" stores into 4 or 8 byte stores. 
It would be interesting to get numbers for the `putChar` version with the intrinsic disabled (`-XX:+UnlockDiagnosticVMOptions -XX:DisableIntrinsic=_putCharStringU`). ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2159509806 From duke at openjdk.org Tue Jun 11 02:23:36 2024 From: duke at openjdk.org (MaxXing) Date: Tue, 11 Jun 2024 02:23:36 GMT Subject: RFR: 8333334: C2: Make result of `Node::dominates` more precise to enhance scalar replacement [v3] In-Reply-To: References: Message-ID: <91EuDDDAZwL7PXXD3FBIV1FGwErjydcgIXM3vP7kQpY=.692ec6dc-fe66-4d9f-930a-e782b7aabf29@github.com> > This patch changes the algorithm of `Node::dominates` to make the result more precise, and allows the iterators of `ConcurrentHashMap` to be scalar replaced. > > The previous algorithm will return a conservative result when encountering a dead control flow, and only try the first two input paths of a multi-input Region node, which may prevent the scalar replacement in some cases. > > For example, with G1 GC enabled, C2 generates GC barriers for `ConcurrentHashMap` iteration operations at some early phases, and then eliminates them in a later IGVN, but `LoadNode` is also idealized in the same IGVN. This causes `LoadNode::Ideal` to see some dead barrier control flows, and refuse to split some instance field loads through Phi due to the conservative result of `Node::dominates`, and thus the scalar replacement can not be applied to iterators in the later macro elimination phase. > > This patch allows `Node::dominates` to try other paths of the last multi-input Region node when the first path is dead, and makes `ConcurrentHashMap` iteration ~30% faster: > > > Benchmark (nkeys) Mode Cnt Score Error Units > Maps.testConcurrentHashMapIterators 10000 avgt 15 414099.085 ? 33230.945 ns/op # baseline > Maps.testConcurrentHashMapIterators 10000 avgt 15 315490.281 ? 3037.056 ns/op # patch > > > Testing: tier1-4. 
MaxXing has updated the pull request incrementally with one additional commit since the last revision: Add IR test and update copyright. ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19496/files - new: https://git.openjdk.org/jdk/pull/19496/files/b5db38dc..f2e5506e Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19496&range=02 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19496&range=01-02 Stats: 116 lines in 2 files changed: 115 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19496.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19496/head:pull/19496 PR: https://git.openjdk.org/jdk/pull/19496 From duke at openjdk.org Tue Jun 11 02:34:15 2024 From: duke at openjdk.org (MaxXing) Date: Tue, 11 Jun 2024 02:34:15 GMT Subject: RFR: 8333334: C2: Make result of `Node::dominates` more precise to enhance scalar replacement [v3] In-Reply-To: <91EuDDDAZwL7PXXD3FBIV1FGwErjydcgIXM3vP7kQpY=.692ec6dc-fe66-4d9f-930a-e782b7aabf29@github.com> References: <91EuDDDAZwL7PXXD3FBIV1FGwErjydcgIXM3vP7kQpY=.692ec6dc-fe66-4d9f-930a-e782b7aabf29@github.com> Message-ID: On Tue, 11 Jun 2024 02:23:36 GMT, MaxXing wrote: >> This patch changes the algorithm of `Node::dominates` to make the result more precise, and allows the iterators of `ConcurrentHashMap` to be scalar replaced. >> >> The previous algorithm will return a conservative result when encountering a dead control flow, and only try the first two input paths of a multi-input Region node, which may prevent the scalar replacement in some cases. >> >> For example, with G1 GC enabled, C2 generates GC barriers for `ConcurrentHashMap` iteration operations at some early phases, and then eliminates them in a later IGVN, but `LoadNode` is also idealized in the same IGVN. 
This causes `LoadNode::Ideal` to see some dead barrier control flows, and refuse to split some instance field loads through Phi due to the conservative result of `Node::dominates`, and thus the scalar replacement can not be applied to iterators in the later macro elimination phase. >> >> This patch allows `Node::dominates` to try other paths of the last multi-input Region node when the first path is dead, and makes `ConcurrentHashMap` iteration ~30% faster: >> >> >> Benchmark (nkeys) Mode Cnt Score Error Units >> Maps.testConcurrentHashMapIterators 10000 avgt 15 414099.085 ± 33230.945 ns/op # baseline >> Maps.testConcurrentHashMapIterators 10000 avgt 15 315490.281 ± 3037.056 ns/op # patch >> >> >> Testing: tier1-4. > > MaxXing has updated the pull request incrementally with one additional commit since the last revision: > > Add IR test and update copyright. Hi, I added an IR test to verify the optimization, now this patch is ready for review. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19496#issuecomment-2159655636 From duke at openjdk.org Tue Jun 11 05:25:39 2024 From: duke at openjdk.org (Shaojin Wen) Date: Tue, 11 Jun 2024 05:25:39 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null [v2] In-Reply-To: References: Message-ID: > After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. > > This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. 
Shaojin Wen has updated the pull request incrementally with one additional commit since the last revision: optimize to combining values into larger stores ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19626/files - new: https://git.openjdk.org/jdk/pull/19626/files/5e815b63..0cbaa5ac Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19626&range=01 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19626&range=00-01 Stats: 16 lines in 1 file changed: 7 ins; 0 del; 9 mod Patch: https://git.openjdk.org/jdk/pull/19626.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19626/head:pull/19626 PR: https://git.openjdk.org/jdk/pull/19626 From duke at openjdk.org Tue Jun 11 05:42:27 2024 From: duke at openjdk.org (Shaojin Wen) Date: Tue, 11 Jun 2024 05:42:27 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null [v3] In-Reply-To: References: Message-ID: > After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. > > This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. 
Shaojin Wen has updated the pull request incrementally with one additional commit since the last revision: bug fix ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19626/files - new: https://git.openjdk.org/jdk/pull/19626/files/0cbaa5ac..f96cde4e Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19626&range=02 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19626&range=01-02 Stats: 12 lines in 1 file changed: 2 ins; 0 del; 10 mod Patch: https://git.openjdk.org/jdk/pull/19626.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19626/head:pull/19626 PR: https://git.openjdk.org/jdk/pull/19626 From rehn at openjdk.org Tue Jun 11 05:51:18 2024 From: rehn at openjdk.org (Robbin Ehn) Date: Tue, 11 Jun 2024 05:51:18 GMT Subject: Integrated: 8333649: Allow different NativeCall encodings In-Reply-To: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> References: <9VJJ9-VcXWB-IRB-KVPyjDPYdfOif-SNvgDIfJcyfVI=.3e8ac6c7-82c1-46b5-93bd-c50a07888ca4@github.com> Message-ID: <2yyrCmd4-Syi8ByhfPV8zOvwLzintbbVs3B5ccEoH30=.cf300509-a7bf-4d2a-8a9f-3a082c94e78b@github.com> On Wed, 5 Jun 2024 12:35:47 GMT, Robbin Ehn wrote: > Hi all, please consider! > > We want to have different selectable NativeCalls. > These are not the same size, shared code should query instead of using the enum directly. > > Sanity build and tested RV/x86, hoping GHA will catch anything else. > > Thanks, Robbin This pull request has now been integrated. 
Changeset: 4d6064a7 Author: Robbin Ehn URL: https://git.openjdk.org/jdk/commit/4d6064a76003addf38e6eb6b925dad8043581768 Stats: 12 lines in 9 files changed: 9 ins; 0 del; 3 mod 8333649: Allow different NativeCall encodings Reviewed-by: kvn, mli ------------- PR: https://git.openjdk.org/jdk/pull/19556 From epeter at openjdk.org Tue Jun 11 06:30:13 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 11 Jun 2024 06:30:13 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: <9_OMKYvdx-xW6KC3sKovIYawgaRZfkKyvU97jcuskzM=.484f62c9-521c-47b5-a68a-6abc89c0e768@github.com> Message-ID: On Mon, 10 Jun 2024 23:10:05 GMT, Shaojin Wen wrote: >> @wenshao >>> I think the performance of the Unsafe branch may be the best data for the C2 optimizer. @eme64 can help me see if C2 can do it? >> >> Have you tried to see if the optimization actually was done/taken? You can use the `TraceMergeStores,` flag. Can you present the generated assembly code of the benchmarks, and explain the difference based on the generated assembly code? You can run JMH penchmarks with `perf`. These two blogs may help you: >> >> http://psy-lob-saw.blogspot.com/2015/07/jmh-perfasm.html >> https://shipilev.net/blog/2016/arrays-wisdom-ancients/#_meet_jmh_prof_perfasm >> >> @liach I don't think it makes a difference if it is `int` or `byte` constants. Or what exactly is the code change you are proposing? > > @eme64 It seems that when the following code uses StringUTF16.putChar, C2's optimization is not as good as the manual merging and storage effect. > > class AbstractStringBuilder { > private AbstractStringBuilder appendNull() { > // ... > StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l'); > // ... > } > > public AbstractStringBuilder append(boolean b) { > // ... > StringUTF16.putCharsAt(val, count, 't', 'r', 'u', 'e'); > // ... > StringUTF16.putCharsAt(val, count, 'f', 'a', 'l', 's', 'e'); > // ... 
> } > } > > class StringUTF16 { > public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) { > putChar(value, i , c1); > putChar(value, i + 1, c2); > putChar(value, i + 2, c3); > putChar(value, i + 3, c4); > } > > @IntrinsicCandidate > // intrinsic performs no bounds checks > static void putChar(byte[] val, int index, int c) { > assert index >= 0 && index < length(val) : "Trusted caller missed bounds check"; > index <<= 1; > val[index++] = (byte)(c >> HI_BYTE_SHIFT); > val[index] = (byte)(c >> LO_BYTE_SHIFT); > } > } > > > The code for manually merging storage is as follows, without using Unsafe: > > class AbstractStringBuilder { > static final long NULL_UTF16; > static final long TRUE_UTF16; > static final long FALS_UTF16; > > static { > byte[] bytes = new byte[8]; > > StringUTF16.putCharsAt(bytes, 0, 'n', 'u', 'l', 'l'); > NULL_UTF16 = getLong(bytes, 0); > > StringUTF16.putCharsAt(bytes, 0, 't', 'r', 'u', 'e'); > TRUE_UTF16 = getLong(bytes, 0); > > StringUTF16.putCharsAt(bytes, 0, 'f', 'a', 'l', 's'); > FALS_UTF16 = getLong(bytes, 0); > } > > private static long getLong(byte[] bytes, int offset) { > return (((long)bytes[offset ] & 0xff) ) | > (((long)bytes[offset + 1] & 0xff) << 8) | > (((long)bytes[offset + 2] & 0xff) << 16) | > (((long)bytes[offset + 3] & 0xff) << 24) | > (((long)bytes[offset + 4] & 0xff) << 32) | > (((long)bytes[offset + 5] & 0xff) << 40) | > (((long)bytes[offset + 6] & 0xff) << 48) | > (((long)bytes[offset + 7] & 0xff) << 56); > } > > private static void setLong(byte[] array, int offset, long value) { > array[offset] = (byte) value; > array[offset + 1] = (byte) (value >> 8); > array[offset + 2] = (byte) (value >> 16); > array[offset + 3] = (byte) (value >> 24); > arra... @wenshao > @eme64 It seems that when the following code uses StringUTF16.putChar, C2's optimization is not as good as the manual merging and storage effect. 
As I asked above, you will need to provide some evidence / generated assembly / perf data, and logs from `TraceMergeStores`. I currently do not have time to produce these myself, and I think they would be crucial to determine where the missing performance has gone. See my earlier comment: https://github.com/openjdk/jdk/pull/19626#issuecomment-2158533469 And please also try @cl4es advice here: https://github.com/openjdk/jdk/pull/19626#issuecomment-2159509806 And sure, maybe you need some public API for setting multiple bytes at once, which the `MergeStores` optimization can optimize. I'm a C2 engineer, so I leave that up to the library folks ;) ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2159895757 From duke at openjdk.org Tue Jun 11 07:18:46 2024 From: duke at openjdk.org (Daniel Skantz) Date: Tue, 11 Jun 2024 07:18:46 GMT Subject: RFR: 8330157: C2: Add a stress flag for bailouts Message-ID: This patch adds a diagnostic/stress flag for C2 bailouts. It can be used to support testing of existing bailouts to prevent issues like [JDK-8318445](https://bugs.openjdk.org/browse/JDK-8318445), and can test for issues only seen at runtime such as [JDK-8326376](https://bugs.openjdk.org/browse/JDK-8326376). It can also be useful if we want to add more bailouts ([JDK-8318900](https://bugs.openjdk.org/browse/JDK-8318900)). We check two invariants. a) Bailouts should be successful starting from any given `failing()` check. b) The VM should not record a bailout when one is pending (in which case we have continued to optimize for too long). a), b) are checked by randomly starting a bailout at calls to `failing()` with a user-given probability. The added flag should not have any effect in debug mode. Testing: T1-5, with flag and without it. We want to check that this does not cause any test failures without the flag set, and no unexpected failures with it. 
Tests failing because of timeout or because an error is printed to output when compilation fails can be expected in some cases. ------------- Commit messages: - undo raised initialization; skip if stress seed is not initialized yet. - remove unused var - revert last - lower stress seed init again - make test quicker - remove warning check entirely - import - fix test condition - whitespace - add a basic test - ... and 6 more: https://git.openjdk.org/jdk/compare/83b34410...9245b2c6 Changes: https://git.openjdk.org/jdk/pull/19646/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19646&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8330157 Stats: 146 lines in 12 files changed: 125 ins; 0 del; 21 mod Patch: https://git.openjdk.org/jdk/pull/19646.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19646/head:pull/19646 PR: https://git.openjdk.org/jdk/pull/19646 From bkilambi at openjdk.org Tue Jun 11 07:20:31 2024 From: bkilambi at openjdk.org (Bhavana Kilambi) Date: Tue, 11 Jun 2024 07:20:31 GMT Subject: RFR: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction [v8] In-Reply-To: References: <8-_t7nWbR9gZ2_QkfFNuf5M0Q4PMkKJKgwS3ZbHcCxI=.32dc4f11-dec5-468d-afc8-3b4dae285dcb@github.com> <2y-Ag6MxVDJfYl6kM0FYjQA-kzSCekUgAMWAZmkECyQ=.2a2a0a8e-fc67-42a4-bd67-b4ae3b60bcea@github.com> <9nJQN4zoX1YRLxZhlwogJqwvsRnaJLvYFjcpe9FjD3A=.226310ab-016d-4ee7-a3ef-3d849012cd25@github.com> Message-ID: On Fri, 7 Jun 2024 13:27:28 GMT, Emanuel Peter wrote: >> Hi @eme64 , have updated patch with a merge with master. Can you please run testing? Thank you! > > @Bhavana-Kilambi thanks for the merge, just launched it before you pinged me ;) > Please ping me again after the weekend for the results! Hi @eme64, would you be able to sponsor please? 
------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2159965043 From epeter at openjdk.org Tue Jun 11 07:20:35 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 11 Jun 2024 07:20:35 GMT Subject: RFR: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction [v12] In-Reply-To: References: Message-ID: On Fri, 7 Jun 2024 13:30:35 GMT, Bhavana Kilambi wrote: >> Floating-point addition is non-associative, that is, adding floating-point elements in arbitrary order may give a different value. In particular, the Vector API does not define the order of reduction intentionally, which allows platforms to generate more efficient code [1]. So that needs a node to represent non strictly-ordered add-reduction for floating-point type in C2. >> >> To avoid introducing new nodes, this patch adds a bool field in `AddReductionVF/D` to distinguish whether they require strict order. It also removes `UnorderedReductionNode` and adds a virtual function `bool requires_strict_order()` in `ReductionNode`. Besides `AddReductionVF/D`, other reduction nodes' `requires_strict_order()` have a fixed value. >> >> With this patch, Vector API would always generate non strictly-ordered `AddReductionVF/D` on SVE machines with vector length <= 16B as it is more beneficial to generate non-strictly ordered instructions on such machines compared to strictly ordered ones. >> >> [AArch64] >> On Neon, non strictly-ordered `AddReductionVF/D` cannot be generated. Auto-vectorization has already banned these nodes in JDK-8275275 [2]. >> >> This patch adds matching rules for non strictly-ordered `AddReductionVF/D`. >> >> No effects on other platforms. >> >> [Performance] >> FloatMaxVector.ADDLanes [3] measures the performance of add reduction for floating-point type. With this patch, it improves ~3x on my SVE machine (128-bit). 
>> >> ADDLanes >> >> Benchmark Before After Unit >> FloatMaxVector.ADDLanes 1789.513 5264.226 ops/ms >> >> >> Final code is as below: >> >> Before: >> ` fadda z17.s, p7/m, z17.s, z16.s >> ` >> After: >> >> faddp v17.4s, v21.4s, v21.4s >> faddp s18, v17.2s >> fadd s18, s18, s19 >> >> >> >> >> [Test] >> Full jtreg passed on AArch64 and x86. >> >> [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2529 >> [2] https://bugs.openjdk.org/browse/JDK-8275275 >> [3] https://github.com/openjdk/panama-vector/blob/vectorIntrinsics/test/micro/org/openjdk/bench/jdk/incubator/vector/operation/FloatMaxVector.java#L316 > > Bhavana Kilambi has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains 12 additional commits since the last revision: > > - Merge with master > - Make changes in IR rules for JTREG tests > - Modify JTREG IR rules and some style/format changes > - Add dump_spec and JTREG IR tests for Add/Mul Reduction Nodes > - Merge master > - Adjust format for the backend rules changed in previous commit > - Address some more review comments > - Revert to previous indentation > - Add comments, revert to requires_strict_order and other minor changes > - Naming changes: replace strict/non-strict with more technical terms > - ... and 2 more: https://git.openjdk.org/jdk/compare/d7e21a65...35e6258d Let's ship it! ? 
------------- PR Comment: https://git.openjdk.org/jdk/pull/18034#issuecomment-2159968567 From bkilambi at openjdk.org Tue Jun 11 07:20:37 2024 From: bkilambi at openjdk.org (Bhavana Kilambi) Date: Tue, 11 Jun 2024 07:20:37 GMT Subject: Integrated: 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction In-Reply-To: References: Message-ID: On Tue, 27 Feb 2024 21:24:46 GMT, Bhavana Kilambi wrote: > Floating-point addition is non-associative, that is adding floating-point elements in arbitrary order may get different value. Specially, Vector API does not define the order of reduction intentionally, which allows platforms to generate more efficient codes [1]. So that needs a node to represent non strictly-ordered add-reduction for floating-point type in C2. > > To avoid introducing new nodes, this patch adds a bool field in `AddReductionVF/D` to distinguish whether they require strict order. It also removes `UnorderedReductionNode` and adds a virtual function `bool requires_strict_order()` in `ReductionNode`. Besides `AddReductionVF/D`, other reduction nodes' `requires_strict_order()` have a fixed value. > > With this patch, Vector API would always generate non strictly-ordered `AddReductionVF/D' on SVE machines with vector length <= 16B as it is more beneficial to generate non-strictly ordered instructions on such machines compared to strictly ordered ones. > > [AArch64] > On Neon, non strictly-ordered `AddReductionVF/D` cannot be generated. Auto-vectorization has already banned these nodes in JDK-8275275 [2]. > > This patch adds matching rules for non strictly-ordered `AddReductionVF/D`. > > No effects on other platforms. > > [Performance] > FloatMaxVector.ADDLanes [3] measures the performance of add reduction for floating-point type. With this patch, it improves ~3x on my SVE machine (128-bit). 
> > ADDLanes > > Benchmark Before After Unit > FloatMaxVector.ADDLanes 1789.513 5264.226 ops/ms > > > Final code is as below: > > Before: > ` fadda z17.s, p7/m, z17.s, z16.s > ` > After: > > faddp v17.4s, v21.4s, v21.4s > faddp s18, v17.2s > fadd s18, s18, s19 > > > > > [Test] > Full jtreg passed on AArch64 and x86. > > [1] https://github.com/openjdk/jdk/blob/master/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java#L2529 > [2] https://bugs.openjdk.org/browse/JDK-8275275 > [3] https://github.com/openjdk/panama-vector/blob/vectorIntrinsics/test/micro/org/openjdk/bench/jdk/incubator/vector/operation/FloatMaxVector.java#L316 This pull request has now been integrated. Changeset: 0e4d4a0c Author: Bhavana Kilambi Committer: Emanuel Peter URL: https://git.openjdk.org/jdk/commit/0e4d4a0c3150c01d927bd69cc578cea053cf16b3 Stats: 587 lines in 10 files changed: 469 ins; 12 del; 106 mod 8320725: AArch64: C2: Add "requires_strict_order" flag for floating-point add and mul reduction Co-authored-by: Eric Liu Reviewed-by: gli, epeter, aph ------------- PR: https://git.openjdk.org/jdk/pull/18034 From duke at openjdk.org Tue Jun 11 08:06:14 2024 From: duke at openjdk.org (Ferenc Rakoczi) Date: Tue, 11 Jun 2024 08:06:14 GMT Subject: RFR: 8333867: SHA3 performance can be improved In-Reply-To: References: Message-ID: On Mon, 10 Jun 2024 21:09:16 GMT, Ferenc Rakoczi wrote: >> src/java.base/share/classes/sun/security/provider/SHA3.java line 100: >> >>> 98: b2lLittle(b, ofs, longBuf, 0, blockSize); >>> 99: for (int i = 0; i < blockSize / 8; i++) { >>> 100: state[i] ^= longBuf[i]; >> >> Clever. So the intrinsic (C2 code) still generates code corresponding original loop with `byte b[]` array. This will be confusing. It will also slowdown execution in Interpreter so - additional array copy. >> >> New code also assumes that `buffer.length == blockSize` and `(buffer.length % 8) == 0`. I hope there is some assertions/checks in java code to verify that. 
>> >> Someone from core-libs has to review this. > > Well, the intrinsic function treats the input and state as long arrays anyways, and so it only works on little endian architectures, where the conversion is a no-op. There is no additional array copy; this b2lLittle() call used to be in the keccak() method (along with the conversion back to byte array), the point of this whole change is that only one of these conversions should be done with every keccak() call (an additional benefit is that the xor and the corresponding loads+store are done on longs, not on bytes). Oh, and about the length: buffer is allocated in the constructor of the parent class (DigestBase) like this: buffer = new byte[blockSize]; Here blockSize is one of { 72, 104, 136, 144, 168 }, so divisible by 8. buffer.length was used before probably because blockSize was declared private in DigestBase. I made it protected, because in my opinion it is easier to read the code this way. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19632#discussion_r1634390741 From duke at openjdk.org Tue Jun 11 09:19:13 2024 From: duke at openjdk.org (Shaojin Wen) Date: Tue, 11 Jun 2024 09:19:13 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: <9_OMKYvdx-xW6KC3sKovIYawgaRZfkKyvU97jcuskzM=.484f62c9-521c-47b5-a68a-6abc89c0e768@github.com> Message-ID: <3Glk5fqUv6mOnaBeQZIMctUDhZzASCQEf4VNpTaEfvE=.ee098533-80c0-4bc1-8ae1-c8551b2c9ac6@github.com> On Tue, 11 Jun 2024 06:27:13 GMT, Emanuel Peter wrote: >> @eme64 It seems that when the following code uses StringUTF16.putChar, C2's optimization is not as good as the manual merging and storage effect. >> >> class AbstractStringBuilder { >> private AbstractStringBuilder appendNull() { >> // ... >> StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l'); >> // ... >> } >> >> public AbstractStringBuilder append(boolean b) { >> // ... >> StringUTF16.putCharsAt(val, count, 't', 'r', 'u', 'e'); >> // ... 
>> StringUTF16.putCharsAt(val, count, 'f', 'a', 'l', 's', 'e'); >> // ... >> } >> } >> >> class StringUTF16 { >> public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) { >> putChar(value, i , c1); >> putChar(value, i + 1, c2); >> putChar(value, i + 2, c3); >> putChar(value, i + 3, c4); >> } >> >> @IntrinsicCandidate >> // intrinsic performs no bounds checks >> static void putChar(byte[] val, int index, int c) { >> assert index >= 0 && index < length(val) : "Trusted caller missed bounds check"; >> index <<= 1; >> val[index++] = (byte)(c >> HI_BYTE_SHIFT); >> val[index] = (byte)(c >> LO_BYTE_SHIFT); >> } >> } >> >> >> The code for manually merging storage is as follows, without using Unsafe: >> >> class AbstractStringBuilder { >> static final long NULL_UTF16; >> static final long TRUE_UTF16; >> static final long FALS_UTF16; >> >> static { >> byte[] bytes = new byte[8]; >> >> StringUTF16.putCharsAt(bytes, 0, 'n', 'u', 'l', 'l'); >> NULL_UTF16 = getLong(bytes, 0); >> >> StringUTF16.putCharsAt(bytes, 0, 't', 'r', 'u', 'e'); >> TRUE_UTF16 = getLong(bytes, 0); >> >> StringUTF16.putCharsAt(bytes, 0, 'f', 'a', 'l', 's'); >> FALS_UTF16 = getLong(bytes, 0); >> } >> >> private static long getLong(byte[] bytes, int offset) { >> return (((long)bytes[offset ] & 0xff) ) | >> (((long)bytes[offset + 1] & 0xff) << 8) | >> (((long)bytes[offset + 2] & 0xff) << 16) | >> (((long)bytes[offset + 3] & 0xff) << 24) | >> (((long)bytes[offset + 4] & 0xff) << 32) | >> (((long)bytes[offset + 5] & 0xff) << 40) | >> (((long)bytes[offset + 6] & 0xff) << 48) | >> (((long)bytes[offset + 7] & 0xff) << 56); >> } >> >> private static void setLong(byte[] array, int offset, long value) { >> array[offset] = (byte) value; >> array[offse... > > @wenshao >> @eme64 It seems that when the following code uses StringUTF16.putChar, C2's optimization is not as good as the manual merging and storage effect. 
> > As I asked above, you will need to provide some evidence / generated assembly / perf data, and logs from `TraceMergeStores`. I currently do not have time to produce these myself, and I think they would be crucial to determine where the missing performance has gone. See my earlier comment: > https://github.com/openjdk/jdk/pull/19626#issuecomment-2158533469 > > And please also try @cl4es advide here: > https://github.com/openjdk/jdk/pull/19626#issuecomment-2159509806 > > And sure, maybe you need some public API for setting multiple bytes at once, which the `MergeStores` optimization can optimize. I'm a C2 engineer, so I leave that up to the library folks ;) @eme64 The assembly information is below, can you take a look and see if it can help you diagnose the problem? * JavaCode class AbstractStringBuilder { private AbstractStringBuilder appendNull() { int count = this.count; ensureCapacityInternal(count + 4); byte[] val = this.value; if (isLatin1()) { val[count ] = 'n'; val[count + 1] = 'u'; val[count + 2] = 'l'; val[count + 3] = 'l'; } else { StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l'); } this.count = count + 4; return this; } } class StringUTF16 { public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) { putChar(value, i , c1); putChar(value, i + 1, c2); putChar(value, i + 2, c3); putChar(value, i + 3, c4); } } * Apple M1 StringBuilder.appendNull PrintAssembly /Users/wenshao/Work/git/jdk/build/macosx-aarch64-server-release/jdk/bin/java -XX:+UnlockDiagnosticVMOptions -XX:+PrintAssembly -XX:CompileCommand=compileonly,*StringBuilder.appendNull -XX:-TieredCompilation -XX:TieredStopAtLevel=4 -javaagent:/Applications/IntelliJ IDEA.app/Contents/lib/idea_rt.jar=61041:/Applications/IntelliJ IDEA.app/Contents/bin -Dfile.encoding=UTF-8 -Dsun.stdout.encoding=UTF-8 -Dsun.stderr.encoding=UTF-8 .... 
Compiled method (n/a) 96 1 n java.lang.invoke.MethodHandle::linkToStatic(LLLLLLL)L (native) total in heap [0x0000000102efba08,0x0000000102efbb20] = 280 relocation [0x0000000102efbae0,0x0000000102efbae8] = 8 main code [0x0000000102efbb00,0x0000000102efbb20] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c3e1c80} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/Object' # parm4: c_rarg5:c_rarg5 = 'java/lang/Object' # parm5: c_rarg6:c_rarg6 = 'java/lang/Object' # parm6: c_rarg7:c_rarg7 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102efbb00: nop 0x0000000102efbb04: ldr w12, [x7, #36] 0x0000000102efbb08: lsl x12, x12, #3 0x0000000102efbb0c: ldr x12, [x12, #16] 0x0000000102efbb10: cbz x12, 0x0000000102efbb1c 0x0000000102efbb14: ldr x8, [x12, #64] 0x0000000102efbb18: br x8 0x0000000102efbb1c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 102 2 n java.lang.invoke.MethodHandle::linkToStatic(LLL)L (native) total in heap [0x0000000102f01108,0x0000000102f01220] = 280 relocation [0x0000000102f011e0,0x0000000102f011e8] = 8 main code [0x0000000102f01200,0x0000000102f01220] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified 
Entry Point] # {method} {0x000000011c432368} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f01200: nop 0x0000000102f01204: ldr w12, [x3, #36] 0x0000000102f01208: lsl x12, x12, #3 0x0000000102f0120c: ldr x12, [x12, #16] 0x0000000102f01210: cbz x12, 0x0000000102f0121c 0x0000000102f01214: ldr x8, [x12, #64] 0x0000000102f01218: br x8 0x0000000102f0121c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 103 3 n java.lang.invoke.MethodHandle::invokeBasic(LLLLLL)L (native) total in heap [0x0000000102f01408,0x0000000102f01530] = 296 relocation [0x0000000102f014e0,0x0000000102f014e8] = 8 main code [0x0000000102f01500,0x0000000102f01530] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c43ace8} 'invokeBasic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = 'java/lang/Object' # parm1: c_rarg3:c_rarg3 = 'java/lang/Object' # parm2: c_rarg4:c_rarg4 = 'java/lang/Object' # parm3: c_rarg5:c_rarg5 = 'java/lang/Object' # parm4: c_rarg6:c_rarg6 = 'java/lang/Object' # parm5: c_rarg7:c_rarg7 = 'java/lang/Object' # [sp+0x0] (sp of caller) 0x0000000102f01500: nop 0x0000000102f01504: ldr w12, [x1, #20] 0x0000000102f01508: lsl x12, x12, #3 0x0000000102f0150c: ldr w12, [x12, #40] 
0x0000000102f01510: lsl x12, x12, #3 0x0000000102f01514: ldr w12, [x12, #36] 0x0000000102f01518: lsl x12, x12, #3 0x0000000102f0151c: ldr x12, [x12, #16] 0x0000000102f01520: cbz x12, 0x0000000102f0152c 0x0000000102f01524: ldr x8, [x12, #64] 0x0000000102f01528: br x8 0x0000000102f0152c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 104 4 n java.lang.invoke.MethodHandle::linkToSpecial(LLLLLLLL)L (native) total in heap [0x0000000102f01708,0x0000000102f01828] = 288 relocation [0x0000000102f017e0,0x0000000102f017e8] = 8 main code [0x0000000102f01800,0x0000000102f01824] = 36 stub code [0x0000000102f01824,0x0000000102f01828] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c43ae60} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/Object' # parm4: c_rarg5:c_rarg5 = 'java/lang/Object' # parm5: c_rarg6:c_rarg6 = 'java/lang/Object' # parm6: c_rarg7:c_rarg7 = 'java/lang/Object' # parm7: c_rarg0:c_rarg0 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f01800: nop 0x0000000102f01804: ldr xzr, [x1] 0x0000000102f01808: ldr w12, [x0, #36] 0x0000000102f0180c: lsl x12, x12, #3 0x0000000102f01810: ldr x12, [x12, #16] 0x0000000102f01814: cbz x12, 0x0000000102f01820 0x0000000102f01818: ldr x8, [x12, #64] 0x0000000102f0181c: br x8 0x0000000102f01820: b 0x0000000102e9e080 ; 
{runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f01824: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 107 5 n java.lang.invoke.MethodHandle::linkToSpecial(LLLL)V (native) total in heap [0x0000000102f01d88,0x0000000102f01ea8] = 288 relocation [0x0000000102f01e60,0x0000000102f01e68] = 8 main code [0x0000000102f01e80,0x0000000102f01ea4] = 36 stub code [0x0000000102f01ea4,0x0000000102f01ea8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c44f7a0} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)V' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f01e80: nop 0x0000000102f01e84: ldr xzr, [x1] 0x0000000102f01e88: ldr w12, [x4, #36] 0x0000000102f01e8c: lsl x12, x12, #3 0x0000000102f01e90: ldr x12, [x12, #16] 0x0000000102f01e94: cbz x12, 0x0000000102f01ea0 0x0000000102f01e98: ldr x8, [x12, #64] 0x0000000102f01e9c: br x8 0x0000000102f01ea0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f01ea4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 108 6 n java.lang.invoke.MethodHandle::invokeBasic(LL)L (native) total in heap [0x0000000102f02408,0x0000000102f02530] = 296 relocation [0x0000000102f024e0,0x0000000102f024e8] = 8 main code [0x0000000102f02500,0x0000000102f02530] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool 
(empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c450b68} 'invokeBasic' '(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = 'java/lang/Object' # parm1: c_rarg3:c_rarg3 = 'java/lang/Object' # [sp+0x0] (sp of caller) 0x0000000102f02500: nop 0x0000000102f02504: ldr w12, [x1, #20] 0x0000000102f02508: lsl x12, x12, #3 0x0000000102f0250c: ldr w12, [x12, #40] 0x0000000102f02510: lsl x12, x12, #3 0x0000000102f02514: ldr w12, [x12, #36] 0x0000000102f02518: lsl x12, x12, #3 0x0000000102f0251c: ldr x12, [x12, #16] 0x0000000102f02520: cbz x12, 0x0000000102f0252c 0x0000000102f02524: ldr x8, [x12, #64] 0x0000000102f02528: br x8 0x0000000102f0252c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 109 7 n java.lang.invoke.MethodHandle::linkToSpecial(LLLL)L (native) total in heap [0x0000000102f02708,0x0000000102f02828] = 288 relocation [0x0000000102f027e0,0x0000000102f027e8] = 8 main code [0x0000000102f02800,0x0000000102f02824] = 36 stub code [0x0000000102f02824,0x0000000102f02828] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c450c80} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f02800: nop 
0x0000000102f02804: ldr xzr, [x1] 0x0000000102f02808: ldr w12, [x4, #36] 0x0000000102f0280c: lsl x12, x12, #3 0x0000000102f02810: ldr x12, [x12, #16] 0x0000000102f02814: cbz x12, 0x0000000102f02820 0x0000000102f02818: ldr x8, [x12, #64] 0x0000000102f0281c: br x8 0x0000000102f02820: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f02824: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 117 8 n java.lang.invoke.MethodHandle::linkToSpecial(LLL)L (native) total in heap [0x0000000102f05088,0x0000000102f051a8] = 288 relocation [0x0000000102f05160,0x0000000102f05168] = 8 main code [0x0000000102f05180,0x0000000102f051a4] = 36 stub code [0x0000000102f051a4,0x0000000102f051a8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c4911f8} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f05180: nop 0x0000000102f05184: ldr xzr, [x1] 0x0000000102f05188: ldr w12, [x3, #36] 0x0000000102f0518c: lsl x12, x12, #3 0x0000000102f05190: ldr x12, [x12, #16] 0x0000000102f05194: cbz x12, 0x0000000102f051a0 0x0000000102f05198: ldr x8, [x12, #64] 0x0000000102f0519c: br x8 0x0000000102f051a0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f051a4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 123 9 n java.lang.invoke.MethodHandle::linkToStatic(LLIL)I (native) total in heap 
[0x0000000102f08b88,0x0000000102f08ca0] = 280 relocation [0x0000000102f08c60,0x0000000102f08c68] = 8 main code [0x0000000102f08c80,0x0000000102f08ca0] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c4c2f78} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;ILjava/lang/invoke/MemberName;)I' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3 = int # parm3: c_rarg4:c_rarg4 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f08c80: nop 0x0000000102f08c84: ldr w12, [x4, #36] 0x0000000102f08c88: lsl x12, x12, #3 0x0000000102f08c8c: ldr x12, [x12, #16] 0x0000000102f08c90: cbz x12, 0x0000000102f08c9c 0x0000000102f08c94: ldr x8, [x12, #64] 0x0000000102f08c98: br x8 0x0000000102f08c9c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 128 10 n java.lang.invoke.MethodHandle::linkToSpecial(LLL)V (native) total in heap [0x0000000102f09208,0x0000000102f09328] = 288 relocation [0x0000000102f092e0,0x0000000102f092e8] = 8 main code [0x0000000102f09300,0x0000000102f09324] = 36 stub code [0x0000000102f09324,0x0000000102f09328] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c4dbc20} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)V' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 
'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f09300: nop 0x0000000102f09304: ldr xzr, [x1] 0x0000000102f09308: ldr w12, [x3, #36] 0x0000000102f0930c: lsl x12, x12, #3 0x0000000102f09310: ldr x12, [x12, #16] 0x0000000102f09314: cbz x12, 0x0000000102f09320 0x0000000102f09318: ldr x8, [x12, #64] 0x0000000102f0931c: br x8 0x0000000102f09320: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f09324: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 129 11 n java.lang.invoke.MethodHandle::invokeBasic(L)L (native) total in heap [0x0000000102f09508,0x0000000102f09630] = 296 relocation [0x0000000102f095e0,0x0000000102f095e8] = 8 main code [0x0000000102f09600,0x0000000102f09630] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c4dc348} 'invokeBasic' '(Ljava/lang/Object;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = 'java/lang/Object' # [sp+0x0] (sp of caller) 0x0000000102f09600: nop 0x0000000102f09604: ldr w12, [x1, #20] 0x0000000102f09608: lsl x12, x12, #3 0x0000000102f0960c: ldr w12, [x12, #40] 0x0000000102f09610: lsl x12, x12, #3 0x0000000102f09614: ldr w12, [x12, #36] 0x0000000102f09618: lsl x12, x12, #3 0x0000000102f0961c: ldr x12, [x12, #16] 0x0000000102f09620: cbz x12, 0x0000000102f0962c 0x0000000102f09624: ldr x8, [x12, #64] 0x0000000102f09628: br x8 0x0000000102f0962c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 131 12 n java.lang.invoke.MethodHandle::linkToStatic(LL)L (native) total in 
heap [0x0000000102f09808,0x0000000102f09920] = 280 relocation [0x0000000102f098e0,0x0000000102f098e8] = 8 main code [0x0000000102f09900,0x0000000102f09920] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c4e4358} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f09900: nop 0x0000000102f09904: ldr w12, [x2, #36] 0x0000000102f09908: lsl x12, x12, #3 0x0000000102f0990c: ldr x12, [x12, #16] 0x0000000102f09910: cbz x12, 0x0000000102f0991c 0x0000000102f09914: ldr x8, [x12, #64] 0x0000000102f09918: br x8 0x0000000102f0991c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 206 13 n java.lang.invoke.MethodHandle::linkToInterface(LL)L (native) total in heap [0x0000000102f0b708,0x0000000102f0b868] = 352 relocation [0x0000000102f0b7e0,0x0000000102f0b7e8] = 8 main code [0x0000000102f0b800,0x0000000102f0b864] = 100 stub code [0x0000000102f0b864,0x0000000102f0b868] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c5a1548} 'linkToInterface' '(Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0b800: nop 0x0000000102f0b804: ldr w10, [x1, #8] 
0x0000000102f0b808: eor x10, x10, #0x700000000000 0x0000000102f0b80c: ldr w14, [x2, #24] 0x0000000102f0b810: lsl x14, x14, #3 0x0000000102f0b814: ldr x14, [x14, #16] 0x0000000102f0b818: ldr x12, [x2, #16] 0x0000000102f0b81c: ldr w11, [x10, #172] 0x0000000102f0b820: add x11, x10, x11, uxtx #3 0x0000000102f0b824: add x11, x11, #0x1d8 0x0000000102f0b828: add x10, x10, x12, uxtx #3 0x0000000102f0b82c: ldr x12, [x11] 0x0000000102f0b830: cmp x14, x12 0x0000000102f0b834: b.eq 0x0000000102f0b848 // b.none 0x0000000102f0b838: cbz x12, 0x0000000102f0b860 0x0000000102f0b83c: ldr x12, [x11, #16]! 0x0000000102f0b840: cmp x14, x12 0x0000000102f0b844: b.ne 0x0000000102f0b838 // b.any 0x0000000102f0b848: ldr w11, [x11, #8] 0x0000000102f0b84c: ldr x12, [x10, w11, uxtw] 0x0000000102f0b850: cbz x12, 0x0000000102f0b85c 0x0000000102f0b854: ldr x8, [x12, #64] 0x0000000102f0b858: br x8 0x0000000102f0b85c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} 0x0000000102f0b860: b 0x0000000102e9dd80 ; {runtime_call IncompatibleClassChangeError throw_exception} [Stub Code] 0x0000000102f0b864: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 210 14 n java.lang.invoke.MethodHandle::linkToStatic(L)L (native) total in heap [0x0000000102f0bd88,0x0000000102f0bea0] = 280 relocation [0x0000000102f0be60,0x0000000102f0be68] = 8 main code [0x0000000102f0be80,0x0000000102f0bea0] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c5c9ad0} 'linkToStatic' '(Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0be80: nop 0x0000000102f0be84: ldr w12, [x1, #36] 0x0000000102f0be88: lsl x12, 
x12, #3 0x0000000102f0be8c: ldr x12, [x12, #16] 0x0000000102f0be90: cbz x12, 0x0000000102f0be9c 0x0000000102f0be94: ldr x8, [x12, #64] 0x0000000102f0be98: br x8 0x0000000102f0be9c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 213 15 n java.lang.invoke.MethodHandle::linkToStatic(LLLL)L (native) total in heap [0x0000000102f0c788,0x0000000102f0c8a0] = 280 relocation [0x0000000102f0c860,0x0000000102f0c868] = 8 main code [0x0000000102f0c880,0x0000000102f0c8a0] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c5dd5e8} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0c880: nop 0x0000000102f0c884: ldr w12, [x4, #36] 0x0000000102f0c888: lsl x12, x12, #3 0x0000000102f0c88c: ldr x12, [x12, #16] 0x0000000102f0c890: cbz x12, 0x0000000102f0c89c 0x0000000102f0c894: ldr x8, [x12, #64] 0x0000000102f0c898: br x8 0x0000000102f0c89c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 214 16 n java.lang.invoke.MethodHandle::invokeBasic()L (native) total in heap [0x0000000102f0d188,0x0000000102f0d2b0] = 296 relocation [0x0000000102f0d260,0x0000000102f0d268] = 8 main code [0x0000000102f0d280,0x0000000102f0d2b0] = 48 [Disassembly] 
-------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c5e1620} 'invokeBasic' '()Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # [sp+0x0] (sp of caller) 0x0000000102f0d280: nop 0x0000000102f0d284: ldr w12, [x1, #20] 0x0000000102f0d288: lsl x12, x12, #3 0x0000000102f0d28c: ldr w12, [x12, #40] 0x0000000102f0d290: lsl x12, x12, #3 0x0000000102f0d294: ldr w12, [x12, #36] 0x0000000102f0d298: lsl x12, x12, #3 0x0000000102f0d29c: ldr x12, [x12, #16] 0x0000000102f0d2a0: cbz x12, 0x0000000102f0d2ac 0x0000000102f0d2a4: ldr x8, [x12, #64] 0x0000000102f0d2a8: br x8 0x0000000102f0d2ac: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 214 17 n java.lang.invoke.MethodHandle::linkToSpecial(LL)L (native) total in heap [0x0000000102f0d488,0x0000000102f0d5a8] = 288 relocation [0x0000000102f0d560,0x0000000102f0d568] = 8 main code [0x0000000102f0d580,0x0000000102f0d5a4] = 36 stub code [0x0000000102f0d5a4,0x0000000102f0d5a8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c5e1738} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0d580: nop 0x0000000102f0d584: ldr xzr, [x1] 0x0000000102f0d588: ldr w12, [x2, #36] 0x0000000102f0d58c: lsl x12, x12, #3 0x0000000102f0d590: ldr x12, [x12, #16] 0x0000000102f0d594: cbz x12, 0x0000000102f0d5a0 0x0000000102f0d598: 
ldr x8, [x12, #64] 0x0000000102f0d59c: br x8 0x0000000102f0d5a0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f0d5a4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 218 18 n java.lang.invoke.MethodHandle::linkToSpecial(LL)V (native) total in heap [0x0000000102f0d788,0x0000000102f0d8a8] = 288 relocation [0x0000000102f0d860,0x0000000102f0d868] = 8 main code [0x0000000102f0d880,0x0000000102f0d8a4] = 36 stub code [0x0000000102f0d8a4,0x0000000102f0d8a8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c5462f8} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/invoke/MemberName;)V' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0d880: nop 0x0000000102f0d884: ldr xzr, [x1] 0x0000000102f0d888: ldr w12, [x2, #36] 0x0000000102f0d88c: lsl x12, x12, #3 0x0000000102f0d890: ldr x12, [x12, #16] 0x0000000102f0d894: cbz x12, 0x0000000102f0d8a0 0x0000000102f0d898: ldr x8, [x12, #64] 0x0000000102f0d89c: br x8 0x0000000102f0d8a0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f0d8a4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 220 19 n java.lang.invoke.MethodHandle::linkToInterface(LLL)I (native) total in heap [0x0000000102f0da88,0x0000000102f0dbe8] = 352 relocation [0x0000000102f0db60,0x0000000102f0db68] = 8 main code [0x0000000102f0db80,0x0000000102f0dbe4] = 100 stub code [0x0000000102f0dbe4,0x0000000102f0dbe8] = 4 [Disassembly] 
-------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c5488b0} 'linkToInterface' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)I' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0db80: nop 0x0000000102f0db84: ldr w10, [x1, #8] 0x0000000102f0db88: eor x10, x10, #0x700000000000 0x0000000102f0db8c: ldr w14, [x3, #24] 0x0000000102f0db90: lsl x14, x14, #3 0x0000000102f0db94: ldr x14, [x14, #16] 0x0000000102f0db98: ldr x12, [x3, #16] 0x0000000102f0db9c: ldr w11, [x10, #172] 0x0000000102f0dba0: add x11, x10, x11, uxtx #3 0x0000000102f0dba4: add x11, x11, #0x1d8 0x0000000102f0dba8: add x10, x10, x12, uxtx #3 0x0000000102f0dbac: ldr x12, [x11] 0x0000000102f0dbb0: cmp x14, x12 0x0000000102f0dbb4: b.eq 0x0000000102f0dbc8 // b.none 0x0000000102f0dbb8: cbz x12, 0x0000000102f0dbe0 0x0000000102f0dbbc: ldr x12, [x11, #16]! 
0x0000000102f0dbc0: cmp x14, x12 0x0000000102f0dbc4: b.ne 0x0000000102f0dbb8 // b.any 0x0000000102f0dbc8: ldr w11, [x11, #8] 0x0000000102f0dbcc: ldr x12, [x10, w11, uxtw] 0x0000000102f0dbd0: cbz x12, 0x0000000102f0dbdc 0x0000000102f0dbd4: ldr x8, [x12, #64] 0x0000000102f0dbd8: br x8 0x0000000102f0dbdc: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} 0x0000000102f0dbe0: b 0x0000000102e9dd80 ; {runtime_call IncompatibleClassChangeError throw_exception} [Stub Code] 0x0000000102f0dbe4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 224 20 n java.lang.invoke.MethodHandle::linkToStatic(LL)I (native) total in heap [0x0000000102f0e108,0x0000000102f0e220] = 280 relocation [0x0000000102f0e1e0,0x0000000102f0e1e8] = 8 main code [0x0000000102f0e200,0x0000000102f0e220] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c5531f8} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/invoke/MemberName;)I' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0e200: nop 0x0000000102f0e204: ldr w12, [x2, #36] 0x0000000102f0e208: lsl x12, x12, #3 0x0000000102f0e20c: ldr x12, [x12, #16] 0x0000000102f0e210: cbz x12, 0x0000000102f0e21c 0x0000000102f0e214: ldr x8, [x12, #64] 0x0000000102f0e218: br x8 0x0000000102f0e21c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 226 21 n java.lang.invoke.MethodHandle::linkToStatic(LLL)I (native) total in heap [0x0000000102f0e408,0x0000000102f0e520] = 280 relocation 
[0x0000000102f0e4e0,0x0000000102f0e4e8] = 8 main code [0x0000000102f0e500,0x0000000102f0e520] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c555080} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)I' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0e500: nop 0x0000000102f0e504: ldr w12, [x3, #36] 0x0000000102f0e508: lsl x12, x12, #3 0x0000000102f0e50c: ldr x12, [x12, #16] 0x0000000102f0e510: cbz x12, 0x0000000102f0e51c 0x0000000102f0e514: ldr x8, [x12, #64] 0x0000000102f0e518: br x8 0x0000000102f0e51c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 227 22 n java.lang.invoke.MethodHandle::linkToVirtual(LL)L (native) total in heap [0x0000000102f0e708,0x0000000102f0e828] = 288 relocation [0x0000000102f0e7e0,0x0000000102f0e7e8] = 8 main code [0x0000000102f0e800,0x0000000102f0e828] = 40 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c555590} 'linkToVirtual' '(Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0e800: nop 0x0000000102f0e804: ldr w10, [x1, #8] 0x0000000102f0e808: eor x10, x10, #0x700000000000 0x0000000102f0e80c: ldr x11, 
[x2, #16] 0x0000000102f0e810: add x12, x10, x11, uxtx #3 0x0000000102f0e814: ldr x12, [x12, #472] 0x0000000102f0e818: cbz x12, 0x0000000102f0e824 0x0000000102f0e81c: ldr x8, [x12, #64] 0x0000000102f0e820: br x8 0x0000000102f0e824: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 229 23 n java.lang.invoke.MethodHandle::linkToSpecial(LL)I (native) total in heap [0x0000000102f0ea08,0x0000000102f0eb28] = 288 relocation [0x0000000102f0eae0,0x0000000102f0eae8] = 8 main code [0x0000000102f0eb00,0x0000000102f0eb24] = 36 stub code [0x0000000102f0eb24,0x0000000102f0eb28] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c55ec78} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/invoke/MemberName;)I' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f0eb00: nop 0x0000000102f0eb04: ldr xzr, [x1] 0x0000000102f0eb08: ldr w12, [x2, #36] 0x0000000102f0eb0c: lsl x12, x12, #3 0x0000000102f0eb10: ldr x12, [x12, #16] 0x0000000102f0eb14: cbz x12, 0x0000000102f0eb20 0x0000000102f0eb18: ldr x8, [x12, #64] 0x0000000102f0eb1c: br x8 0x0000000102f0eb20: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f0eb24: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 248 24 n jdk.internal.vm.Continuation::enterSpecial (native) total in heap [0x0000000102f0f088,0x0000000102f0f380] = 760 relocation [0x0000000102f0f160,0x0000000102f0f188] = 40 main code [0x0000000102f0f1c0,0x0000000102f0f330] = 368 stub code 
[0x0000000102f0f330,0x0000000102f0f370] = 64 metadata [0x0000000102f0f370,0x0000000102f0f380] = 16 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Entry Point] # {method} {0x000000011c08c298} 'enterSpecial' '(Ljdk/internal/vm/Continuation;ZZ)V' in 'jdk/internal/vm/Continuation' # parm0: c_rarg1:c_rarg1 = 'jdk/internal/vm/Continuation' # parm1: c_rarg2 = boolean # parm2: c_rarg3 = boolean # [sp+0x50] (sp of caller) 0x0000000102f0f1c0: ldp x2, x1, [x20, #8] 0x0000000102f0f1c4: ldr x3, [x20] 0x0000000102f0f1c8: ldr x8, [x28, #1344] 0x0000000102f0f1cc: cmp sp, x8 0x0000000102f0f1d0: b.ls 0x0000000102f0f1dc // b.plast 0x0000000102f0f1d4: mov x8, sp 0x0000000102f0f1d8: str x8, [x28, #1344] 0x0000000102f0f1dc: stp x29, x30, [sp, #-16]! 0x0000000102f0f1e0: mov x29, sp 0x0000000102f0f1e4: sub sp, sp, #0x40 0x0000000102f0f1e8: ldr x8, [x28, #1336] 0x0000000102f0f1ec: str x8, [sp, #8] 0x0000000102f0f1f0: mov x8, sp 0x0000000102f0f1f4: str x8, [x28, #1336] 0x0000000102f0f1f8: str x1, [sp, #16] 0x0000000102f0f1fc: str w3, [sp, #32] 0x0000000102f0f200: str xzr, [sp, #24] 0x0000000102f0f204: str wzr, [sp, #36] 0x0000000102f0f208: str wzr, [sp, #56] 0x0000000102f0f20c: ldr x8, [x28, #1344] 0x0000000102f0f210: str x8, [sp, #40] 0x0000000102f0f214: ldr x8, [x28, #1360] 0x0000000102f0f218: str x8, [sp, #48] 0x0000000102f0f21c: str xzr, [x28, #1344] 0x0000000102f0f220: str xzr, [x28, #1360] 0x0000000102f0f224: cbnz x2, 0x0000000102f0f2a0 0x0000000102f0f228: bl 0x0000000102ea8580 ; ImmutableOopMap {} ; {static_call} 0x0000000102f0f22c: nop ; {other} 0x0000000102f0f230: movk xzr, #0x1a4 0x0000000102f0f234: movk xzr, #0x0 0x0000000102f0f238: b 0x0000000102f0f2b0 0x0000000102f0f23c: nop [Verified Entry Point] 0x0000000102f0f240: stp x29, x30, [sp, #-16]! 
0x0000000102f0f244: mov x29, sp 0x0000000102f0f248: sub sp, sp, #0x40 0x0000000102f0f24c: ldr x8, [x28, #1336] 0x0000000102f0f250: str x8, [sp, #8] 0x0000000102f0f254: mov x8, sp 0x0000000102f0f258: str x8, [x28, #1336] 0x0000000102f0f25c: str x1, [sp, #16] 0x0000000102f0f260: str w3, [sp, #32] 0x0000000102f0f264: str xzr, [sp, #24] 0x0000000102f0f268: str wzr, [sp, #36] 0x0000000102f0f26c: str wzr, [sp, #56] 0x0000000102f0f270: ldr x8, [x28, #1344] 0x0000000102f0f274: str x8, [sp, #40] 0x0000000102f0f278: ldr x8, [x28, #1360] 0x0000000102f0f27c: str x8, [sp, #48] 0x0000000102f0f280: str xzr, [x28, #1344] 0x0000000102f0f284: str xzr, [x28, #1360] 0x0000000102f0f288: cbnz x2, 0x0000000102f0f2a0 0x0000000102f0f28c: bl 0x0000000102ea8580 ; ImmutableOopMap {} ; {static_call} 0x0000000102f0f290: nop ; {other} 0x0000000102f0f294: movk xzr, #0x208 0x0000000102f0f298: movk xzr, #0x100 0x0000000102f0f29c: b 0x0000000102f0f2b0 0x0000000102f0f2a0: bl Stub::Cont thaw ; ImmutableOopMap {} ; {runtime_call StubRoutines (continuation stubs)} 0x0000000102f0f2a4: nop ; {other} 0x0000000102f0f2a8: movk xzr, #0x21c 0x0000000102f0f2ac: movk xzr, #0x200 0x0000000102f0f2b0: ldr x8, [sp, #40] 0x0000000102f0f2b4: str x8, [x28, #1344] 0x0000000102f0f2b8: ldr x8, [sp, #48] 0x0000000102f0f2bc: str x8, [x28, #1360] 0x0000000102f0f2c0: ldr x9, [sp, #8] 0x0000000102f0f2c4: str x9, [x28, #1336] 0x0000000102f0f2c8: add x29, sp, #0x40 0x0000000102f0f2cc: mov sp, x29 0x0000000102f0f2d0: ldp x29, x30, [sp], #16 0x0000000102f0f2d4: ret [Exception Handler] 0x0000000102f0f2d8: mov x19, x0 0x0000000102f0f2dc: ldr x8, [sp, #40] 0x0000000102f0f2e0: str x8, [x28, #1344] 0x0000000102f0f2e4: ldr x8, [sp, #48] 0x0000000102f0f2e8: str x8, [x28, #1360] 0x0000000102f0f2ec: ldr x9, [sp, #8] 0x0000000102f0f2f0: str x9, [x28, #1336] 0x0000000102f0f2f4: add x29, sp, #0x40 0x0000000102f0f2f8: ldr x1, [x29, #8] 0x0000000102f0f2fc: mov x0, x28 0x0000000102f0f300: stp x8, x12, [sp, #-16]! 
0x0000000102f0f304: mov x8, #0x7460 // #29792 0x0000000102f0f308: movk x8, #0x254, lsl #16 0x0000000102f0f30c: movk x8, #0x1, lsl #32 0x0000000102f0f310: blr x8 0x0000000102f0f314: ldp x8, x12, [sp], #16 0x0000000102f0f318: mov x1, x0 0x0000000102f0f31c: mov x0, x19 0x0000000102f0f320: mov sp, x29 0x0000000102f0f324: ldp x29, x30, [sp], #16 0x0000000102f0f328: mov x3, x30 0x0000000102f0f32c: br x1 [Stub Code] 0x0000000102f0f330: isb ; {no_reloc} 0x0000000102f0f334: mov x12, #0x0 // #0 ; {metadata(nullptr)} 0x0000000102f0f338: movk x12, #0x0, lsl #16 0x0000000102f0f33c: movk x12, #0x0, lsl #32 0x0000000102f0f340: mov x8, #0x0 // #0 0x0000000102f0f344: movk x8, #0x0, lsl #16 0x0000000102f0f348: movk x8, #0x0, lsl #32 0x0000000102f0f34c: br x8 0x0000000102f0f350: isb ; {static_stub} 0x0000000102f0f354: mov x12, #0x0 // #0 ; {metadata(nullptr)} 0x0000000102f0f358: movk x12, #0x0, lsl #16 0x0000000102f0f35c: movk x12, #0x0, lsl #32 0x0000000102f0f360: mov x8, #0x0 // #0 0x0000000102f0f364: movk x8, #0x0, lsl #16 0x0000000102f0f368: movk x8, #0x0, lsl #32 0x0000000102f0f36c: br x8 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 250 25 n jdk.internal.vm.Continuation::doYield (native) total in heap [0x0000000102f0f388,0x0000000102f0f528] = 416 relocation [0x0000000102f0f460,0x0000000102f0f468] = 8 main code [0x0000000102f0f480,0x0000000102f0f528] = 168 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c08c1e0} 'doYield' '()I' in 'jdk/internal/vm/Continuation' # [sp+0x10] (sp of caller) 0x0000000102f0f480: stp x29, x30, [sp, #-16]! 
0x0000000102f0f484: mov x29, sp 0x0000000102f0f488: mov x1, sp 0x0000000102f0f48c: nop ; {other} 0x0000000102f0f490: movk xzr, #0x104 0x0000000102f0f494: movk xzr, #0x0 0x0000000102f0f498: mov x0, x28 0x0000000102f0f49c: adr x8, 0x0000000102f0f48c 0x0000000102f0f4a0: str x8, [x28, #936] 0x0000000102f0f4a4: mov x8, sp 0x0000000102f0f4a8: str x8, [x28, #928] 0x0000000102f0f4ac: str x29, [x28, #944] 0x0000000102f0f4b0: stp x8, x12, [sp, #-16]! 0x0000000102f0f4b4: mov x8, #0xdb70 // #56176 0x0000000102f0f4b8: movk x8, #0x1f3, lsl #16 0x0000000102f0f4bc: movk x8, #0x1, lsl #32 0x0000000102f0f4c0: blr x8 0x0000000102f0f4c4: ldp x8, x12, [sp], #16 0x0000000102f0f4c8: str xzr, [x28, #928] 0x0000000102f0f4cc: str xzr, [x28, #944] 0x0000000102f0f4d0: str xzr, [x28, #936] 0x0000000102f0f4d4: cbnz x0, 0x0000000102f0f4fc 0x0000000102f0f4d8: ldr x8, [x28, #1336] 0x0000000102f0f4dc: mov sp, x8 0x0000000102f0f4e0: ldr x8, [sp, #40] 0x0000000102f0f4e4: str x8, [x28, #1344] 0x0000000102f0f4e8: ldr x8, [sp, #48] 0x0000000102f0f4ec: str x8, [x28, #1360] 0x0000000102f0f4f0: ldr x9, [sp, #8] 0x0000000102f0f4f4: str x9, [x28, #1336] 0x0000000102f0f4f8: add x29, sp, #0x40 0x0000000102f0f4fc: ldr x8, [x28, #8] 0x0000000102f0f500: cbz x8, 0x0000000102f0f51c 0x0000000102f0f504: mov sp, x29 0x0000000102f0f508: ldp x29, x30, [sp], #16 0x0000000102f0f50c: mov x8, #0xc040 // #49216 ; {runtime_call StubRoutines (initial stubs)} 0x0000000102f0f510: movk x8, #0x2e6, lsl #16 0x0000000102f0f514: movk x8, #0x1, lsl #32 0x0000000102f0f518: br x8 0x0000000102f0f51c: mov sp, x29 0x0000000102f0f520: ldp x29, x30, [sp], #16 0x0000000102f0f524: ret -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 336 26 n java.lang.invoke.MethodHandle::linkToStatic(LLL)V (native) total in heap [0x0000000102f12788,0x0000000102f128a0] = 280 relocation [0x0000000102f12860,0x0000000102f12868] = 8 main code [0x0000000102f12880,0x0000000102f128a0] = 32 
[Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c679960} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)V' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f12880: nop 0x0000000102f12884: ldr w12, [x3, #36] 0x0000000102f12888: lsl x12, x12, #3 0x0000000102f1288c: ldr x12, [x12, #16] 0x0000000102f12890: cbz x12, 0x0000000102f1289c 0x0000000102f12894: ldr x8, [x12, #64] 0x0000000102f12898: br x8 0x0000000102f1289c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 337 27 n java.lang.invoke.MethodHandle::invokeBasic(LL)V (native) total in heap [0x0000000102f12a88,0x0000000102f12bb0] = 296 relocation [0x0000000102f12b60,0x0000000102f12b68] = 8 main code [0x0000000102f12b80,0x0000000102f12bb0] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c679a78} 'invokeBasic' '(Ljava/lang/Object;Ljava/lang/Object;)V' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = 'java/lang/Object' # parm1: c_rarg3:c_rarg3 = 'java/lang/Object' # [sp+0x0] (sp of caller) 0x0000000102f12b80: nop 0x0000000102f12b84: ldr w12, [x1, #20] 0x0000000102f12b88: lsl x12, x12, #3 0x0000000102f12b8c: ldr w12, [x12, #40] 0x0000000102f12b90: lsl x12, x12, #3 0x0000000102f12b94: ldr w12, [x12, #36] 
0x0000000102f12b98: lsl x12, x12, #3 0x0000000102f12b9c: ldr x12, [x12, #16] 0x0000000102f12ba0: cbz x12, 0x0000000102f12bac 0x0000000102f12ba4: ldr x8, [x12, #64] 0x0000000102f12ba8: br x8 0x0000000102f12bac: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 340 28 n java.lang.invoke.MethodHandle::linkToStatic(LLLLL)L (native) total in heap [0x0000000102f12d88,0x0000000102f12ea0] = 280 relocation [0x0000000102f12e60,0x0000000102f12e68] = 8 main code [0x0000000102f12e80,0x0000000102f12ea0] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c67d468} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/Object' # parm4: c_rarg5:c_rarg5 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f12e80: nop 0x0000000102f12e84: ldr w12, [x5, #36] 0x0000000102f12e88: lsl x12, x12, #3 0x0000000102f12e8c: ldr x12, [x12, #16] 0x0000000102f12e90: cbz x12, 0x0000000102f12e9c 0x0000000102f12e94: ldr x8, [x12, #64] 0x0000000102f12e98: br x8 0x0000000102f12e9c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 341 29 n java.lang.invoke.MethodHandle::invokeBasic(LLLL)L (native) total in heap [0x0000000102f13088,0x0000000102f131b0] = 296 relocation [0x0000000102f13160,0x0000000102f13168] = 8 main code 
[0x0000000102f13180,0x0000000102f131b0] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c67d8f8} 'invokeBasic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = 'java/lang/Object' # parm1: c_rarg3:c_rarg3 = 'java/lang/Object' # parm2: c_rarg4:c_rarg4 = 'java/lang/Object' # parm3: c_rarg5:c_rarg5 = 'java/lang/Object' # [sp+0x0] (sp of caller) 0x0000000102f13180: nop 0x0000000102f13184: ldr w12, [x1, #20] 0x0000000102f13188: lsl x12, x12, #3 0x0000000102f1318c: ldr w12, [x12, #40] 0x0000000102f13190: lsl x12, x12, #3 0x0000000102f13194: ldr w12, [x12, #36] 0x0000000102f13198: lsl x12, x12, #3 0x0000000102f1319c: ldr x12, [x12, #16] 0x0000000102f131a0: cbz x12, 0x0000000102f131ac 0x0000000102f131a4: ldr x8, [x12, #64] 0x0000000102f131a8: br x8 0x0000000102f131ac: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 344 30 n java.lang.invoke.MethodHandle::linkToStatic(LLLLLL)L (native) total in heap [0x0000000102f13388,0x0000000102f134a0] = 280 relocation [0x0000000102f13460,0x0000000102f13468] = 8 main code [0x0000000102f13480,0x0000000102f134a0] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c67f2e0} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 
'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/Object' # parm4: c_rarg5:c_rarg5 = 'java/lang/Object' # parm5: c_rarg6:c_rarg6 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f13480: nop 0x0000000102f13484: ldr w12, [x6, #36] 0x0000000102f13488: lsl x12, x12, #3 0x0000000102f1348c: ldr x12, [x12, #16] 0x0000000102f13490: cbz x12, 0x0000000102f1349c 0x0000000102f13494: ldr x8, [x12, #64] 0x0000000102f13498: br x8 0x0000000102f1349c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 345 31 n java.lang.invoke.MethodHandle::invokeBasic(LLLLL)L (native) total in heap [0x0000000102f13688,0x0000000102f137b0] = 296 relocation [0x0000000102f13760,0x0000000102f13768] = 8 main code [0x0000000102f13780,0x0000000102f137b0] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c67f5a8} 'invokeBasic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = 'java/lang/Object' # parm1: c_rarg3:c_rarg3 = 'java/lang/Object' # parm2: c_rarg4:c_rarg4 = 'java/lang/Object' # parm3: c_rarg5:c_rarg5 = 'java/lang/Object' # parm4: c_rarg6:c_rarg6 = 'java/lang/Object' # [sp+0x0] (sp of caller) 0x0000000102f13780: nop 0x0000000102f13784: ldr w12, [x1, #20] 0x0000000102f13788: lsl x12, x12, #3 0x0000000102f1378c: ldr w12, [x12, #40] 0x0000000102f13790: lsl x12, x12, #3 0x0000000102f13794: ldr w12, [x12, #36] 
0x0000000102f13798: lsl x12, x12, #3 0x0000000102f1379c: ldr x12, [x12, #16] 0x0000000102f137a0: cbz x12, 0x0000000102f137ac 0x0000000102f137a4: ldr x8, [x12, #64] 0x0000000102f137a8: br x8 0x0000000102f137ac: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 346 32 n java.lang.invoke.MethodHandle::invokeBasic(LLL)L (native) total in heap [0x0000000102f13988,0x0000000102f13ab0] = 296 relocation [0x0000000102f13a60,0x0000000102f13a68] = 8 main code [0x0000000102f13a80,0x0000000102f13ab0] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c67f870} 'invokeBasic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = 'java/lang/Object' # parm1: c_rarg3:c_rarg3 = 'java/lang/Object' # parm2: c_rarg4:c_rarg4 = 'java/lang/Object' # [sp+0x0] (sp of caller) 0x0000000102f13a80: nop 0x0000000102f13a84: ldr w12, [x1, #20] 0x0000000102f13a88: lsl x12, x12, #3 0x0000000102f13a8c: ldr w12, [x12, #40] 0x0000000102f13a90: lsl x12, x12, #3 0x0000000102f13a94: ldr w12, [x12, #36] 0x0000000102f13a98: lsl x12, x12, #3 0x0000000102f13a9c: ldr x12, [x12, #16] 0x0000000102f13aa0: cbz x12, 0x0000000102f13aac 0x0000000102f13aa4: ldr x8, [x12, #64] 0x0000000102f13aa8: br x8 0x0000000102f13aac: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 346 33 n java.lang.invoke.MethodHandle::linkToSpecial(LLLLL)L (native) total in heap [0x0000000102f13c88,0x0000000102f13da8] = 288 
relocation [0x0000000102f13d60,0x0000000102f13d68] = 8 main code [0x0000000102f13d80,0x0000000102f13da4] = 36 stub code [0x0000000102f13da4,0x0000000102f13da8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c67f988} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/Object' # parm4: c_rarg5:c_rarg5 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f13d80: nop 0x0000000102f13d84: ldr xzr, [x1] 0x0000000102f13d88: ldr w12, [x5, #36] 0x0000000102f13d8c: lsl x12, x12, #3 0x0000000102f13d90: ldr x12, [x12, #16] 0x0000000102f13d94: cbz x12, 0x0000000102f13da0 0x0000000102f13d98: ldr x8, [x12, #64] 0x0000000102f13d9c: br x8 0x0000000102f13da0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f13da4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 370 34 n java.lang.invoke.MethodHandle::linkToStatic(LL)V (native) total in heap [0x0000000102f15f08,0x0000000102f16020] = 280 relocation [0x0000000102f15fe0,0x0000000102f15fe8] = 8 main code [0x0000000102f16000,0x0000000102f16020] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c6efad0} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/invoke/MemberName;)V' in 'java/lang/invoke/MethodHandle' # 
parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f16000: nop 0x0000000102f16004: ldr w12, [x2, #36] 0x0000000102f16008: lsl x12, x12, #3 0x0000000102f1600c: ldr x12, [x12, #16] 0x0000000102f16010: cbz x12, 0x0000000102f1601c 0x0000000102f16014: ldr x8, [x12, #64] 0x0000000102f16018: br x8 0x0000000102f1601c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 385 35 n java.lang.invoke.MethodHandle::linkToStatic(IL)I (native) total in heap [0x0000000102f16208,0x0000000102f16320] = 280 relocation [0x0000000102f162e0,0x0000000102f162e8] = 8 main code [0x0000000102f16300,0x0000000102f16320] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c795268} 'linkToStatic' '(ILjava/lang/invoke/MemberName;)I' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1 = int # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f16300: nop 0x0000000102f16304: ldr w12, [x2, #36] 0x0000000102f16308: lsl x12, x12, #3 0x0000000102f1630c: ldr x12, [x12, #16] 0x0000000102f16310: cbz x12, 0x0000000102f1631c 0x0000000102f16314: ldr x8, [x12, #64] 0x0000000102f16318: br x8 0x0000000102f1631c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 393 36 n java.lang.invoke.MethodHandle::linkToStatic(LLLLL)I (native) total in heap [0x0000000102f18808,0x0000000102f18920] = 280 relocation [0x0000000102f188e0,0x0000000102f188e8] = 8 main code [0x0000000102f18900,0x0000000102f18920] = 32 [Disassembly] 
-------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7c5e80} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)I' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/Object' # parm4: c_rarg5:c_rarg5 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f18900: nop 0x0000000102f18904: ldr w12, [x5, #36] 0x0000000102f18908: lsl x12, x12, #3 0x0000000102f1890c: ldr x12, [x12, #16] 0x0000000102f18910: cbz x12, 0x0000000102f1891c 0x0000000102f18914: ldr x8, [x12, #64] 0x0000000102f18918: br x8 0x0000000102f1891c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] ============================= C2-compiled nmethod ============================== ----------------------------------- Assembly ----------------------------------- Compiled method (c2) 397 37 java.lang.AbstractStringBuilder::appendNull (75 bytes) total in heap [0x0000000102f1cf88,0x0000000102f1d558] = 1488 relocation [0x0000000102f1d060,0x0000000102f1d0b8] = 88 main code [0x0000000102f1d0c0,0x0000000102f1d4f0] = 1072 stub code [0x0000000102f1d4f0,0x0000000102f1d520] = 48 metadata [0x0000000102f1d520,0x0000000102f1d558] = 56 immutable data [0x0000000144856800,0x0000000144856c78] = 1144 dependencies [0x0000000144856800,0x0000000144856810] = 16 nul chk table [0x0000000144856810,0x0000000144856820] = 16 handler table [0x0000000144856820,0x0000000144856850] = 48 scopes pcs [0x0000000144856850,0x0000000144856b20] = 720 scopes data [0x0000000144856b20,0x0000000144856c78] = 344 
[Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Instructions begin] 0x0000000102f1d0c0: nop 0x0000000102f1d0c4: nop 0x0000000102f1d0c8: nop [Entry Point] # {method} {0x000000011c0cefd0} 'appendNull' '()Ljava/lang/AbstractStringBuilder;' in 'java/lang/AbstractStringBuilder' # [sp+0x50] (sp of caller) 0x0000000102f1d0cc: ldr w8, [x1, #8] 0x0000000102f1d0d0: ldr w10, [x9, #8] 0x0000000102f1d0d4: cmp w8, w10 0x0000000102f1d0d8: b.eq 0x0000000102f1d0e0 // b.none 0x0000000102f1d0dc: b 0x0000000102ea7c80 ; {runtime_call ic_miss_stub} [Verified Entry Point] 0x0000000102f1d0e0: nop 0x0000000102f1d0e4: sub x9, sp, #0x14, lsl #12 0x0000000102f1d0e8: str xzr, [x9] 0x0000000102f1d0ec: sub sp, sp, #0x50 0x0000000102f1d0f0: stp x29, x30, [sp, #64] 0x0000000102f1d0f4: ldr w8, 0x0000000102f1d4ec 0x0000000102f1d0f8: ldr w9, [x28, #32] 0x0000000102f1d0fc: cmp x8, x9 0x0000000102f1d100: b.ne 0x0000000102f1d4d8 // b.any;*synchronization entry ; - java.lang.AbstractStringBuilder::appendNull at -1 (line 711) 0x0000000102f1d104: ldr w10, [x1, #20] 0x0000000102f1d108: mov x19, x1 0x0000000102f1d10c: lsl x14, x10, #3 ;*getfield value {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 1 (line 316) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d110: ldr w15, [x14, #12] ; implicit exception: dispatches to 0x0000000102f1d4a0 ;*arraylength {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 4 (line 316) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d114: ldrsb w12, [x1, #16] ;*getfield coder {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 6 (line 316) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d118: 
ldr w17, [x1, #12] ;*getfield count {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::appendNull at 1 (line 711) 0x0000000102f1d11c: asr w11, w15, w12 0x0000000102f1d120: sub w13, w17, w11 0x0000000102f1d124: add w11, w13, #0x4 0x0000000102f1d128: add w13, w17, #0x4 ;*iadd {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::appendNull at 8 (line 712) 0x0000000102f1d12c: cmp w11, #0x0 0x0000000102f1d130: b.gt 0x0000000102f1d17c 0x0000000102f1d134: lsl x10, x10, #3 ;*getfield value {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::appendNull at 13 (line 713) 0x0000000102f1d138: cbz w12, 0x0000000102f1d3d8 ;*ifeq {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::appendNull at 21 (line 714) 0x0000000102f1d13c: mov w11, #0x6e // #110 0x0000000102f1d140: add x10, x10, w17, sxtw #1 0x0000000102f1d144: strh w11, [x10, #16] ;*invokestatic putChar {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.StringUTF16::putCharsAt at 3 (line 1552) ; - java.lang.AbstractStringBuilder::appendNull at 63 (line 722) 0x0000000102f1d148: mov w11, #0x75 // #117 0x0000000102f1d14c: mov w12, #0x6c // #108 0x0000000102f1d150: strh w11, [x10, #18] ;*invokestatic putChar {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.StringUTF16::putCharsAt at 11 (line 1553) ; - java.lang.AbstractStringBuilder::appendNull at 63 (line 722) 0x0000000102f1d154: strh w12, [x10, #20] ;*invokestatic putChar {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.StringUTF16::putCharsAt at 20 (line 1554) ; - java.lang.AbstractStringBuilder::appendNull at 63 (line 722) 0x0000000102f1d158: strh w12, [x10, #22] ;*invokestatic putChar {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.StringUTF16::putCharsAt at 29 (line 1555) ; - java.lang.AbstractStringBuilder::appendNull at 63 (line 722) 0x0000000102f1d15c: str w13, [x19, #12] ;*synchronization entry ; - java.lang.AbstractStringBuilder::appendNull at -1 (line 711) 
0x0000000102f1d160: mov x0, x19 0x0000000102f1d164: ldp x29, x30, [sp, #64] 0x0000000102f1d168: add sp, sp, #0x50 0x0000000102f1d16c: ldr x8, [x28, #1096] ; {poll_return} 0x0000000102f1d170: cmp sp, x8 0x0000000102f1d174: b.hi 0x0000000102f1d4cc // b.pmore 0x0000000102f1d178: ret 0x0000000102f1d17c: stp w15, w13, [sp] 0x0000000102f1d180: str x14, [sp, #8] 0x0000000102f1d184: str w17, [sp, #20] 0x0000000102f1d188: mov x29, x1 0x0000000102f1d18c: mov w2, w13 0x0000000102f1d190: bl 0x0000000102ea7f80 ; ImmutableOopMap {rfp=Oop [8]=Oop } ;*invokevirtual newCapacity {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 24 (line 319) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) ; {optimized virtual_call} 0x0000000102f1d194: nop ; {other} 0x0000000102f1d198: movk xzr, #0x20c 0x0000000102f1d19c: movk xzr, #0x0 ;*invokevirtual newCapacity {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 24 (line 319) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d1a0: mov x19, x29 0x0000000102f1d1a4: ldrsb w10, [x19, #16] 0x0000000102f1d1a8: lsl w14, w0, w10 ;*ishl {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 31 (line 319) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d1ac: ldr w11, [sp] 0x0000000102f1d1b0: cmp w11, w14 0x0000000102f1d1b4: csel w29, w11, w14, lt // lt = tstop;*invokestatic min {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 25 (line 3542) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d1b8: cmp w14, w11 0x0000000102f1d1bc: b.eq 0x0000000102f1d480 // b.none;*if_icmpne {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 3 (line 3537) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 
(line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d1c0: mov w12, w14 0x0000000102f1d1c4: mov x10, #0x17 // #23 0x0000000102f1d1c8: add x12, x10, w12, sxtw ;*newarray {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 15 (line 3540) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d1cc: cmp w11, w29 0x0000000102f1d1d0: b.cc 0x0000000102f1d434 // b.lo, b.ul, b.last 0x0000000102f1d1d4: cmp w14, w29 0x0000000102f1d1d8: b.cc 0x0000000102f1d434 // b.lo, b.ul, b.last ;*invokestatic arraycopy {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 28 (line 3541) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d1dc: and x13, x12, #0xfffffffffffffff8;*newarray {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 15 (line 3540) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d1e0: cmp w14, #0x100, lsl #12 0x0000000102f1d1e4: b.hi 0x0000000102f1d398 // b.pmore 0x0000000102f1d1e8: ldr x20, [x28, #440] 0x0000000102f1d1ec: ldr x11, [x28, #456] 0x0000000102f1d1f0: add x10, x20, x13 0x0000000102f1d1f4: cmp x10, x11 0x0000000102f1d1f8: b.cs 0x0000000102f1d398 // b.hs, b.nlast ;*putfield value {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 35 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d1fc: orr x11, xzr, #0x1 0x0000000102f1d200: str x10, [x28, #440] 0x0000000102f1d204: str x11, [x20] 0x0000000102f1d208: mov x11, #0x40000 // #262144 ; {metadata({type array byte})} 0x0000000102f1d20c: movk x11, #0xa00 0x0000000102f1d210: prfm pstl1keep, [x10, #384] 0x0000000102f1d214: str w11, [x20, #8] 
0x0000000102f1d218: prfm pstl1keep, [x10, #512] 0x0000000102f1d21c: str w14, [x20, #12] 0x0000000102f1d220: prfm pstl1keep, [x10, #640] ;*newarray {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 15 (line 3540) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d224: lsr x11, x12, #3 0x0000000102f1d228: add x1, x20, #0x10 0x0000000102f1d22c: lsr x21, x20, #3 0x0000000102f1d230: cbz w29, 0x0000000102f1d400 0x0000000102f1d234: ldr x10, [sp, #8] 0x0000000102f1d238: add x0, x10, #0x10 0x0000000102f1d23c: cmp w29, w14 0x0000000102f1d240: b.lt 0x0000000102f1d2b4 // b.tstop 0x0000000102f1d244: sub x10, x13, #0x10 0x0000000102f1d248: lsr x2, x10, #3 0x0000000102f1d24c: bl Stub::arrayof_jlong_disjoint_arraycopy ; {runtime_call StubRoutines (final stubs)} 0x0000000102f1d250: nop ; {other} 0x0000000102f1d254: movk xzr, #0x0 0x0000000102f1d258: movk xzr, #0x0 0x0000000102f1d25c: dmb ishst 0x0000000102f1d260: ldrsb w10, [x28, #56] 0x0000000102f1d264: cbnz w10, 0x0000000102f1d304 0x0000000102f1d268: mov x10, x19 0x0000000102f1d26c: mov x11, x20 0x0000000102f1d270: eor x11, x11, x10 0x0000000102f1d274: lsr x11, x11, #23 0x0000000102f1d278: str w21, [x19, #20] 0x0000000102f1d27c: cbz x11, 0x0000000102f1d2a0 0x0000000102f1d280: lsr x10, x10, #9 0x0000000102f1d284: mov x11, #0xc000 // #49152 0x0000000102f1d288: movk x11, #0x5e6, lsl #16 0x0000000102f1d28c: movk x11, #0x1, lsl #32 0x0000000102f1d290: add x0, x11, x10 0x0000000102f1d294: ldrsb w10, [x0] 0x0000000102f1d298: cmp w10, #0x2 0x0000000102f1d29c: b.ne 0x0000000102f1d330 // b.any;*getfield value {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::appendNull at 13 (line 713) 0x0000000102f1d2a0: ldrsb w12, [x19, #16] ;*getfield coder {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::isLatin1 at 7 (line 1791) ; - java.lang.AbstractStringBuilder::appendNull at 18 
(line 714) 0x0000000102f1d2a4: lsr x10, x20, #3 0x0000000102f1d2a8: ldr w13, [sp, #4] 0x0000000102f1d2ac: ldr w17, [sp, #20] 0x0000000102f1d2b0: b 0x0000000102f1d134 0x0000000102f1d2b4: sxtw x2, w29 0x0000000102f1d2b8: add x12, x2, #0x10 0x0000000102f1d2bc: and x10, x12, #0xfffffffffffffff8 0x0000000102f1d2c0: add x10, x20, x10 0x0000000102f1d2c4: sub x11, x11, x12, lsr #3 0x0000000102f1d2c8: subs x8, x11, #0x8 0x0000000102f1d2cc: b.cc 0x0000000102f1d2d4 // b.lo, b.ul, b.last 0x0000000102f1d2d0: bl Stub::zero_blocks ; {runtime_call StubRoutines (final stubs)} 0x0000000102f1d2d4: tbz w11, #2, 0x0000000102f1d2e0 0x0000000102f1d2d8: stp xzr, xzr, [x10], #16 0x0000000102f1d2dc: stp xzr, xzr, [x10], #16 0x0000000102f1d2e0: tbz w11, #1, 0x0000000102f1d2e8 0x0000000102f1d2e4: stp xzr, xzr, [x10], #16 0x0000000102f1d2e8: tbz w11, #0, 0x0000000102f1d2f0 0x0000000102f1d2ec: str xzr, [x10] 0x0000000102f1d2f0: bl Stub::arrayof_jbyte_disjoint_arraycopy ; {runtime_call StubRoutines (final stubs)} 0x0000000102f1d2f4: nop ; {other} 0x0000000102f1d2f8: movk xzr, #0x0 0x0000000102f1d2fc: movk xzr, #0x0 0x0000000102f1d300: b 0x0000000102f1d25c 0x0000000102f1d304: ldr w11, [x19, #20] 0x0000000102f1d308: lsl x0, x11, #3 0x0000000102f1d30c: cbz x0, 0x0000000102f1d268 0x0000000102f1d310: ldr x10, [x28, #40] 0x0000000102f1d314: cbz x10, 0x0000000102f1d450 0x0000000102f1d318: ldr x11, [x28, #48] 0x0000000102f1d31c: sub x12, x10, #0x8 0x0000000102f1d320: add x10, x11, x10 0x0000000102f1d324: stur x0, [x10, #-8] 0x0000000102f1d328: str x12, [x28, #40] 0x0000000102f1d32c: b 0x0000000102f1d268 0x0000000102f1d330: ldp x11, x10, [x28, #64] 0x0000000102f1d334: dmb ish 0x0000000102f1d338: ldrsb w12, [x0] 0x0000000102f1d33c: cbz w12, 0x0000000102f1d388 0x0000000102f1d340: strb wzr, [x0] 0x0000000102f1d344: cbz x11, 0x0000000102f1d35c 0x0000000102f1d348: sub x12, x11, #0x8 0x0000000102f1d34c: add x10, x10, x11 0x0000000102f1d350: stur x0, [x10, #-8] 0x0000000102f1d354: str x12, [x28, #64] 
0x0000000102f1d358: b 0x0000000102f1d388 0x0000000102f1d35c: mov x1, x28 0x0000000102f1d360: adr x9, 0x0000000102f1d378 0x0000000102f1d364: mov x8, #0x6ae4 // #27364 ; {runtime_call G1BarrierSetRuntime::write_ref_field_post_entry(unsigned char volatile*, JavaThread*)} 0x0000000102f1d368: movk x8, #0x200, lsl #16 0x0000000102f1d36c: movk x8, #0x1, lsl #32 0x0000000102f1d370: stp xzr, x9, [sp, #-16]! 0x0000000102f1d374: blr x8 0x0000000102f1d378: nop ; {other} 0x0000000102f1d37c: movk xzr, #0x0 0x0000000102f1d380: movk xzr, #0x0 0x0000000102f1d384: add sp, sp, #0x10 ;*putfield value {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 35 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d388: ldr w11, [x19, #20] 0x0000000102f1d38c: lsl x10, x11, #3 ;*getfield value {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::appendNull at 13 (line 713) 0x0000000102f1d390: mov x20, x10 0x0000000102f1d394: b 0x0000000102f1d2a0 0x0000000102f1d398: str w14, [sp] 0x0000000102f1d39c: stp x19, x12, [sp, #24] 0x0000000102f1d3a0: str x13, [sp, #40] 0x0000000102f1d3a4: mov x1, #0xa00 // #2560 ; {metadata({type array byte})} 0x0000000102f1d3a8: movk x1, #0x4, lsl #16 0x0000000102f1d3ac: movk x1, #0x7000, lsl #32 0x0000000102f1d3b0: mov w2, w14 0x0000000102f1d3b4: bl 0x0000000102eecf00 ; ImmutableOopMap {[8]=Oop [24]=Oop } ;*newarray {reexecute=0 rethrow=0 return_oop=1} ; - java.util.Arrays::copyOf at 15 (line 3540) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) ; {runtime_call _new_array_nozero_Java} 0x0000000102f1d3b8: nop ; {other} 0x0000000102f1d3bc: movk xzr, #0x430 0x0000000102f1d3c0: movk xzr, #0x100 ;*newarray {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 15 (line 3540) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - 
java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d3c4: ldr w14, [sp] 0x0000000102f1d3c8: ldr x19, [sp, #24] 0x0000000102f1d3cc: mov x20, x0 0x0000000102f1d3d0: ldp x12, x13, [sp, #32] 0x0000000102f1d3d4: b 0x0000000102f1d224 0x0000000102f1d3d8: cmp w12, #0x0 ;*invokevirtual isLatin1 {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::appendNull at 18 (line 714) 0x0000000102f1d3dc: cset w29, eq // eq = none ;*ireturn {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::isLatin1 at 18 (line 1791) ; - java.lang.AbstractStringBuilder::appendNull at 18 (line 714) 0x0000000102f1d3e0: mov w1, #0xffffff45 // #-187 0x0000000102f1d3e4: str x19, [sp] 0x0000000102f1d3e8: str w17, [sp, #8] 0x0000000102f1d3ec: str x10, [sp, #16] 0x0000000102f1d3f0: bl 0x0000000102eaed00 ; ImmutableOopMap {[0]=Oop [16]=Oop } ;*ifeq {reexecute=1 rethrow=0 return_oop=0} ; - (reexecute) java.lang.AbstractStringBuilder::appendNull at 21 (line 714) ; {runtime_call UncommonTrapBlob} 0x0000000102f1d3f4: nop ; {other} 0x0000000102f1d3f8: movk xzr, #0x46c 0x0000000102f1d3fc: movk xzr, #0x200 ;*ifeq {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::appendNull at 21 (line 714) 0x0000000102f1d400: mov x10, x1 0x0000000102f1d404: sub x11, x11, #0x2 0x0000000102f1d408: subs x8, x11, #0x8 0x0000000102f1d40c: b.cc 0x0000000102f1d414 // b.lo, b.ul, b.last 0x0000000102f1d410: bl Stub::zero_blocks ; {runtime_call StubRoutines (final stubs)} 0x0000000102f1d414: tbz w11, #2, 0x0000000102f1d420 0x0000000102f1d418: stp xzr, xzr, [x10], #16 0x0000000102f1d41c: stp xzr, xzr, [x10], #16 0x0000000102f1d420: tbz w11, #1, 0x0000000102f1d428 0x0000000102f1d424: stp xzr, xzr, [x10], #16 0x0000000102f1d428: tbz w11, #0, 0x0000000102f1d430 0x0000000102f1d42c: str xzr, [x10] 0x0000000102f1d430: b 0x0000000102f1d25c ;*invokestatic arraycopy {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 28 (line 3541) ; - 
java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d434: mov w1, #0xffffffcc // #-52 0x0000000102f1d438: str x19, [sp] 0x0000000102f1d43c: str w14, [sp, #16] 0x0000000102f1d440: bl 0x0000000102eaed00 ; ImmutableOopMap {[0]=Oop [8]=Oop } ;*newarray {reexecute=1 rethrow=0 return_oop=0} ; - (reexecute) java.util.Arrays::copyOf at 15 (line 3540) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) ; {runtime_call UncommonTrapBlob} 0x0000000102f1d444: nop ; {other} 0x0000000102f1d448: movk xzr, #0x4bc 0x0000000102f1d44c: movk xzr, #0x300 0x0000000102f1d450: mov x1, x28 0x0000000102f1d454: adr x9, 0x0000000102f1d46c 0x0000000102f1d458: mov x8, #0x6abc // #27324 ; {runtime_call G1BarrierSetRuntime::write_ref_field_pre_entry(oopDesc*, JavaThread*)} 0x0000000102f1d45c: movk x8, #0x200, lsl #16 0x0000000102f1d460: movk x8, #0x1, lsl #32 0x0000000102f1d464: stp xzr, x9, [sp, #-16]! 
0x0000000102f1d468: blr x8 0x0000000102f1d46c: nop ; {other} 0x0000000102f1d470: movk xzr, #0x0 0x0000000102f1d474: movk xzr, #0x0 0x0000000102f1d478: add sp, sp, #0x10 ;*putfield value {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 35 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d47c: b 0x0000000102f1d268 0x0000000102f1d480: mov w1, #0xffffff45 // #-187 0x0000000102f1d484: str x19, [sp] 0x0000000102f1d488: str w11, [sp, #16] 0x0000000102f1d48c: str w14, [sp, #24] 0x0000000102f1d490: bl 0x0000000102eaed00 ; ImmutableOopMap {[0]=Oop [8]=Oop } ;*if_icmpne {reexecute=1 rethrow=0 return_oop=0} ; - (reexecute) java.util.Arrays::copyOf at 3 (line 3537) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) ; {runtime_call UncommonTrapBlob} 0x0000000102f1d494: nop ; {other} 0x0000000102f1d498: movk xzr, #0x50c 0x0000000102f1d49c: movk xzr, #0x400 ;*if_icmpne {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 3 (line 3537) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d4a0: mov w1, #0xfffffff6 // #-10 0x0000000102f1d4a4: bl 0x0000000102eaed00 ; ImmutableOopMap {} ;*arraylength {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 4 (line 316) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) ; {runtime_call UncommonTrapBlob} 0x0000000102f1d4a8: nop ; {other} 0x0000000102f1d4ac: movk xzr, #0x520 0x0000000102f1d4b0: movk xzr, #0x500 ;*newarray {reexecute=0 rethrow=0 return_oop=0} ; - java.util.Arrays::copyOf at 15 (line 3540) ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 32 (line 318) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d4b4: mov x1, x0 
0x0000000102f1d4b8: b 0x0000000102f1d4c0 ;*invokevirtual newCapacity {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at 24 (line 319) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d4bc: mov x1, x0 ;*synchronization entry ; - java.lang.AbstractStringBuilder::ensureCapacityInternal at -1 (line 316) ; - java.lang.AbstractStringBuilder::appendNull at 9 (line 712) 0x0000000102f1d4c0: ldp x29, x30, [sp, #64] 0x0000000102f1d4c4: add sp, sp, #0x50 0x0000000102f1d4c8: b 0x0000000102eeed00 ;*getfield value {reexecute=0 rethrow=0 return_oop=0} ; - java.lang.AbstractStringBuilder::appendNull at 13 (line 713) ; {runtime_call _rethrow_Java} 0x0000000102f1d4cc: adr x8, 0x0000000102f1d16c ; {internal_word} 0x0000000102f1d4d0: str x8, [x28, #1120] 0x0000000102f1d4d4: b 0x0000000102eae680 ; {runtime_call SafepointBlob} 0x0000000102f1d4d8: mov x8, #0x16c0 // #5824 ; {runtime_call StubRoutines (final stubs)} 0x0000000102f1d4dc: movk x8, #0x2eb, lsl #16 0x0000000102f1d4e0: movk x8, #0x1, lsl #32 0x0000000102f1d4e4: blr x8 0x0000000102f1d4e8: b 0x0000000102f1d104 0x0000000102f1d4ec: udf #1 ; {other} [Stub Code] 0x0000000102f1d4f0: isb ; {no_reloc} 0x0000000102f1d4f4: mov x12, #0x0 // #0 ; {metadata(nullptr)} 0x0000000102f1d4f8: movk x12, #0x0, lsl #16 0x0000000102f1d4fc: movk x12, #0x0, lsl #32 0x0000000102f1d500: mov x8, #0x0 // #0 0x0000000102f1d504: movk x8, #0x0, lsl #16 0x0000000102f1d508: movk x8, #0x0, lsl #32 0x0000000102f1d50c: br x8 [Exception Handler] 0x0000000102f1d510: b 0x0000000102eed880 ; {runtime_call ExceptionBlob} [Deopt Handler Code] 0x0000000102f1d514: adr x30, 0x0000000102f1d514 0x0000000102f1d518: b 0x0000000102eae980 ; {runtime_call DeoptimizationBlob} 0x0000000102f1d51c: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38661 38 n java.lang.invoke.MethodHandle::linkToSpecial(LLLLLLL)L (native) total in heap 
[0x0000000102f1cc88,0x0000000102f1cda8] = 288 relocation [0x0000000102f1cd60,0x0000000102f1cd68] = 8 main code [0x0000000102f1cd80,0x0000000102f1cda4] = 36 stub code [0x0000000102f1cda4,0x0000000102f1cda8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d03b0} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/Object' # parm4: c_rarg5:c_rarg5 = 'java/lang/Object' # parm5: c_rarg6:c_rarg6 = 'java/lang/Object' # parm6: c_rarg7:c_rarg7 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1cd80: nop 0x0000000102f1cd84: ldr xzr, [x1] 0x0000000102f1cd88: ldr w12, [x7, #36] 0x0000000102f1cd8c: lsl x12, x12, #3 0x0000000102f1cd90: ldr x12, [x12, #16] 0x0000000102f1cd94: cbz x12, 0x0000000102f1cda0 0x0000000102f1cd98: ldr x8, [x12, #64] 0x0000000102f1cd9c: br x8 0x0000000102f1cda0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f1cda4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38663 39 n java.lang.invoke.MethodHandle::linkToStatic(LJL)L (native) total in heap [0x0000000102f1c988,0x0000000102f1caa0] = 280 relocation [0x0000000102f1ca60,0x0000000102f1ca68] = 8 main code [0x0000000102f1ca80,0x0000000102f1caa0] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] 
-------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d0768} 'linkToStatic' '(Ljava/lang/Object;JLjava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = long # parm2: c_rarg3:c_rarg3 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1ca80: nop 0x0000000102f1ca84: ldr w12, [x3, #36] 0x0000000102f1ca88: lsl x12, x12, #3 0x0000000102f1ca8c: ldr x12, [x12, #16] 0x0000000102f1ca90: cbz x12, 0x0000000102f1ca9c 0x0000000102f1ca94: ldr x8, [x12, #64] 0x0000000102f1ca98: br x8 0x0000000102f1ca9c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38664 40 n java.lang.invoke.MethodHandle::invokeBasic(LJ)L (native) total in heap [0x0000000102f1c688,0x0000000102f1c7b0] = 296 relocation [0x0000000102f1c760,0x0000000102f1c768] = 8 main code [0x0000000102f1c780,0x0000000102f1c7b0] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d0880} 'invokeBasic' '(Ljava/lang/Object;J)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = 'java/lang/Object' # parm1: c_rarg3:c_rarg3 = long # [sp+0x0] (sp of caller) 0x0000000102f1c780: nop 0x0000000102f1c784: ldr w12, [x1, #20] 0x0000000102f1c788: lsl x12, x12, #3 0x0000000102f1c78c: ldr w12, [x12, #40] 0x0000000102f1c790: lsl x12, x12, #3 0x0000000102f1c794: ldr w12, [x12, #36] 0x0000000102f1c798: lsl x12, x12, #3 0x0000000102f1c79c: ldr x12, [x12, #16] 0x0000000102f1c7a0: cbz x12, 0x0000000102f1c7ac 0x0000000102f1c7a4: ldr x8, 
[x12, #64] 0x0000000102f1c7a8: br x8 0x0000000102f1c7ac: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38664 41 n java.lang.invoke.MethodHandle::linkToSpecial(LLJL)L (native) total in heap [0x0000000102f1c388,0x0000000102f1c4a8] = 288 relocation [0x0000000102f1c460,0x0000000102f1c468] = 8 main code [0x0000000102f1c480,0x0000000102f1c4a4] = 36 stub code [0x0000000102f1c4a4,0x0000000102f1c4a8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d0b08} 'linkToSpecial' '(Ljava/lang/Object;Ljava/lang/Object;JLjava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = long # parm3: c_rarg4:c_rarg4 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1c480: nop 0x0000000102f1c484: ldr xzr, [x1] 0x0000000102f1c488: ldr w12, [x4, #36] 0x0000000102f1c48c: lsl x12, x12, #3 0x0000000102f1c490: ldr x12, [x12, #16] 0x0000000102f1c494: cbz x12, 0x0000000102f1c4a0 0x0000000102f1c498: ldr x8, [x12, #64] 0x0000000102f1c49c: br x8 0x0000000102f1c4a0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f1c4a4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38665 42 n java.lang.invoke.MethodHandle::linkToStatic(JLJLL)J (native) total in heap [0x0000000102f1bd08,0x0000000102f1be20] = 280 relocation [0x0000000102f1bde0,0x0000000102f1bde8] = 8 main code [0x0000000102f1be00,0x0000000102f1be20] = 32 [Disassembly] 
-------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d0da0} 'linkToStatic' '(JLjava/lang/Object;JLjava/lang/Object;Ljava/lang/invoke/MemberName;)J' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = long # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = long # parm3: c_rarg4:c_rarg4 = 'java/lang/Object' # parm4: c_rarg5:c_rarg5 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1be00: nop 0x0000000102f1be04: ldr w12, [x5, #36] 0x0000000102f1be08: lsl x12, x12, #3 0x0000000102f1be0c: ldr x12, [x12, #16] 0x0000000102f1be10: cbz x12, 0x0000000102f1be1c 0x0000000102f1be14: ldr x8, [x12, #64] 0x0000000102f1be18: br x8 0x0000000102f1be1c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38666 43 n java.lang.invoke.MethodHandle::invokeBasic(JLJL)J (native) total in heap [0x0000000102f1ba08,0x0000000102f1bb30] = 296 relocation [0x0000000102f1bae0,0x0000000102f1bae8] = 8 main code [0x0000000102f1bb00,0x0000000102f1bb30] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d0eb8} 'invokeBasic' '(JLjava/lang/Object;JLjava/lang/Object;)J' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = long # parm1: c_rarg3:c_rarg3 = 'java/lang/Object' # parm2: c_rarg4:c_rarg4 = long # parm3: c_rarg5:c_rarg5 = 'java/lang/Object' # [sp+0x0] (sp of caller) 0x0000000102f1bb00: nop 0x0000000102f1bb04: ldr w12, [x1, #20] 0x0000000102f1bb08: lsl x12, x12, #3 
0x0000000102f1bb0c: ldr w12, [x12, #40] 0x0000000102f1bb10: lsl x12, x12, #3 0x0000000102f1bb14: ldr w12, [x12, #36] 0x0000000102f1bb18: lsl x12, x12, #3 0x0000000102f1bb1c: ldr x12, [x12, #16] 0x0000000102f1bb20: cbz x12, 0x0000000102f1bb2c 0x0000000102f1bb24: ldr x8, [x12, #64] 0x0000000102f1bb28: br x8 0x0000000102f1bb2c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38666 44 n java.lang.invoke.MethodHandle::linkToSpecial(LJLJLL)J (native) total in heap [0x0000000102f1b388,0x0000000102f1b4a8] = 288 relocation [0x0000000102f1b460,0x0000000102f1b468] = 8 main code [0x0000000102f1b480,0x0000000102f1b4a4] = 36 stub code [0x0000000102f1b4a4,0x0000000102f1b4a8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d11a0} 'linkToSpecial' '(Ljava/lang/Object;JLjava/lang/Object;JLjava/lang/Object;Ljava/lang/invoke/MemberName;)J' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = long # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = long # parm4: c_rarg5:c_rarg5 = 'java/lang/Object' # parm5: c_rarg6:c_rarg6 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1b480: nop 0x0000000102f1b484: ldr xzr, [x1] 0x0000000102f1b488: ldr w12, [x6, #36] 0x0000000102f1b48c: lsl x12, x12, #3 0x0000000102f1b490: ldr x12, [x12, #16] 0x0000000102f1b494: cbz x12, 0x0000000102f1b4a0 0x0000000102f1b498: ldr x8, [x12, #64] 0x0000000102f1b49c: br x8 0x0000000102f1b4a0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f1b4a4: udf #0 -------------------------------------------------------------------------------- 
[/Disassembly] Compiled method (n/a) 38668 45 n java.lang.invoke.MethodHandle::invokeBasic(JLJ)J (native) total in heap [0x0000000102f1a988,0x0000000102f1aab0] = 296 relocation [0x0000000102f1aa60,0x0000000102f1aa68] = 8 main code [0x0000000102f1aa80,0x0000000102f1aab0] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d14c8} 'invokeBasic' '(JLjava/lang/Object;J)J' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = long # parm1: c_rarg3:c_rarg3 = 'java/lang/Object' # parm2: c_rarg4:c_rarg4 = long # [sp+0x0] (sp of caller) 0x0000000102f1aa80: nop 0x0000000102f1aa84: ldr w12, [x1, #20] 0x0000000102f1aa88: lsl x12, x12, #3 0x0000000102f1aa8c: ldr w12, [x12, #40] 0x0000000102f1aa90: lsl x12, x12, #3 0x0000000102f1aa94: ldr w12, [x12, #36] 0x0000000102f1aa98: lsl x12, x12, #3 0x0000000102f1aa9c: ldr x12, [x12, #16] 0x0000000102f1aaa0: cbz x12, 0x0000000102f1aaac 0x0000000102f1aaa4: ldr x8, [x12, #64] 0x0000000102f1aaa8: br x8 0x0000000102f1aaac: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38668 46 n java.lang.invoke.MethodHandle::linkToSpecial(LJLJL)J (native) total in heap [0x0000000102f1a688,0x0000000102f1a7a8] = 288 relocation [0x0000000102f1a760,0x0000000102f1a768] = 8 main code [0x0000000102f1a780,0x0000000102f1a7a4] = 36 stub code [0x0000000102f1a7a4,0x0000000102f1a7a8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d15e0} 'linkToSpecial' 
'(Ljava/lang/Object;JLjava/lang/Object;JLjava/lang/invoke/MemberName;)J' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = long # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = long # parm4: c_rarg5:c_rarg5 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1a780: nop 0x0000000102f1a784: ldr xzr, [x1] 0x0000000102f1a788: ldr w12, [x5, #36] 0x0000000102f1a78c: lsl x12, x12, #3 0x0000000102f1a790: ldr x12, [x12, #16] 0x0000000102f1a794: cbz x12, 0x0000000102f1a7a0 0x0000000102f1a798: ldr x8, [x12, #64] 0x0000000102f1a79c: br x8 0x0000000102f1a7a0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f1a7a4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38669 47 n java.lang.invoke.MethodHandle::linkToStatic(JL)L (native) total in heap [0x0000000102f1a008,0x0000000102f1a120] = 280 relocation [0x0000000102f1a0e0,0x0000000102f1a0e8] = 8 main code [0x0000000102f1a100,0x0000000102f1a120] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d1910} 'linkToStatic' '(JLjava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = long # parm1: c_rarg2:c_rarg2 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1a100: nop 0x0000000102f1a104: ldr w12, [x2, #36] 0x0000000102f1a108: lsl x12, x12, #3 0x0000000102f1a10c: ldr x12, [x12, #16] 0x0000000102f1a110: cbz x12, 0x0000000102f1a11c 0x0000000102f1a114: ldr x8, [x12, #64] 0x0000000102f1a118: br x8 0x0000000102f1a11c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} 
-------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38670 48 n java.lang.invoke.MethodHandle::invokeBasic(J)L (native) total in heap [0x0000000102f19d08,0x0000000102f19e30] = 296 relocation [0x0000000102f19de0,0x0000000102f19de8] = 8 main code [0x0000000102f19e00,0x0000000102f19e30] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d1b18} 'invokeBasic' '(J)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = long # [sp+0x0] (sp of caller) 0x0000000102f19e00: nop 0x0000000102f19e04: ldr w12, [x1, #20] 0x0000000102f19e08: lsl x12, x12, #3 0x0000000102f19e0c: ldr w12, [x12, #40] 0x0000000102f19e10: lsl x12, x12, #3 0x0000000102f19e14: ldr w12, [x12, #36] 0x0000000102f19e18: lsl x12, x12, #3 0x0000000102f19e1c: ldr x12, [x12, #16] 0x0000000102f19e20: cbz x12, 0x0000000102f19e2c 0x0000000102f19e24: ldr x8, [x12, #64] 0x0000000102f19e28: br x8 0x0000000102f19e2c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38670 49 n java.lang.invoke.MethodHandle::linkToSpecial(LJL)L (native) total in heap [0x0000000102f19a08,0x0000000102f19b28] = 288 relocation [0x0000000102f19ae0,0x0000000102f19ae8] = 8 main code [0x0000000102f19b00,0x0000000102f19b24] = 36 stub code [0x0000000102f19b24,0x0000000102f19b28] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d1c30} 'linkToSpecial' 
'(Ljava/lang/Object;JLjava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = long # parm2: c_rarg3:c_rarg3 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f19b00: nop 0x0000000102f19b04: ldr xzr, [x1] 0x0000000102f19b08: ldr w12, [x3, #36] 0x0000000102f19b0c: lsl x12, x12, #3 0x0000000102f19b10: ldr x12, [x12, #16] 0x0000000102f19b14: cbz x12, 0x0000000102f19b20 0x0000000102f19b18: ldr x8, [x12, #64] 0x0000000102f19b1c: br x8 0x0000000102f19b20: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f19b24: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38671 50 n java.lang.invoke.MethodHandle::linkToStatic(JJL)J (native) total in heap [0x0000000102f19588,0x0000000102f196a0] = 280 relocation [0x0000000102f19660,0x0000000102f19668] = 8 main code [0x0000000102f19680,0x0000000102f196a0] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d1dd8} 'linkToStatic' '(JJLjava/lang/invoke/MemberName;)J' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = long # parm1: c_rarg2:c_rarg2 = long # parm2: c_rarg3:c_rarg3 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f19680: nop 0x0000000102f19684: ldr w12, [x3, #36] 0x0000000102f19688: lsl x12, x12, #3 0x0000000102f1968c: ldr x12, [x12, #16] 0x0000000102f19690: cbz x12, 0x0000000102f1969c 0x0000000102f19694: ldr x8, [x12, #64] 0x0000000102f19698: br x8 0x0000000102f1969c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 
38671 51 n java.lang.invoke.MethodHandle::invokeBasic(JJ)J (native) total in heap [0x0000000102f1d588,0x0000000102f1d6b0] = 296 relocation [0x0000000102f1d660,0x0000000102f1d668] = 8 main code [0x0000000102f1d680,0x0000000102f1d6b0] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d1ef0} 'invokeBasic' '(JJ)J' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = long # parm1: c_rarg3:c_rarg3 = long # [sp+0x0] (sp of caller) 0x0000000102f1d680: nop 0x0000000102f1d684: ldr w12, [x1, #20] 0x0000000102f1d688: lsl x12, x12, #3 0x0000000102f1d68c: ldr w12, [x12, #40] 0x0000000102f1d690: lsl x12, x12, #3 0x0000000102f1d694: ldr w12, [x12, #36] 0x0000000102f1d698: lsl x12, x12, #3 0x0000000102f1d69c: ldr x12, [x12, #16] 0x0000000102f1d6a0: cbz x12, 0x0000000102f1d6ac 0x0000000102f1d6a4: ldr x8, [x12, #64] 0x0000000102f1d6a8: br x8 0x0000000102f1d6ac: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38672 52 n java.lang.invoke.MethodHandle::linkToSpecial(LJJL)J (native) total in heap [0x0000000102f1d888,0x0000000102f1d9a8] = 288 relocation [0x0000000102f1d960,0x0000000102f1d968] = 8 main code [0x0000000102f1d980,0x0000000102f1d9a4] = 36 stub code [0x0000000102f1d9a4,0x0000000102f1d9a8] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d2008} 'linkToSpecial' '(Ljava/lang/Object;JJLjava/lang/invoke/MemberName;)J' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 
= 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = long # parm2: c_rarg3:c_rarg3 = long # parm3: c_rarg4:c_rarg4 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1d980: nop 0x0000000102f1d984: ldr xzr, [x1] 0x0000000102f1d988: ldr w12, [x4, #36] 0x0000000102f1d98c: lsl x12, x12, #3 0x0000000102f1d990: ldr x12, [x12, #16] 0x0000000102f1d994: cbz x12, 0x0000000102f1d9a0 0x0000000102f1d998: ldr x8, [x12, #64] 0x0000000102f1d99c: br x8 0x0000000102f1d9a0: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f1d9a4: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38698 53 n java.lang.invoke.MethodHandle::linkToStatic(LLLJL)L (native) total in heap [0x0000000102f1df08,0x0000000102f1e020] = 280 relocation [0x0000000102f1dfe0,0x0000000102f1dfe8] = 8 main code [0x0000000102f1e000,0x0000000102f1e020] = 32 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d3a80} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;JLjava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = long # parm4: c_rarg5:c_rarg5 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1e000: nop 0x0000000102f1e004: ldr w12, [x5, #36] 0x0000000102f1e008: lsl x12, x12, #3 0x0000000102f1e00c: ldr x12, [x12, #16] 0x0000000102f1e010: cbz x12, 0x0000000102f1e01c 0x0000000102f1e014: ldr x8, [x12, #64] 0x0000000102f1e018: br x8 0x0000000102f1e01c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} 
-------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38699 54 n java.lang.invoke.MethodHandle::invokeBasic(LLLJ)L (native) total in heap [0x0000000102f1e208,0x0000000102f1e330] = 296 relocation [0x0000000102f1e2e0,0x0000000102f1e2e8] = 8 main code [0x0000000102f1e300,0x0000000102f1e330] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d3bc8} 'invokeBasic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;J)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = 'java/lang/Object' # parm1: c_rarg3:c_rarg3 = 'java/lang/Object' # parm2: c_rarg4:c_rarg4 = 'java/lang/Object' # parm3: c_rarg5:c_rarg5 = long # [sp+0x0] (sp of caller) 0x0000000102f1e300: nop 0x0000000102f1e304: ldr w12, [x1, #20] 0x0000000102f1e308: lsl x12, x12, #3 0x0000000102f1e30c: ldr w12, [x12, #40] 0x0000000102f1e310: lsl x12, x12, #3 0x0000000102f1e314: ldr w12, [x12, #36] 0x0000000102f1e318: lsl x12, x12, #3 0x0000000102f1e31c: ldr x12, [x12, #16] 0x0000000102f1e320: cbz x12, 0x0000000102f1e32c 0x0000000102f1e324: ldr x8, [x12, #64] 0x0000000102f1e328: br x8 0x0000000102f1e32c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38702 55 n java.lang.invoke.MethodHandle::invokeBasic(J)J (native) total in heap [0x0000000102f1e508,0x0000000102f1e630] = 296 relocation [0x0000000102f1e5e0,0x0000000102f1e5e8] = 8 main code [0x0000000102f1e600,0x0000000102f1e630] = 48 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] 
-------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d4b48} 'invokeBasic' '(J)J' in 'java/lang/invoke/MethodHandle' # this: c_rarg1:c_rarg1 = 'java/lang/invoke/MethodHandle' # parm0: c_rarg2:c_rarg2 = long # [sp+0x0] (sp of caller) 0x0000000102f1e600: nop 0x0000000102f1e604: ldr w12, [x1, #20] 0x0000000102f1e608: lsl x12, x12, #3 0x0000000102f1e60c: ldr w12, [x12, #40] 0x0000000102f1e610: lsl x12, x12, #3 0x0000000102f1e614: ldr w12, [x12, #36] 0x0000000102f1e618: lsl x12, x12, #3 0x0000000102f1e61c: ldr x12, [x12, #16] 0x0000000102f1e620: cbz x12, 0x0000000102f1e62c 0x0000000102f1e624: ldr x8, [x12, #64] 0x0000000102f1e628: br x8 0x0000000102f1e62c: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} -------------------------------------------------------------------------------- [/Disassembly] Compiled method (n/a) 38702 56 n java.lang.invoke.MethodHandle::linkToSpecial(LJL)J (native) total in heap [0x0000000102f1e808,0x0000000102f1e928] = 288 relocation [0x0000000102f1e8e0,0x0000000102f1e8e8] = 8 main code [0x0000000102f1e900,0x0000000102f1e924] = 36 stub code [0x0000000102f1e924,0x0000000102f1e928] = 4 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011c7d5010} 'linkToSpecial' '(Ljava/lang/Object;JLjava/lang/invoke/MemberName;)J' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = long # parm2: c_rarg3:c_rarg3 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x0000000102f1e900: nop 0x0000000102f1e904: ldr xzr, [x1] 0x0000000102f1e908: ldr w12, [x3, #36] 0x0000000102f1e90c: lsl x12, x12, #3 0x0000000102f1e910: ldr x12, [x12, #16] 0x0000000102f1e914: cbz x12, 0x0000000102f1e920 0x0000000102f1e918: ldr x8, [x12, 
#64] 0x0000000102f1e91c: br x8 0x0000000102f1e920: b 0x0000000102e9e080 ; {runtime_call AbstractMethodError throw_exception} [Stub Code] 0x0000000102f1e924: udf #0 -------------------------------------------------------------------------------- [/Disassembly] Process finished with exit code 0 ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2160211224 From epeter at openjdk.org Tue Jun 11 09:37:12 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 11 Jun 2024 09:37:12 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: <3Glk5fqUv6mOnaBeQZIMctUDhZzASCQEf4VNpTaEfvE=.ee098533-80c0-4bc1-8ae1-c8551b2c9ac6@github.com> References: <9_OMKYvdx-xW6KC3sKovIYawgaRZfkKyvU97jcuskzM=.484f62c9-521c-47b5-a68a-6abc89c0e768@github.com> <3Glk5fqUv6mOnaBeQZIMctUDhZzASCQEf4VNpTaEfvE=.ee098533-80c0-4bc1-8ae1-c8551b2c9ac6@github.com> Message-ID: On Tue, 11 Jun 2024 09:17:00 GMT, Shaojin Wen wrote: >> @wenshao >>> @eme64 It seems that when the following code uses StringUTF16.putChar, C2's optimization is not as good as the manual merging and storage effect. >> >> As I asked above, you will need to provide some evidence / generated assembly / perf data, and logs from `TraceMergeStores`. I currently do not have time to produce these myself, and I think they would be crucial to determine where the missing performance has gone. See my earlier comment: >> https://github.com/openjdk/jdk/pull/19626#issuecomment-2158533469 >> >> And please also try @cl4es advice here: >> https://github.com/openjdk/jdk/pull/19626#issuecomment-2159509806 >> >> And sure, maybe you need some public API for setting multiple bytes at once, which the `MergeStores` optimization can optimize. I'm a C2 engineer, so I leave that up to the library folks ;) > > @eme64 The assembly information is below, can you take a look and see if it can help you diagnose the problem? 
> > * JavaCode > > class AbstractStringBuilder { > private AbstractStringBuilder appendNull() { > int count = this.count; > ensureCapacityInternal(count + 4); > byte[] val = this.value; > if (isLatin1()) { > val[count ] = 'n'; > val[count + 1] = 'u'; > val[count + 2] = 'l'; > val[count + 3] = 'l'; > } else { > StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l'); > } > this.count = count + 4; > return this; > } > } > > class StringUTF16 { > public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) { > putChar(value, i , c1); > putChar(value, i + 1, c2); > putChar(value, i + 2, c3); > putChar(value, i + 3, c4); > } > } > > > * Apple M1 StringBuilder.appendNull PrintAssembly > > /Users/wenshao/Work/git/jdk/build/macosx-aarch64-server-release/jdk/bin/java -XX:+UnlockDiagnosticVMOptions -XX:+PrintAssembly -XX:CompileCommand=compileonly,*StringBuilder.appendNull -XX:-TieredCompilation -XX:TieredStopAtLevel=4 -javaagent:/Applications/IntelliJ IDEA.app/Contents/lib/idea_rt.jar=61041:/Applications/IntelliJ IDEA.app/Contents/bin -Dfile.encoding=UTF-8 -Dsun.stdout.encoding=UTF-8 -Dsun.stderr.encoding=UTF-8 .... 
> > Compiled method (n/a) 96 1 n java.lang.invoke.MethodHandle::linkToStatic(LLLLLLL)L (native) > total in heap [0x0000000102efba08,0x0000000102efbb20] = 280 > relocation [0x0000000102efbae0,0x0000000102efbae8] = 8 > main code [0x0000000102efbb00,0x0000000102efbb20] = 32 > > [Disassembly] > -------------------------------------------------------------------------------- > [Constant Pool (empty)] > > -------------------------------------------------------------------------------- > > [Verified Entry Point] > # {method} {0x000000011c3e1c80} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' > # parm0: c_rarg1:c_rarg1 > = 'java/lang/Object' > # parm1: c_rarg2:c_rarg2 > = 'java/lang/Object' > # parm2: c_rarg3:c_rarg3 > = 'java/lang/Object' > # parm3: c_rarg4:c_rarg4 > = 'java/lang/Object' > # parm4: c_rarg5:c_rarg5 > = 'j... @wenshao This is just an assembly dump. You need to have some profiling data that tells you where the time is spent. I'm not going to do the analysis work for you, I'm sorry. I gave you some pointers as how to do that. If you have more questions about how to do that, feel free to ask. You also have not provided the `TraceMergeStores` log yet, as I asked you. Can you investigate WHY there is a performance difference? Which `loads` and `branches` etc are generated? 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2160251736 From duke at openjdk.org Tue Jun 11 09:42:13 2024 From: duke at openjdk.org (Shaojin Wen) Date: Tue, 11 Jun 2024 09:42:13 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: <9_OMKYvdx-xW6KC3sKovIYawgaRZfkKyvU97jcuskzM=.484f62c9-521c-47b5-a68a-6abc89c0e768@github.com> <3Glk5fqUv6mOnaBeQZIMctUDhZzASCQEf4VNpTaEfvE=.ee098533-80c0-4bc1-8ae1-c8551b2c9ac6@github.com> Message-ID: On Tue, 11 Jun 2024 09:32:14 GMT, Emanuel Peter wrote: >> @eme64 The assembly information is below, can you take a look and see if it can help you diagnose the problem? >> >> * JavaCode >> >> class AbstractStringBuilder { >> private AbstractStringBuilder appendNull() { >> int count = this.count; >> ensureCapacityInternal(count + 4); >> byte[] val = this.value; >> if (isLatin1()) { >> val[count ] = 'n'; >> val[count + 1] = 'u'; >> val[count + 2] = 'l'; >> val[count + 3] = 'l'; >> } else { >> StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l'); >> } >> this.count = count + 4; >> return this; >> } >> } >> >> class StringUTF16 { >> public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) { >> putChar(value, i , c1); >> putChar(value, i + 1, c2); >> putChar(value, i + 2, c3); >> putChar(value, i + 3, c4); >> } >> } >> >> >> * Apple M1 StringBuilder.appendNull PrintAssembly >> >> /Users/wenshao/Work/git/jdk/build/macosx-aarch64-server-release/jdk/bin/java -XX:+UnlockDiagnosticVMOptions -XX:+PrintAssembly -XX:CompileCommand=compileonly,*StringBuilder.appendNull -XX:-TieredCompilation -XX:TieredStopAtLevel=4 -javaagent:/Applications/IntelliJ IDEA.app/Contents/lib/idea_rt.jar=61041:/Applications/IntelliJ IDEA.app/Contents/bin -Dfile.encoding=UTF-8 -Dsun.stdout.encoding=UTF-8 -Dsun.stderr.encoding=UTF-8 .... 
>> >> Compiled method (n/a) 96 1 n java.lang.invoke.MethodHandle::linkToStatic(LLLLLLL)L (native) >> total in heap [0x0000000102efba08,0x0000000102efbb20] = 280 >> relocation [0x0000000102efbae0,0x0000000102efbae8] = 8 >> main code [0x0000000102efbb00,0x0000000102efbb20] = 32 >> >> [Disassembly] >> -------------------------------------------------------------------------------- >> [Constant Pool (empty)] >> >> -------------------------------------------------------------------------------- >> >> [Verified Entry Point] >> # {method} {0x000000011c3e1c80} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' >> # parm0: c_rarg1:c_rarg1 >> = 'java/lang/Object' >> # parm1: c_rarg2:c_rarg2 >> = 'java/lang/Object' >> # parm2: c_rarg3:c_rarg3 >> = 'java/lang/Object' >> # parm3: c_rarg4:c... > > @wenshao This is just an assembly dump. You need to have some profiling data that tells you where the time is spent. I'm not going to do the analysis work for you, I'm sorry. I gave you some pointers as how to do that. If you have more questions about how to do that, feel free to ask. You also have not provided the `TraceMergeStores` log yet, as I asked you. > > Can you investigate WHY there is a performance difference? Which `loads` and `branches` etc are generated? @eme64 How to TraceMergeStores? 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2160275514 From epeter at openjdk.org Tue Jun 11 09:46:24 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 11 Jun 2024 09:46:24 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null In-Reply-To: References: <9_OMKYvdx-xW6KC3sKovIYawgaRZfkKyvU97jcuskzM=.484f62c9-521c-47b5-a68a-6abc89c0e768@github.com> <3Glk5fqUv6mOnaBeQZIMctUDhZzASCQEf4VNpTaEfvE=.ee098533-80c0-4bc1-8ae1-c8551b2c9ac6@github.com> Message-ID: On Tue, 11 Jun 2024 09:39:29 GMT, Shaojin Wen wrote: >> @wenshao This is just an assembly dump. You need to have some profiling data that tells you where the time is spent. I'm not going to do the analysis work for you, I'm sorry. I gave you some pointers as how to do that. If you have more questions about how to do that, feel free to ask. You also have not provided the `TraceMergeStores` log yet, as I asked you. >> >> Can you investigate WHY there is a performance difference? Which `loads` and `branches` etc are generated? > > @eme64 How to TraceMergeStores? @wenshao Have you grepped it in the code base? `grep TraceMergeStores src/hotspot/share/ -r` Hence, enable the flag with `-XX:+TraceMergeStores` ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2160285500 From epeter at openjdk.org Tue Jun 11 09:53:17 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 11 Jun 2024 09:53:17 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null [v3] In-Reply-To: References: Message-ID: On Tue, 11 Jun 2024 05:42:27 GMT, Shaojin Wen wrote: >> After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. >> >> This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. 
> > Shaojin Wen has updated the pull request incrementally with one additional commit since the last revision: > > bug fix Also: the assembly that you pasted: does it have the original stores, or do you see that the stores were actually merged? ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2160305076 From duke at openjdk.org Tue Jun 11 09:58:13 2024 From: duke at openjdk.org (Ferenc Rakoczi) Date: Tue, 11 Jun 2024 09:58:13 GMT Subject: RFR: 8333867: SHA3 performance can be improved In-Reply-To: References: Message-ID: <9k5Cf1pnyqps-ajlB7hVNb7w_XYmTNP7uTDKd7pePVE=.34f3b680-fe4f-4069-b8b2-a2f20cfb235e@github.com> On Mon, 10 Jun 2024 15:01:55 GMT, Ferenc Rakoczi wrote: > This PR removes some unnecessary conversions between byte arrays and long arrays during SHA3 digest computations. Some microbenchmark data (the percentages are improvements in ops/sec): Benchmark Linux aarch64 Linux x64 MacOSX aarch64 MacOSX x64 openjdk.bench.java.security.MessageDigests.digest-digesterName:SHA3_256-length:16384-provider:DEFAULT 27.67% 20.60% -0.00% 24.75% openjdk.bench.java.security.MessageDigests.digest-digesterName:SHA3_256-length:64-provider:DEFAULT 11.95% 6.27% -3.97% 12.72% openjdk.bench.java.security.MessageDigests.digest-digesterName:SHA3_512-length:16384-provider:DEFAULT 18.00% 14.99% 0.01% 15.35% openjdk.bench.java.security.MessageDigests.digest-digesterName:SHA3_512-length:64-provider:DEFAULT 11.91% 5.87% -3.39% 10.04% ------------- PR Comment: https://git.openjdk.org/jdk/pull/19632#issuecomment-2160317942 From fgao at openjdk.org Tue Jun 11 09:59:18 2024 From: fgao at openjdk.org (Fei Gao) Date: Tue, 11 Jun 2024 09:59:18 GMT Subject: RFR: 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug" [v3] In-Reply-To: References: <16J-lJ2AceGTVcRWBcP15yKcwO-1IA1XsngyOuNjf7k=.0776f081-ae2c-4279-87cf-d909806c2bc4@github.com> Message-ID: 
<8rCiP8cJqVLboRSHx7Q_7w3qnZyjBTDHoEj2e_C48qU=.f4cc070a-13a0-4717-8262-a7781d8454b7@github.com> On Thu, 6 Jun 2024 15:39:22 GMT, Andrew Haley wrote: > On 6/6/24 13:42, Fei Gao wrote: > > > Sorry, did you mean loading from base plus offset, like `ldr x0, [x6, #8]` or `ldr x0, [x6, x7]`, takes one more cycle than loading from base > > register only, like `ldr x0, [x6]`? Does the address addition take one > > cycle? > > We know that, on many Arm cores, Store µOPs are split into address and data µOPs which are executed separately. That doesn't usually cause any additional delay, because cores execute many operations in parallel, so an address generation µOP for base+offset very probably will execute in parallel with some previous instructions, meaning that the target address is ready before it is needed. This split of address generation must happen regardless of whether a store (or a load) is a single instruction > > `str x0, [x1, #80]` > > or a pair of instructions > > `add r8, x1, #80; str x0, [x8]`. > > Of course, a pair of instructions occupies twice as much icache space, and you can run out of instruction decode bandwidth. However, in the case of Unsafe operations, I don't believe that an occasional unnecessary two-instruction operation will result in a performance regression. Thanks for your kind explanation @theRealAph . That quite makes sense to me. I'll continue processing this pull request to implement it. ------------- PR Comment: https://git.openjdk.org/jdk/pull/16991#issuecomment-2160321273 From fgao at openjdk.org Tue Jun 11 10:04:17 2024 From: fgao at openjdk.org (Fei Gao) Date: Tue, 11 Jun 2024 10:04:17 GMT Subject: RFR: 8321308: AArch64: Fix matching predication for cbz/cbnz [v2] In-Reply-To: References: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> Message-ID: On Mon, 10 Jun 2024 10:29:03 GMT, Andrew Dinn wrote: > Thanks, Fei. Looks good. Thanks @adinn . 
Can I have a second review for the latest commit please? @dean-long @theRealAph @RealFYang @TobiHartmann ------------- PR Comment: https://git.openjdk.org/jdk/pull/16989#issuecomment-2160336097 From qamai at openjdk.org Tue Jun 11 11:11:22 2024 From: qamai at openjdk.org (Quan Anh Mai) Date: Tue, 11 Jun 2024 11:11:22 GMT Subject: RFR: 8323079: Regression of -5% to -11% with SPECjvm2008-MonteCarlo after JDK-8319451 Message-ID: <4Rj1XN-XrPA-f9U9jM89v1iO-mLEa3UlZK6B8p2CtlQ=.0a4d9776-0be7-4b45-b8e6-99a0884a62fe@github.com> Hi, I cannot explain the regression, comparing the current mainline to JDK-21 reveals a decrease in performance, yet it is only for some combinations of OS-GC and perfasm shows that the hot regions (>99% of execution time) do not contain differences that can explain the results. Consequently, with the advice of @TobiHartmann , I propose to effectively revert JDK-8319451 for the generation of `CMove`s inside loops. For those outside, the before-JDK-8319451 probability threshold is 0.001 and the current value is 0.01. I think the current value is more reasonable as evidenced by the benchmark added in JDK-8319451. Please kindly review, thank you very much. 
Quan Anh ------------- Commit messages: - JDK-8323079 Changes: https://git.openjdk.org/jdk/pull/19650/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19650&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8323079 Stats: 1 line in 1 file changed: 0 ins; 0 del; 1 mod Patch: https://git.openjdk.org/jdk/pull/19650.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19650/head:pull/19650 PR: https://git.openjdk.org/jdk/pull/19650 From qamai at openjdk.org Tue Jun 11 11:16:11 2024 From: qamai at openjdk.org (Quan Anh Mai) Date: Tue, 11 Jun 2024 11:16:11 GMT Subject: RFR: 8323079: Regression of -5% to -11% with SPECjvm2008-MonteCarlo after JDK-8319451 In-Reply-To: <4Rj1XN-XrPA-f9U9jM89v1iO-mLEa3UlZK6B8p2CtlQ=.0a4d9776-0be7-4b45-b8e6-99a0884a62fe@github.com> References: <4Rj1XN-XrPA-f9U9jM89v1iO-mLEa3UlZK6B8p2CtlQ=.0a4d9776-0be7-4b45-b8e6-99a0884a62fe@github.com> Message-ID: On Tue, 11 Jun 2024 11:06:11 GMT, Quan Anh Mai wrote: > Hi, > > I cannot explain the regression, comparing the current mainline to JDK-21 reveals a decrease in performance, yet it is only for some combinations of OS-GC and perfasm shows that the hot regions (>99% of execution time) do not contain differences that can explain the results. > > Consequently, with the advice of @TobiHartmann , I propose to effectively revert JDK-8319451 for the generation of `CMove`s inside loops. For those outside, the before-JDK-8319451 probability threshold is 0.001 and the current value is 0.01. I think the current value is more reasonable as evidenced by the benchmark added in JDK-8319451. > > Please kindly review, thank you very much. > Quan Anh @TobiHartmann Can you verify if the regression is solved with this patch. Also, I decided to not include your benchmark. The reason is that it is actually similar to that added by JDK-8319451. For such a small array, I believe the branch predictor can remember the whole sequence, which results in perfect prediction all the time, explaining the results. 
I recall that when making that benchmark, my CPU could remember an array of 1000 elements, which led to me using a huge array of 1 million. Thank you very much. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19650#issuecomment-2160481354 From chagedorn at openjdk.org Tue Jun 11 11:35:20 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 11 Jun 2024 11:35:20 GMT Subject: RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit [v5] In-Reply-To: References: Message-ID: On Thu, 6 Jun 2024 07:41:16 GMT, Christian Hagedorn wrote: >> A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. >> >> #### Idea of Partial Peeling >> >> Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). >> >> #### Partial Peeling with Unsigned Test >> >> However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 >> >> #### Requirements for Using an Unsigned Test >> >> The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: >> - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. >> - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. 
Otherwise, we would exit a loop that we should have continued to execute. >> >> #### The Requirements Are Currently Broken >> >> This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: >> >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 >> >> #### Why Are the Requirements Broken? >> >> The reason is that >> >> i >=u limit >> >> can only be converted into the two signed comparisons >> >> i < 0 || i >= limit >> >> if `limit` is non-negative (i.e. `limit >= 0`): >> https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 >> >> This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. >> >> #### Fixing the Broken Requirements >> To fix this, I've added a ba... > > Christian Hagedorn has updated the pull request incrementally with one additional commit since the last revision: > > Added note about dummy-if Performance testing looked good. Thanks again for your careful reviews! 
------------- PR Comment: https://git.openjdk.org/jdk/pull/19522#issuecomment-2160516296 From chagedorn at openjdk.org Tue Jun 11 11:35:22 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 11 Jun 2024 11:35:22 GMT Subject: Integrated: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit In-Reply-To: References: Message-ID: <3X7uLOcj99d65UySdzojgmWEvy4v0quf9L6UQm9OTsE=.9511f0cb-1a65-42a7-9a99-7e3b779599ca@github.com> On Mon, 3 Jun 2024 12:39:05 GMT, Christian Hagedorn wrote: > A signed test is wrongly split off an unsigned test during Partial Peeling which results in not entering a loop even though it should. > > #### Idea of Partial Peeling > > Partial Peeling rotates a loop with the hope to being able to convert the loop into a counted loop later. It is therefore preferable to use signed tests over unsigned tests because counted loops must use a signed loop exit test (i.e. with a `BaseCountedLoopEnd`). > > #### Partial Peeling with Unsigned Test > > However, if there is only a suitable unsigned test, we can still try to use it for Partial Peeling. The idea is to then split off a signed version of the unsigned test and place it right before the unsigned test which can then be used as a loop exit test and later as `BaseCountedLoopEnd`: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3074-L3080 > > #### Requirements for Using an Unsigned Test > > The Signed and Unsigned Loop Exit Test do not need have the same result at runtime. It is sufficient if the Signed Loop Exit Test implies the Unsigned Loop Exit Test: > - If the Signed Loop Exit Test is false, then the (original) Unsigned Loop Exit Test will make sure to exit the loop if required. > - If the Signed Loop Exit Test is true, then the (original) Unsigned Loop Exit Test must also be true. Otherwise, we would exit a loop that we should have continued to execute. 
> > #### The Requirements Are Currently Broken > > This strong requirement for splitting off a signed test is currently broken as seen in the test cases (for example, `testWhileLTIncr()`): We split off the signed loop exit test `i >= limit`, then partial peel and we get the signed loop exit test `i >= limit` as entry guard to the loop which is wrong: > > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/test/hotspot/jtreg/compiler/loopopts/TestPartialPeelAtUnsignedTestsNegativeLimit.java#L159-L178 > > #### Why Are the Requirements Broken? > > The reason is that > > i >=u limit > > can only be converted into the two signed comparisons > > i < 0 || i >= limit > > if `limit` is non-negative (i.e. `limit >= 0`): > https://github.com/openjdk/jdk/blob/0ea3bacf48be90d93f9e6c8e6568a0b8c61afb46/src/hotspot/share/opto/loopopts.cpp#L3054-L3061 > > This is currently missing and we wrongly use `i < 0` or `i >= limit` as split off signed test. > > #### Fixing the Broken Requirements > To fix this, I've added a bailout when `limit` could be negative which is the same as checking if the type of ... This pull request has now been integrated. Changeset: ef101f1b Author: Christian Hagedorn URL: https://git.openjdk.org/jdk/commit/ef101f1bf20f2813f855af4bc4eb317565175208 Stats: 493 lines in 2 files changed: 455 ins; 0 del; 38 mod 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit Reviewed-by: kvn, thartmann, epeter ------------- PR: https://git.openjdk.org/jdk/pull/19522 From duke at openjdk.org Tue Jun 11 11:35:28 2024 From: duke at openjdk.org (Shaojin Wen) Date: Tue, 11 Jun 2024 11:35:28 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null [v4] In-Reply-To: References: Message-ID: > After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. 
> > This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. Shaojin Wen has updated the pull request incrementally with one additional commit since the last revision: revert ------------- Changes: - all: https://git.openjdk.org/jdk/pull/19626/files - new: https://git.openjdk.org/jdk/pull/19626/files/f96cde4e..27a3050a Webrevs: - full: https://webrevs.openjdk.org/?repo=jdk&pr=19626&range=03 - incr: https://webrevs.openjdk.org/?repo=jdk&pr=19626&range=02-03 Stats: 18 lines in 1 file changed: 0 ins; 9 del; 9 mod Patch: https://git.openjdk.org/jdk/pull/19626.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19626/head:pull/19626 PR: https://git.openjdk.org/jdk/pull/19626 From aph at openjdk.org Tue Jun 11 11:42:28 2024 From: aph at openjdk.org (Andrew Haley) Date: Tue, 11 Jun 2024 11:42:28 GMT Subject: RFR: 8321308: AArch64: Fix matching predication for cbz/cbnz [v2] In-Reply-To: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> References: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> Message-ID: On Thu, 6 Jun 2024 14:35:29 GMT, Fei Gao wrote: >> For array length check like: >> >> if (a.length > 0) { >> [Block 1] >> } else { >> [Block 2] >> } >> >> >> Since `a.length` is unsigned, it's semantically equivalent to: >> >> if (a.length != 0) { >> [Block 1] >> } else { >> [Block 2] >> } >> >> >> On aarch64 port, we can do the conversion like above, during c2 compiler instruction matching, for certain unsigned integral comparisons. 
>> >> For example, >> >> cmpw w11, #0 # unsigned >> bls label # unsigned >> [Block 1] >> >> label: >> [Block 2] >> >> >> can be converted to: >> >> cbz w11, label >> [Block 1] >> >> label: >> [Block 2] >> >> >> Currently, we have some matching rules to do the conversion [[1]](https://github.com/openjdk/jdk/blob/4f1a10f84bcfadef263a0890b6834ccd3d5bb52f/src/hotspot/cpu/aarch64/aarch64.ad#L15688). But the predicate here [[2]](https://github.com/openjdk/jdk/blob/4f1a10f84bcfadef263a0890b6834ccd3d5bb52f/src/hotspot/cpu/aarch64/aarch64.ad#L5631) matches wrong `BoolTest` masks, so these rules fail to convert. I guess it's a typo introduced in [JDK-8160006](https://bugs.openjdk.org/browse/JDK-8160006). The patch fixes it. > > Fei Gao has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains three additional commits since the last revision: > > - Redefine the interface for cmpOpUEqNeLeGt > - Merge branch 'master' into fg8321308 > - 8321308: AArch64: Fix matching predication for cbz/cbnz > > For array length check like: > ``` > if (a.length > 0) { > [Block 1] > } else { > [Block 2] > } > ``` > > Since `a.length` is unsigned, it's semantically equivalent to: > ``` > if (a.length != 0) { > [Block 1] > } else { > [Block 2] > } > ``` > > On aarch64 port, we can do the conversion like above, during c2 > compiler instruction matching, for certain unsigned integral > comparisons. > > For example, > ``` > cmpw w11, #0 # unsigned > bls label # unsigned > [Block 1] > > label: > [Block 2] > ``` > > can be converted to: > ``` > cbz w11, label > [Block 1] > > label: > [Block 2] > ``` > > Currently, we have some matching rules to do the conversion[1]. > But the predicate here[2] matches wrong `BoolTest` masks, > so these rules fail to convert. I guess it's a typo introduced > in JDK-8160006. The patch fixes it. 
> > [1] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L16179 > [2] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L6140 Marked as reviewed by aph (Reviewer). ------------- PR Review: https://git.openjdk.org/jdk/pull/16989#pullrequestreview-2110182426 From chagedorn at openjdk.org Tue Jun 11 11:54:40 2024 From: chagedorn at openjdk.org (Christian Hagedorn) Date: Tue, 11 Jun 2024 11:54:40 GMT Subject: [jdk23] RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit Message-ID: Hi all, This pull request contains a backport of commit [ef101f1b](https://github.com/openjdk/jdk/commit/ef101f1bf20f2813f855af4bc4eb317565175208) from the [openjdk/jdk](https://git.openjdk.org/jdk) repository. The commit being backported was authored by Christian Hagedorn on 11 Jun 2024 and was reviewed by Vladimir Kozlov, Tobias Hartmann and Emanuel Peter. Thanks! ------------- Commit messages: - Backport ef101f1bf20f2813f855af4bc4eb317565175208 Changes: https://git.openjdk.org/jdk/pull/19653/files Webrev: https://webrevs.openjdk.org/?repo=jdk&pr=19653&range=00 Issue: https://bugs.openjdk.org/browse/JDK-8332920 Stats: 493 lines in 2 files changed: 455 ins; 0 del; 38 mod Patch: https://git.openjdk.org/jdk/pull/19653.diff Fetch: git fetch https://git.openjdk.org/jdk.git pull/19653/head:pull/19653 PR: https://git.openjdk.org/jdk/pull/19653 From fgao at openjdk.org Tue Jun 11 12:02:20 2024 From: fgao at openjdk.org (Fei Gao) Date: Tue, 11 Jun 2024 12:02:20 GMT Subject: RFR: 8321308: AArch64: Fix matching predication for cbz/cbnz [v2] In-Reply-To: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> References: <6FSXKpeMm9wx4EYkn0YFOoxhM1Y7HLR983UBFHNspKs=.2442771b-050f-4a15-b6ef-e9c7fecf7be9@github.com> Message-ID: On Thu, 6 Jun 2024 14:35:29 GMT, Fei Gao wrote: >> For array length check like: >> >> if (a.length > 0) { >> [Block 1] >> } else { >> [Block 2] >> } >> >> 
>> Since `a.length` is unsigned, it's semantically equivalent to: >> >> if (a.length != 0) { >> [Block 1] >> } else { >> [Block 2] >> } >> >> >> On aarch64 port, we can do the conversion like above, during c2 compiler instruction matching, for certain unsigned integral comparisons. >> >> For example, >> >> cmpw w11, #0 # unsigned >> bls label # unsigned >> [Block 1] >> >> label: >> [Block 2] >> >> >> can be converted to: >> >> cbz w11, label >> [Block 1] >> >> label: >> [Block 2] >> >> >> Currently, we have some matching rules to do the conversion [[1]](https://github.com/openjdk/jdk/blob/4f1a10f84bcfadef263a0890b6834ccd3d5bb52f/src/hotspot/cpu/aarch64/aarch64.ad#L15688). But the predicate here [[2]](https://github.com/openjdk/jdk/blob/4f1a10f84bcfadef263a0890b6834ccd3d5bb52f/src/hotspot/cpu/aarch64/aarch64.ad#L5631) matches wrong `BoolTest` masks, so these rules fail to convert. I guess it's a typo introduced in [JDK-8160006](https://bugs.openjdk.org/browse/JDK-8160006). The patch fixes it. > > Fei Gao has updated the pull request with a new target base due to a merge or a rebase. The incremental webrev excludes the unrelated changes brought in by the merge/rebase. The pull request contains three additional commits since the last revision: > > - Redefine the interface for cmpOpUEqNeLeGt > - Merge branch 'master' into fg8321308 > - 8321308: AArch64: Fix matching predication for cbz/cbnz > > For array length check like: > ``` > if (a.length > 0) { > [Block 1] > } else { > [Block 2] > } > ``` > > Since `a.length` is unsigned, it's semantically equivalent to: > ``` > if (a.length != 0) { > [Block 1] > } else { > [Block 2] > } > ``` > > On aarch64 port, we can do the conversion like above, during c2 > compiler instruction matching, for certain unsigned integral > comparisons. 
> > For example, > ``` > cmpw w11, #0 # unsigned > bls label # unsigned > [Block 1] > > label: > [Block 2] > ``` > > can be converted to: > ``` > cbz w11, label > [Block 1] > > label: > [Block 2] > ``` > > Currently, we have some matching rules to do the conversion[1]. > But the predicate here[2] matches wrong `BoolTest` masks, > so these rules fail to convert. I guess it's a typo introduced > in JDK-8160006. The patch fixes it. > > [1] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L16179 > [2] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L6140 Thanks for all your reviews and comments. I'll integrate it. ------------- PR Comment: https://git.openjdk.org/jdk/pull/16989#issuecomment-2160581932 From stuefe at openjdk.org Tue Jun 11 12:05:13 2024 From: stuefe at openjdk.org (Thomas Stuefe) Date: Tue, 11 Jun 2024 12:05:13 GMT Subject: RFR: 8330157: C2: Add a stress flag for bailouts In-Reply-To: References: Message-ID: On Tue, 11 Jun 2024 07:14:20 GMT, Daniel Skantz wrote: > This patch adds a diagnostic/stress flag for C2 bailouts. It can be used to support testing of existing bailouts to prevent issues like [JDK-8318445](https://bugs.openjdk.org/browse/JDK-8318445), and can test for issues only seen at runtime such as [JDK-8326376](https://bugs.openjdk.org/browse/JDK-8326376). It can also be useful if we want to add more bailouts ([JDK-8318900](https://bugs.openjdk.org/browse/JDK-8318900)). > > We check two invariants. > a) Bailouts should be successful starting from any given `failing()` check. > b) The VM should not record a bailout when one is pending (in which case we have continued to optimize for too long). > > a), b) are checked by randomly starting a bailout at calls to `failing()` with a user-given probability. > > The added flag should not have any effect in debug mode. > > Testing: > > T1-5, with flag and without it. 
We want to check that this does not cause any test failures without the flag set, and no unexpected failures with it. Tests failing because of timeout or because an error is printed to output when compilation fails can be expected in some cases. This looks useful. Is it planned to also test bailouts for C1 with the same switch? src/hotspot/share/opto/c2_globals.hpp line 76: > 74: "Stress bailout every n:th time on average") \ > 75: range(1, max_juint) \ > 76: \ "Interval" is usually a time period. Maybe `StressBailoutProbability` ? src/hotspot/share/opto/compile.hpp line 831: > 829: > 830: const CompilationFailureInfo* first_failure_details() const { return _first_failure_details; } > 831: - Why guarantees? Why not assert? Do we really want this stress code in release builds? - If assert, the `skip` parameter can be DEBUG_ONLY. - In any case, I would rename it since is not very clear without examining the source code now. We don't skip the error check, we override the random stress bailout check. E.g. something like `DEBUG_ONLY(bool no_stress_bailout)` ? src/hotspot/share/opto/compile.hpp line 839: > 837: return false; > 838: } > 839: return fail_randomly(StressBailoutInterval); To make this easier to understand, I'd invert the logic to if (StressBailout && !skip) { return fail_randomly } src/hotspot/share/opto/compile.hpp line 843: > 841: > 842: bool fail_randomly(uint invprob) { > 843: guarantee(0 < invprob, "domain error"); - invprob depends on StressBailoutInterval. It never is anything different. Why not hard-code it right here for simplicity? - Does this really need guarantee? Maybe assert, or potentially remove completely? `StressBailoutInterval` is a range-checked option >=1, and if range-checks don't work we notice. Also, since we use it for division, we would get a clear error immediately. src/hotspot/share/opto/compile.hpp line 847: > 845: return false; // debug builds assert on bailouts. 
> 846: #endif > 847: if (!_stress_seed || (random() % invprob)) { - nit, unless comparing boolean expressions, we use explicit comparisons, not ! - but the _stress_seed != 0 exclusion is somewhat odd. We allow the seed to be 0 - since `-XX:StressSeed=0` is valid. So, either we should make sure `StressSeed` can never be 0 by restricting its valid range to >=1. Then, here (or better, inside Compile::random()) we should assert that seed != 0. Alternatively, if StressSeed=0 is supposed to be valid, this exclusion here can be removed. Note that it does not get mirrored by other uses of `Compile::random`. src/hotspot/share/opto/compile.hpp line 853: > 851: return true; > 852: } > 853: Does this function really have to be inline? We only exercise it for StressBailout=true ------------- PR Review: https://git.openjdk.org/jdk/pull/19646#pullrequestreview-2109947885 PR Review Comment: https://git.openjdk.org/jdk/pull/19646#discussion_r1634577626 PR Review Comment: https://git.openjdk.org/jdk/pull/19646#discussion_r1634562062 PR Review Comment: https://git.openjdk.org/jdk/pull/19646#discussion_r1634578749 PR Review Comment: https://git.openjdk.org/jdk/pull/19646#discussion_r1634570032 PR Review Comment: https://git.openjdk.org/jdk/pull/19646#discussion_r1634583753 PR Review Comment: https://git.openjdk.org/jdk/pull/19646#discussion_r1634574933 From duke at openjdk.org Tue Jun 11 12:14:14 2024 From: duke at openjdk.org (Shaojin Wen) Date: Tue, 11 Jun 2024 12:14:14 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null [v4] In-Reply-To: References: Message-ID: On Tue, 11 Jun 2024 11:35:28 GMT, Shaojin Wen wrote: >> After PR https://github.com/openjdk/jdk/pull/16245, C2 optimizes stores into primitive arrays by combining values into larger stores. >> >> This PR rewrites the code of appendNull and append(boolean) methods so that these two methods can be optimized by C2. 
> > Shaojin Wen has updated the pull request incrementally with one additional commit since the last revision: > > revert ([f96cde4e](https://git.openjdk.org/jdk/pull/19626/files/f96cde4e79e12e2ea46e6061f918a69f11d59985)) ([0cbaa5ac](https://git.openjdk.org/jdk/pull/19626/files/0cbaa5ac04873373131f1a7216353a5af5b3e48e)) The previous overriding of StringUTF16.putChar method did not improve performance, so I reverted to the original version. I have collected information on TraceMergeStores, but I don't have enough knowledge to analyze it yet https://github.com/wenshao/jdk/wiki/pr19626_appendNull_0 ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2160603791 From thartmann at openjdk.org Tue Jun 11 13:58:14 2024 From: thartmann at openjdk.org (Tobias Hartmann) Date: Tue, 11 Jun 2024 13:58:14 GMT Subject: [jdk23] RFR: 8332920: C2: Partial Peeling is wrongly applied for CmpU with negative limit In-Reply-To: References: Message-ID: On Tue, 11 Jun 2024 11:48:55 GMT, Christian Hagedorn wrote: > Hi all, > > This pull request contains a backport of commit [ef101f1b](https://github.com/openjdk/jdk/commit/ef101f1bf20f2813f855af4bc4eb317565175208) from the [openjdk/jdk](https://git.openjdk.org/jdk) repository. > > The commit being backported was authored by Christian Hagedorn on 11 Jun 2024 and was reviewed by Vladimir Kozlov, Tobias Hartmann and Emanuel Peter. > > Thanks! Looks good. ------------- Marked as reviewed by thartmann (Reviewer). 
PR Review: https://git.openjdk.org/jdk/pull/19653#pullrequestreview-2110551310 From epeter at openjdk.org Tue Jun 11 15:38:14 2024 From: epeter at openjdk.org (Emanuel Peter) Date: Tue, 11 Jun 2024 15:38:14 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null [v4] In-Reply-To: References: Message-ID: On Tue, 11 Jun 2024 12:11:21 GMT, Shaojin Wen wrote: >> Shaojin Wen has updated the pull request incrementally with one additional commit since the last revision: >> >> revert > > ([f96cde4e](https://git.openjdk.org/jdk/pull/19626/files/f96cde4e79e12e2ea46e6061f918a69f11d59985)) ([0cbaa5ac](https://git.openjdk.org/jdk/pull/19626/files/0cbaa5ac04873373131f1a7216353a5af5b3e48e)) The previous overriding of StringUTF16.putChar method did not improve performance, so I reverted to the original version. > > I have collected information on TraceMergeStores, but I don't have enough knowledge to analyze it yet > > https://github.com/wenshao/jdk/wiki/pr19626_appendNull_0 @wenshao have you published info about `TraceMergeStores` somewhere? It is very well possible that the optimization does not apply in your code. Then you would need to dig into the VM code and see why. ------------- PR Comment: https://git.openjdk.org/jdk/pull/19626#issuecomment-2161066866 From kvn at openjdk.org Tue Jun 11 15:57:13 2024 From: kvn at openjdk.org (Vladimir Kozlov) Date: Tue, 11 Jun 2024 15:57:13 GMT Subject: RFR: 8333867: SHA3 performance can be improved In-Reply-To: References: Message-ID: On Tue, 11 Jun 2024 08:03:48 GMT, Ferenc Rakoczi wrote: >> Well, the intrinsic function treats the input and state as long arrays anyways, and so it only works on little endian architectures, where the conversion is a no-op. 
There is no additional array copy, this b2lLittle() call used to be in the keccak() method (along with the conversion back to byte array), the point of this whole change is that only one of these conversions should be done with every keccak() call (an additional benefit is that the xor and the corresponding loads+store is done on longs, not on bytes). > > Oh, and about the length: buffer is allocated in the constructor of the parent class (DigestBase) like this: > buffer = new byte[blockSize]; > Here blockSize is one of { 72, 104, 136, 144, 168 }, so divisible by 8. > buffer.length was used before probably because blockSize was declared private in DigestBase. I made it protected, because in my opinion it is easier to read the code this way. Thank you for explanation. An other question. Is any other use of `longBuf` array after `implCompress0()` call which load values from it? Because Intrinsic code will not update it. ------------- PR Review Comment: https://git.openjdk.org/jdk/pull/19632#discussion_r1635128980 From duke at openjdk.org Tue Jun 11 15:58:13 2024 From: duke at openjdk.org (Shaojin Wen) Date: Tue, 11 Jun 2024 15:58:13 GMT Subject: RFR: 8333893: Optimization for StringBuilder append boolean & null [v4] In-Reply-To: References: Message-ID: On Tue, 11 Jun 2024 15:35:47 GMT, Emanuel Peter wrote: >> ([f96cde4e](https://git.openjdk.org/jdk/pull/19626/files/f96cde4e79e12e2ea46e6061f918a69f11d59985)) ([0cbaa5ac](https://git.openjdk.org/jdk/pull/19626/files/0cbaa5ac04873373131f1a7216353a5af5b3e48e)) The previous overriding of StringUTF16.putChar method did not improve performance, so I reverted to the original version. >> >> I have collected information on TraceMergeStores, but I don't have enough knowledge to analyze it yet >> >> https://github.com/wenshao/jdk/wiki/pr19626_appendNull_0 > > @wenshao have you published info about `TraceMergeStores` somewhere? It is very well possible that the optimization does not apply in your code. 
Then you would need to dig into the VM code and see why. @eme64 TraceMergeStores are here, but I can't understand these assembly codes # JavaCode class AbstractStringBuilder { private AbstractStringBuilder appendNull() { int count = this.count; ensureCapacityInternal(count + 4); byte[] val = this.value; if (isLatin1()) { val[count ] = 'n'; val[count + 1] = 'u'; val[count + 2] = 'l'; val[count + 3] = 'l'; } else { StringUTF16.putCharsAt(val, count, 'n', 'u', 'l', 'l'); } this.count = count + 4; return this; } } class StringUTF16 { public static void putCharsAt(byte[] value, int i, char c1, char c2, char c3, char c4) { putChar(value, i , c1); putChar(value, i + 1, c2); putChar(value, i + 2, c3); putChar(value, i + 3, c4); } } # TraceMergeStores CompileCommand: compileonly *StringBuilder.appendNull bool compileonly = true Compiled method (n/a) 100 1 n java.lang.invoke.MethodHandle::linkToStatic(LLLLLLL)L (native) total in heap [0x00000001043b8088,0x00000001043b8230] = 424 relocation [0x00000001043b8170,0x00000001043b8188] = 24 main code [0x00000001043b81c0,0x00000001043b8230] = 112 [Disassembly] -------------------------------------------------------------------------------- [Constant Pool (empty)] -------------------------------------------------------------------------------- [Verified Entry Point] # {method} {0x000000011842b8b8} 'linkToStatic' '(Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/Object;Ljava/lang/invoke/MemberName;)Ljava/lang/Object;' in 'java/lang/invoke/MethodHandle' # parm0: c_rarg1:c_rarg1 = 'java/lang/Object' # parm1: c_rarg2:c_rarg2 = 'java/lang/Object' # parm2: c_rarg3:c_rarg3 = 'java/lang/Object' # parm3: c_rarg4:c_rarg4 = 'java/lang/Object' # parm4: c_rarg5:c_rarg5 = 'java/lang/Object' # parm5: c_rarg6:c_rarg6 = 'java/lang/Object' # parm6: c_rarg7:c_rarg7 = 'java/lang/invoke/MemberName' # [sp+0x0] (sp of caller) 0x00000001043b81c0: nop ;; verify_klass { 0x00000001043b81c4: cbz x7, 
0x00000001043b81fc 0x00000001043b81c8: stp x8, x9, [sp, #-16]! 0x00000001043b81cc: ldr w9, [x7, #8] 0x00000001043b81d0: eor x9, x9, #0x400000000000 0x00000001043b81d4: adrp x8, 0x0000000103d1b000 ; {external_word} 0x00000001043b81d8: ldr x8, [x8, #1528] 0x00000001043b81dc: cmp x9, x8 0x00000001043b81e0: b.eq 0x00000001043b8208 // b.none 0x00000001043b81e4: ldr x9, [x9, #64] 0x00000001043b81e8: adrp x8, 0x0000000103d1b000 ; {external_word} 0x00000001043b81ec: ldr x8, [x8, #1528] 0x00000001043b81f0: cmp x9, x8 0x00000001043b81f4: b.eq 0x00000001043b8208 // b.none 0x00000001043b81f8: ldp x8, x9, [sp], #16 ;; MemberName required for invokeVirtual etc. 0x00000001043b81fc: dcps1 #0xdeae 0x00000001043b8200: .inst 0x03aa865f ; undefined 0x00000001043b8204: udf #1 ;; L_ok: 0x00000001043b8208: ldp x8, x9, [sp], #16 ;; } verify_klass 0x00000001043b820c: ldr w12, [x7, #36] 0x00000001043b8210: lsl x12, x12, #3 0x00000001043b8214: ldr x12, [x12, #16] 0x00000001043b8218: cbz x12, 0x00000001043b8224 0x00000001043b821c: ldr x8, [x12, #88] 0x00000001043b8220: br x8 0x00000001043b8224: adrp x8, 0x000000010438f000 ; {runtime_call AbstractMethodError throw_exception} 0x00000001043b8228: add x8, x8, #0x900 0x00000001043b822c: br x8 --------------------------------------------------------