Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/hotspot/share/opto/c2_globals.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,9 @@
product(bool, IncrementalInlineForceCleanup, false, DIAGNOSTIC, \
"do cleanup after every iteration of incremental inlining") \
\
product(bool, IncrementalInlineVector, true, DIAGNOSTIC, \
"Inline fallback implementation of failed vector intrinsics") \
\
product(intx, LiveNodeCountInliningCutoff, 40000, \
"max number of live nodes in a method") \
range(0, max_juint / 8) \
Expand Down
23 changes: 23 additions & 0 deletions src/hotspot/share/opto/callGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,29 @@ CallGenerator* CallGenerator::for_mh_late_inline(ciMethod* caller, ciMethod* cal
return cg;
}

// Late-inline call generator for vector intrinsic call sites. In addition to
// the behavior inherited from LateInlineCallGenerator, generate() queues a
// second generator for the same call node so that the callee's fallback
// implementation can be inlined later (see the IncrementalInlineVector flag).
class LateInlineVectorCallGenerator : public LateInlineCallGenerator {
 public:
  LateInlineVectorCallGenerator(ciMethod* method, CallGenerator* intrinsic_cg)
    : LateInlineCallGenerator(method, intrinsic_cg) {}

  // Marks this generator as the vector flavor of late inlining.
  virtual bool is_vector_late_inline() const { return true; }

  // Runs the inherited generate() first, then (only when
  // IncrementalInlineVector is on) registers the fallback generator with the
  // current compilation. The JVMState produced by the base class is returned
  // unchanged in both cases.
  virtual JVMState* generate(JVMState* jvms) {
    JVMState* result = LateInlineCallGenerator::generate(jvms);
    if (!IncrementalInlineVector) {
      return result;
    }

    // Build a late-inline generator around a plain inline generator for the
    // same method and attach it to this generator's call node.
    // NOTE(review): with_call_node() presumably returns a generator bound to
    // the given call node -- confirm against callGenerator.hpp.
    CallGenerator* regular_cg  = CallGenerator::for_inline(method());
    CallGenerator* late_cg     = CallGenerator::for_late_inline(method(), regular_cg);
    CallGenerator* fallback_cg = late_cg->with_call_node(call_node());
    Compile::current()->add_vector_late_inline(fallback_cg);
    return result;
  }
};

// Factory for the vector late-inline generator: wraps intrinsic_cg in a
// newly allocated LateInlineVectorCallGenerator for method m.
CallGenerator* CallGenerator::for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg) {
  CallGenerator* vector_cg = new LateInlineVectorCallGenerator(m, intrinsic_cg);
  return vector_cg;
}


// Allow inlining decisions to be delayed
class LateInlineVirtualCallGenerator : public VirtualCallGenerator {
private:
Expand Down
2 changes: 2 additions & 0 deletions src/hotspot/share/opto/callGenerator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ class CallGenerator : public ArenaObj {
// same but for method handle calls
virtual bool is_mh_late_inline() const { return false; }
virtual bool is_string_late_inline() const { return false; }
virtual bool is_vector_late_inline() const { return false; }
virtual bool is_boxing_late_inline() const { return false; }
virtual bool is_vector_reboxing_late_inline() const { return false; }
virtual bool is_virtual_late_inline() const { return false; }
Expand Down Expand Up @@ -142,6 +143,7 @@ class CallGenerator : public ArenaObj {
static CallGenerator* for_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_mh_late_inline(ciMethod* caller, ciMethod* callee, bool input_not_const);
static CallGenerator* for_string_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg);
static CallGenerator* for_boxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_vector_reboxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_late_inline_virtual(ciMethod* m, int vtable_index, float expected_uses);
Expand Down
33 changes: 33 additions & 0 deletions src/hotspot/share/opto/compile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,7 @@ void Compile::remove_useless_node(Node* dead) {
remove_useless_late_inlines( &_late_inlines, dead);
remove_useless_late_inlines( &_string_late_inlines, dead);
remove_useless_late_inlines( &_boxing_late_inlines, dead);
remove_useless_late_inlines( &_vector_late_inlines, dead);
remove_useless_late_inlines(&_vector_reboxing_late_inlines, dead);

if (dead->is_CallStaticJava()) {
Expand Down Expand Up @@ -480,6 +481,7 @@ void Compile::disconnect_useless_nodes(Unique_Node_List& useful, Unique_Node_Lis
remove_useless_late_inlines( &_late_inlines, useful);
remove_useless_late_inlines( &_string_late_inlines, useful);
remove_useless_late_inlines( &_boxing_late_inlines, useful);
remove_useless_late_inlines( &_vector_late_inlines, useful);
remove_useless_late_inlines(&_vector_reboxing_late_inlines, useful);
DEBUG_ONLY(verify_graph_edges(true /*check for no_dead_code*/, root_and_safepoints);)
}
Expand Down Expand Up @@ -693,6 +695,7 @@ Compile::Compile(ciEnv* ci_env, ciMethod* target, int osr_bci,
_string_late_inlines(comp_arena(), 2, 0, nullptr),
_boxing_late_inlines(comp_arena(), 2, 0, nullptr),
_vector_reboxing_late_inlines(comp_arena(), 2, 0, nullptr),
_vector_late_inlines(comp_arena(), 2, 0, nullptr),
_late_inlines_pos(0),
_has_mh_late_inlines(false),
_oom(false),
Expand Down Expand Up @@ -2158,6 +2161,32 @@ void Compile::shuffle_late_inlines() {
shuffle_array(*C, _late_inlines);
}

void Compile::process_vector_late_inlines() {
for (int i = 0; i < _vector_late_inlines.length(); i++) {
CallGenerator* cg = _vector_late_inlines.at(i);
ciMethod* callee = cg->method();

// Skip fallback inlining for callees already compiled into large nmethods.
if (callee->has_compiled_code() &&
callee->inline_instructions_size() > InlineSmallCode) {
continue;
}

// When a vector intrinsic fails, set_generator(cg) caches the
// LateInlineVectorCallGenerator on the call node to allow retries
// if IGVN optimizes the call node's inputs. If the call node is not
// on the IGVN worklist when cleanup runs, CallStaticJavaNode::Ideal
// does not fire and the cached generator persists. Once _late_inlines
// drains and we commit to the fallback here, clear the stale generator
// to prevent a subsequent IGVN pass from re-registering the intrinsic
// attempt into _late_inlines alongside the fallback, which would create
// duplicate call_node entries.
cg->call_node()->as_CallJava()->set_generator(nullptr);
add_late_inline(cg);
}
_vector_late_inlines.clear();
}

// Perform incremental inlining until bound on number of live nodes is reached
void Compile::inline_incrementally(PhaseIterGVN& igvn) {
TracePhase tp(_t_incrInline);
Expand Down Expand Up @@ -2215,6 +2244,10 @@ void Compile::inline_incrementally(PhaseIterGVN& igvn) {
print_method(PHASE_INCREMENTAL_INLINE_STEP, 3);

if (failing()) return;

if (_late_inlines.length() == 0) {
process_vector_late_inlines();
}
}

igvn_worklist()->ensure_empty(); // should be done with igvn
Expand Down
7 changes: 7 additions & 0 deletions src/hotspot/share/opto/compile.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,7 @@ class Compile : public Phase {
GrowableArray<CallGenerator*> _boxing_late_inlines; // same but for boxing operations

GrowableArray<CallGenerator*> _vector_reboxing_late_inlines; // same but for vector reboxing operations
GrowableArray<CallGenerator*> _vector_late_inlines; // inline fallback implementation for failed intrinsics

int _late_inlines_pos; // Where in the queue should the next late inlining candidate go (emulate depth first inlining)
bool _has_mh_late_inlines; // Can there still be a method handle late inlining pending?
Expand Down Expand Up @@ -508,6 +509,12 @@ class Compile : public Phase {
InlinePrinter _inline_printer;

public:

// Appends cg to _vector_late_inlines, the list of fallback generators for
// failed vector intrinsics. The list is consumed and cleared by
// process_vector_late_inlines().
void add_vector_late_inline(CallGenerator* cg) {
_vector_late_inlines.push(cg);
}
void process_vector_late_inlines();

void* barrier_set_state() const { return _barrier_set_state; }

InlinePrinter* inline_printer() { return &_inline_printer; }
Expand Down
2 changes: 1 addition & 1 deletion src/hotspot/share/opto/doCall.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ CallGenerator* Compile::call_generator(ciMethod* callee, int vtable_index, bool
cg_intrinsic = cg;
cg = nullptr;
} else if (IncrementalInline && should_delay_vector_inlining(callee, jvms)) {
return CallGenerator::for_late_inline(callee, cg);
return CallGenerator::for_vector_late_inline(callee, cg);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't you apply regular inlining analysis here (resides in Compile::call_generator) to decide whether fallback implementation should be inlined or not?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

} else {
return cg;
}
Expand Down
6 changes: 3 additions & 3 deletions test/hotspot/jtreg/compiler/vectorapi/TestVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,16 +47,16 @@ public static void main(String[] args) {
public int call() { return 1; }

@Test
@IR(failOn = {IRNode.CMP_I, IRNode.CMOVE_I})
@IR(failOn = {IRNode.CMP_I, IRNode.CMOVE_I}, applyIf = {"IncrementalInlineVector", "false"})
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it mean that the rule is disabled unless the test is explicitly run with -XX:-IncrementalInlineVector?
I doubt it will be regularly executed in such mode. So, it defeats the purpose of the test, doesn't it?

Instead, why don't you explicitly run the test with -XX:-IncrementalInlineVector flag?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, I am now passing -XX:-IncrementalInlineVector to test invocation.

@IR(counts = {IRNode.VECTOR_TEST, "1"})
public int branch(long maskLong) {
var mask = VectorMask.fromLong(ByteVector.SPECIES_PREFERRED, maskLong);
return mask.allTrue() ? call() : 0;
}

@Test
@IR(failOn = {IRNode.CMP_I})
@IR(counts = {IRNode.VECTOR_TEST, "1", IRNode.CMOVE_I, "1"})
@IR(failOn = {IRNode.CMP_I}, applyIf = {"IncrementalInlineVector", "false"})
@IR(counts = {IRNode.VECTOR_TEST, "1", IRNode.CMOVE_I, "1"}, applyIf = {"IncrementalInlineVector", "false"})
public int cmove(long maskLong) {
var mask = VectorMask.fromLong(ByteVector.SPECIES_PREFERRED, maskLong);
return mask.allTrue() ? 1 : 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1294,7 +1294,7 @@ public static void testCompareMaskNotDoubleNegative() {
public static void main(String[] args) {
TestFramework testFramework = new TestFramework();
testFramework.setDefaultWarmup(5000)
.addFlags("--add-modules=jdk.incubator.vector")
.addFlags("--add-modules=jdk.incubator.vector", "-XX:InlineSmallCode=100000")
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should AbstractMask::intoArray() be marked w/ @ForceInline instead?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With @ForceInline on AbstractMask::intoArray, the test passes with "-ea -esa -XX:CompileThreshold=100 -XX:+UnlockExperimentalVMOptions -server -XX:-TieredCompilation" but fails with the default options due to a difference in inlining:


Failed IR Rules (1) of Methods (1)
----------------------------------
1) Method "compiler.vectorapi.VectorMaskCompareNotTest::testCompareNEMaskNotFloatNaN" - [Failed IR rules: 1]:
   * @IR rule 1: "@compiler.lib.ir_framework.IR(phase={DEFAULT}, applyIfPlatformAnd={}, applyIfCPUFeatureOr={"asimd", "true", "avx", "true", "rvv", "true"}, counts={"_#XOR_V_MASK#_", "= 0", "_#XOR_V#_", "= 0", "_#VECTOR_MASK_CMP#_", "= 2"}, failOn={}, applyIfPlatform={}, applyIfPlatformOr={}, applyIfOr={}, applyIfCPUFeatureAnd={}, applyIf={}, applyIfCPUFeature={}, applyIfAnd={}, applyIfNot={})"
     > Phase "PrintIdeal":
       - counts: Graph contains wrong number of nodes:
         * Constraint 3: "(\d+(\s){2}(VectorMaskCmp.*)+(\s){2}===.*)"
           - Failed comparison: [found] 0 = 2 [given]
           - No nodes matched!

With -XX:InlineSmallCode=1000000 it passes with all the configurations.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please, elaborate where it fails. Does func.apply(m).intoArray(mr, 0); in testCompareMaskNotFloat cause problems?

Copy link
Copy Markdown
Member Author

@jatin-bhateja jatin-bhateja May 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I investigated it further; here is my analysis:

Adding @ForceInline to AbstractMask::intoArray is desirable for vector intrinsic inlining, but it exposes a pre-existing bug in C2's switch profiling.

The bug is in Parse::do_tableswitch() in parse2.cpp: when a mature MDO has all-zero MultiBranchData counts, merge_ranges() marks every arm as never_reached, and jump_switch_ranges() collapses the entire switch to a single unstable_if trap. The parser should treat this as "no useful profile" (fall back to cnt = 1.0F), not "every arm is cold." I confirmed this analysis by passing -XX:-TieredCompilation or -XX:-UseSwitchProfiling — the test passes with either flag.

This profiling issue is orthogonal to the vector intrinsic late inlining work and should be addressed in a separate PR. For now, @ForceInline on AbstractMask::intoArray is not added and -XX:InlineSmallCode=1000000 is added to the failing test as a workaround

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the details. Hm, that doesn't sound right. There's no support for caller-sensitive profiling yet, so each method profile data is stored in a dedicated per-method MDO instance. (There are deoptimization counts which may depend on inlining, but regular branch counts should not be affected.) Anyway, let's continue investigating it separately.
Please, file a follow-up bug for it. Does -XX:-IncrementalInlineVector work as a workaround? I'm not fond of InlineSmallCode tweaks.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have filed a follow-up JBS issue for this: https://bugs.openjdk.org/browse/JDK-8385134

.start();
}
}