[vectorIntrinsics+mask] RFR: 8264563: Add masked vector intrinsics for binary/store operations [v2]
Xiaohong Gong
xgong at openjdk.java.net
Wed Apr 7 09:04:38 UTC 2021
On Wed, 7 Apr 2021 06:54:30 GMT, Xiaohong Gong <xgong at openjdk.org> wrote:
>> src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template line 694:
>>
>>> 692: $abstractvectortype$ hi = this.lanewise(LSHL, (op == ROR) ? neg : that);
>>> 693: $abstractvectortype$ lo = this.lanewise(LSHR, (op == ROR) ? that : neg);
>>> 694: return m != null ? blend(hi.lanewise(OR, lo), m) : hi.lanewise(OR, lo);
>>
>> Checking for m!=null would cause boxing here.
>
> Yeah, that's true. How about handling the difference separately in the masked/non-masked `lanewise` method?
My initial thought is to add a new abstract method, e.g. `lanewise0()`. It does the same transformations for the special opcodes and then directly calls the intrinsic method `binaryMaskOp`. The handling of the special ops that differs between the masked and non-masked cases is done separately in each `lanewise` method.
The code might look like:
/**
 * {@inheritDoc}
 * @see #lanewise(VectorOperators.Binary,byte)
 * @see #lanewise(VectorOperators.Binary,byte,VectorMask)
 */
@Override
@ForceInline
public final
ByteVector lanewise(VectorOperators.Binary op,
                    Vector<Byte> v) {
    ByteVector that = (ByteVector) v;
    that.check(this);
    if (op == ROR || op == ROL) { // FIXME: JIT should do this
        // A rotate is synthesized from two opposite shifts, one by the
        // count and one by the negated count, whose results are OR-ed.
        ByteVector neg = that.lanewise(NEG);
        ByteVector hi = this.lanewise(LSHL, (op == ROR) ? neg : that);
        ByteVector lo = this.lanewise(LSHR, (op == ROR) ? that : neg);
        return hi.lanewise(OR, lo);
    } else if (op == DIV) {
        // Without a mask every lane is active, so a zero divisor in any
        // lane is reported before the operation is dispatched.
        VectorMask<Byte> eqz = that.eq((byte)0);
        if (eqz.anyTrue()) {
            throw that.divZeroException();
        }
    }
    // The non-masked form forwards a null mask to the shared hook.
    return lanewise0(op, that, null);
}
/**
 * {@inheritDoc}
 * @see #lanewise(VectorOperators.Binary,byte,VectorMask)
 */
@Override
@ForceInline
public final
ByteVector lanewise(VectorOperators.Binary op,
                    Vector<Byte> v,
                    VectorMask<Byte> m) {
    ByteVector that = (ByteVector) v;
    that.check(this);
    if (op == ROR || op == ROL) {
        // Rotates are computed by the non-masked overload and the result
        // is then blended into this vector under the mask.
        return blend(lanewise(op, v), m);
    } else if (op == DIV) {
        VectorMask<Byte> eqz = that.eq((byte)0);
        // Only a zero divisor in a lane selected by m is an error.
        if (eqz.and(m).anyTrue()) {
            throw that.divZeroException();
        }
        // suppress div/0 exceptions in unset lanes
        // (any zero divisor left at this point sits in a lane that m does
        // not select; NOT rewrites those zero lanes to a non-zero value)
        that = that.lanewise(NOT, eqz);
    }
    return lanewise0(op, that, m);
}
/**
 * Hook that both {@code lanewise} overloads call once their
 * operator-specific preprocessing (rotate expansion, division-by-zero
 * checks) is done.
 *
 * NOTE(review): presumably each concrete vector class implements this by
 * delegating to {@code lanewise0Template} with its own mask class -- confirm.
 *
 * @param op the binary operation to apply lane-wise
 * @param v  the second input vector
 * @param m  the mask controlling lane selection; the non-masked
 *           {@code lanewise} overload passes {@code null}
 * @return the result of applying the operation
 */
protected abstract
ByteVector lanewise0(VectorOperators.Binary op,
Vector<Byte> v,
VectorMask<Byte> m);
/**
 * Rewrites the special and shift operators into plain opcodes and then
 * dispatches to {@code VectorSupport.binaryMaskOp}.
 *
 * NOTE(review): presumably this is the shared body that concrete
 * {@code lanewise0} implementations delegate to, and the lambdas registered
 * through {@code BIN_MASK_IMPL} are the Java fallback used when the
 * operation is not intrinsified -- confirm.
 *
 * @param op       the binary operation to apply lane-wise
 * @param maskType the mask class handed to {@code binaryMaskOp}
 * @param v        the second input vector; cast to {@code ByteVector}
 * @param m        the mask handed to {@code binaryMaskOp} and to the
 *                 per-opcode lambdas
 * @return the vector returned by {@code binaryMaskOp}
 */
@ForceInline
final
ByteVector lanewise0Template(VectorOperators.Binary op,
                             Class<? extends VectorMask<Byte>> maskType,
                             Vector<Byte> v, VectorMask<Byte> m) {
    ByteVector rhs = (ByteVector) v;
    // Work on a local copy so the incoming parameter is never reassigned.
    VectorOperators.Binary effectiveOp = op;
    if (opKind(effectiveOp, VO_SPECIAL | VO_SHIFT)) {
        if (effectiveOp == FIRST_NONZERO) {
            // FIXME: Support this in the JIT.
            // Zero the right-hand lanes wherever this vector is already
            // non-zero; a plain OR then yields the first non-zero value.
            VectorMask<Byte> nonZeroLanes
                = this.viewAsIntegralLanes().compare(NE, (byte) 0);
            rhs = rhs.blend((byte) 0, nonZeroLanes.cast(vspecies()));
            effectiveOp = OR_UNCHECKED;
        }
        if (opKind(effectiveOp, VO_SHIFT)) {
            // As per shift specification for Java, mask the shift count.
            // This allows the JIT to ignore some ISA details.
            rhs = rhs.lanewise(AND, SHIFT_MASK);
        }
        if (effectiveOp == AND_NOT) {
            // FIXME: Support this in the JIT.
            // a AND_NOT b is expressed as a AND (NOT b).
            rhs = rhs.lanewise(NOT);
            effectiveOp = AND;
        }
    }
    final int opcode = opCode(effectiveOp);
    return VectorSupport.binaryMaskOp(
        opcode, getClass(), maskType, byte.class, length(),
        this, rhs, m,
        BIN_MASK_IMPL.find(effectiveOp, opcode, (code) -> {
            // One lambda per supported opcode; unknown opcodes map to null.
            switch (code) {
                case VECTOR_OP_ADD:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, q) -> (byte)(p + q));
                case VECTOR_OP_SUB:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, q) -> (byte)(p - q));
                case VECTOR_OP_MUL:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, q) -> (byte)(p * q));
                case VECTOR_OP_DIV:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, q) -> (byte)(p / q));
                case VECTOR_OP_MAX:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, q) -> (byte)Math.max(p, q));
                case VECTOR_OP_MIN:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, q) -> (byte)Math.min(p, q));
                case VECTOR_OP_AND:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, q) -> (byte)(p & q));
                case VECTOR_OP_OR:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, q) -> (byte)(p | q));
                case VECTOR_OP_XOR:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, q) -> (byte)(p ^ q));
                case VECTOR_OP_LSHIFT:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, count) -> (byte)(p << count));
                case VECTOR_OP_RSHIFT:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, count) -> (byte)(p >> count));
                case VECTOR_OP_URSHIFT:
                    return (x, y, mask) ->
                        x.bOp(y, mask, (lane, p, count) -> (byte)((p & LSHR_SETUP_MASK) >>> count));
                default:
                    return null;
            }
        }));
}
An additional benefit for the masked DIV is that, with this change, it needs only one division-by-zero check, on the active elements, whereas the original (`blend`-based) code checks twice: it first checks the active elements (in the masked `lanewise`) and then checks all the elements (in the non-masked `lanewise` that it calls), which I think is redundant.
Does it look OK to you? Thanks!
Best Regards,
Xiaohong
-------------
PR: https://git.openjdk.java.net/panama-vector/pull/57
More information about the panama-dev
mailing list