Skip to content

Commit

Permalink
8252848: Optimize small primitive arrayCopy operations through partial inlining using AVX-512 masked instructions.
Browse files Browse the repository at this point in the history
  • Loading branch information
Jatin Bhateja committed Sep 13, 2020
1 parent 03a4df0 commit 1601fba
Show file tree
Hide file tree
Showing 27 changed files with 560 additions and 16 deletions.
39 changes: 39 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2589,6 +2589,38 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vect
emit_operand(dst, src);
}

// Generic masked vector load: emits the evmovdqu{b,w,l,q} form matching the
// element type.  The F2/F3 SIMD prefix together with EVEX.W select the variant:
//   T_BYTE            -> F2 W0 (vmovdqu8)
//   T_SHORT/T_CHAR    -> F2 W1 (vmovdqu16)
//   T_INT/T_FLOAT     -> F3 W0 (vmovdqu32)
//   T_LONG            -> F3 W1 (vmovdqu64)
// Only lanes whose opmask bit is set are loaded from memory.
// NOTE(review): T_DOUBLE would select the dword (W0) form here; callers appear
// to pass only integral element types — confirm before relying on T_DOUBLE.
void Assembler::evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type) {
  // The byte/word forms require AVX512BW; sub-512-bit widths require AVX512VL.
  assert(VM_Version::supports_avx512vlbw(), "");
  InstructionMark im(this);
  // W1 for 16-bit and 64-bit element variants (vmovdqu16 / vmovdqu64).
  bool wide = type == T_SHORT || type == T_LONG || type == T_CHAR;
  // F2 prefix for the byte/word (BW) variants, F3 for dword/qword.
  bool bwinstr = type == T_BYTE || type == T_SHORT || type == T_CHAR;
  InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  // Encode the opmask register into the EVEX prefix (EVEX.aaa field).
  attributes.set_embedded_opmask_register_specifier(mask);
  attributes.set_is_evex_instruction();
  int prefix = bwinstr ? VEX_SIMD_F2 : VEX_SIMD_F3;
  vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
  emit_int8(0x6F);  // opcode: MOVDQU load direction (reg <- mem)
  emit_operand(dst, src);
}

// Generic masked vector store: emits the evmovdqu{b,w,l,q} form matching the
// element type (see the load overload above for the F2/F3 x W0/W1 selection).
// Only lanes whose opmask bit is set are written to memory; masked-off
// destination memory is left untouched (merge semantics — see
// reset_is_clear_context below).
void Assembler::evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type) {
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(src != xnoreg, "sanity");
  InstructionMark im(this);
  // W1 for 16-bit and 64-bit element variants (vmovdqu16 / vmovdqu64).
  bool wide = type == T_SHORT || type == T_LONG || type == T_CHAR;
  // F2 prefix for the byte/word (BW) variants, F3 for dword/qword.
  bool bwinstr = type == T_BYTE || type == T_SHORT || type == T_CHAR;
  InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  // EVEX.z = 0: merge masking, so masked-off memory locations are preserved.
  attributes.reset_is_clear_context();
  attributes.set_embedded_opmask_register_specifier(mask);
  attributes.set_is_evex_instruction();
  int prefix = bwinstr ? VEX_SIMD_F2 : VEX_SIMD_F3;
  vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
  emit_int8(0x7F);  // opcode: MOVDQU store direction (mem <- reg)
  emit_operand(src, dst);
}

void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionMark im(this);
Expand Down Expand Up @@ -7803,6 +7835,13 @@ void Assembler::shlxq(Register dst, Register src1, Register src2) {
emit_int16((unsigned char)0xF7, (0xC0 | encode));
}

// SHRX (BMI2): dst = src1 >> (src2 & 63), logical right shift, flags untouched.
// VEX.F2 selects SHRX among the F7 shift group (66 = SHLX, F3 = SARX);
// vex_w = true gives the 64-bit operand-size form.
void Assembler::shrxq(Register dst, Register src1, Register src2) {
  assert(VM_Version::supports_bmi2(), "");
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
  int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes);
  emit_int16((unsigned char)0xF7, (0xC0 | encode));
}

#ifndef _LP64

void Assembler::incl(Register dst) {
Expand Down
8 changes: 7 additions & 1 deletion src/hotspot/cpu/x86/assembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -826,7 +826,6 @@ class Assembler : public AbstractAssembler {

void decl(Register dst);
void decl(Address dst);
void decq(Register dst);
void decq(Address dst);

void incl(Register dst);
Expand Down Expand Up @@ -911,6 +910,7 @@ class Assembler : public AbstractAssembler {
void popa_uncached();
#endif
void vzeroupper_uncached();
void decq(Register dst);

void pusha();
void popa();
Expand Down Expand Up @@ -1519,6 +1519,10 @@ class Assembler : public AbstractAssembler {
void evmovdquq(XMMRegister dst, Address src, int vector_len);
void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);

// Generic move instructions.
void evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type);
void evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type);

// Move lower 64bit to high 64bit in 128bit register
void movlhps(XMMRegister dst, XMMRegister src);

Expand Down Expand Up @@ -2021,6 +2025,8 @@ class Assembler : public AbstractAssembler {

void shlxl(Register dst, Register src1, Register src2);
void shlxq(Register dst, Register src1, Register src2);
void shrxq(Register dst, Register src1, Register src2);


//====================VECTOR ARITHMETIC=====================================

Expand Down
14 changes: 14 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1218,6 +1218,20 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
reduce_operation_256(opcode, vtmp2, vtmp2, src2);
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Emit code that materializes, in 'dst', a mask with the low 'len' bits set,
// later moved into a KRegister to drive AVX-512 masked loads/stores.
// Two strategies, chosen by the (VM-global) partial-inline copy size:
//  - len is a 64-bit register holding the element count; 'temp' is a scratch
//    register used only on the second path.
void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
  if (ArrayCopyPartialInlineSize <= 32) {
    // dst = (1 << len) - 1.  SHLX masks the shift count to 6 bits, so this is
    // only correct for len < 64 — guaranteed here because the copy length is
    // bounded by ArrayCopyPartialInlineSize (<= 32).
    mov64(dst, 1);
    shlxq(dst, dst, len);
    decq(dst);
  } else {
    // dst = -1 >> (64 - len), which also handles len == 64 (full mask).
    // NOTE(review): len == 0 would yield an all-ones mask (shift count 64
    // wraps to 0); assumes callers guarantee len > 0 — confirm.
    mov64(dst, -1);
    movq(temp, len);
    negptr(temp);
    addptr(temp, 64);   // temp = 64 - len
    shrxq(dst, dst, temp);
  }
}
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@
void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void genmask(Register dst, Register len, Register temp);
#endif // _LP64

// dst = reduce(op, src2) using vtmp as temps
Expand Down
27 changes: 26 additions & 1 deletion src/hotspot/cpu/x86/vm_version_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,8 @@ void VM_Version::get_processor_features() {
if (is_intel()) { // Intel cpus specific settings
if (is_knights_family()) {
_features &= ~CPU_VZEROUPPER;
_features &= ~CPU_AVX512BW;
_features &= ~CPU_AVX512VL;
}
}

Expand Down Expand Up @@ -1162,7 +1164,7 @@ void VM_Version::get_processor_features() {
#endif // COMPILER2 && ASSERT

if (!FLAG_IS_DEFAULT(AVX3Threshold)) {
if (!is_power_of_2(AVX3Threshold)) {
if (AVX3Threshold !=0 && !is_power_of_2(AVX3Threshold)) {
warning("AVX3Threshold must be a power of 2");
FLAG_SET_DEFAULT(AVX3Threshold, 4096);
}
Expand Down Expand Up @@ -1411,6 +1413,29 @@ void VM_Version::get_processor_features() {
MaxLoopPad = 11;
}
#endif // COMPILER2

if (FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize) ||
(!FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize) &&
ArrayCopyPartialInlineSize != 0 &&
ArrayCopyPartialInlineSize != 32 &&
ArrayCopyPartialInlineSize != 64)) {
int pi_size = 0;
if (MaxVectorSize > 32 && AVX3Threshold == 0) {
pi_size = 64;
} else if (MaxVectorSize >= 32) {
pi_size = 32;
}
if(!FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize)) {
warning("Setting ArrayCopyPartialInlineSize as %d", pi_size);
}
ArrayCopyPartialInlineSize = pi_size;
}

if (ArrayCopyPartialInlineSize > MaxVectorSize) {
ArrayCopyPartialInlineSize = MaxVectorSize;
warning("Setting ArrayCopyPartialInlineSize as MaxVectorSize");
}

if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
}
Expand Down
66 changes: 66 additions & 0 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1405,6 +1405,13 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_VectorMaskGen:
case Op_VectorMaskedLoad:
case Op_VectorMaskedStore:
if (UseAVX < 3) {
return false;
}
break;
#ifndef _LP64
case Op_AddReductionVF:
case Op_AddReductionVD:
Expand Down Expand Up @@ -1477,6 +1484,16 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false;
}
break;
case Op_VectorMaskGen:
case Op_VectorMaskedLoad:
case Op_VectorMaskedStore:
if (!VM_Version::supports_avx512bw()) {
return false;
}
if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_CMoveVD:
if (vlen != 4) {
return false; // implementation limitation (only vcmov4D_reg is present)
Expand Down Expand Up @@ -5444,3 +5461,52 @@ instruct vprorate(vec dst, vec src, vec shift) %{
ins_pipe( pipe_slow );
%}

#ifdef _LP64
// ---------------------------------- Masked Block Copy ------------------------------------

// Masked vector load: loads only the lanes whose bit is set in the GPR 'mask',
// using an AVX-512 opmask.  Element type and vector width come from this
// node's vector type.  NOTE: clobbers k2 (the mask GPR is copied into it);
// see the TODO about making KRegister a bound operand.
instruct vmasked_load64(vec dst, memory mem, rRegL mask) %{
  match(Set dst (VectorMaskedLoad mem mask));
  format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
  ins_encode %{
    BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
    int vector_len = vector_length_encoding(this);
    //TODO: KRegister to be made valid "bound" operand to promote sharing.
    __ kmovql(k2, $mask$$Register);
    __ evmovdqu($dst$$XMMRegister, k2, $mem$$Address, vector_len, elmType);
  %}
  ins_pipe( pipe_slow );
%}

// Generate a GPR mask with the low 'len' bits set, for a run-time length.
// Delegates to C2_MacroAssembler::genmask; 'tempLen' is scratch for the
// (64 - len) computation on the wide path.  TEMP_DEF keeps dst distinct
// from the inputs since genmask writes it before consuming len.
instruct vmask_gen(rRegL dst, rRegL len, rRegL tempLen) %{
  match(Set dst (VectorMaskGen len));
  effect(TEMP_DEF dst, TEMP tempLen);
  format %{ "vector_mask_gen $len \t! vector mask generator" %}
  ins_encode %{
    __ genmask($dst$$Register, $len$$Register, $tempLen$$Register);
  %}
  ins_pipe( pipe_slow );
%}

// Generate a GPR mask with the low 'len' bits set, for a compile-time
// constant length.  The mask is materialized directly as an immediate.
instruct vmask_gen_imm(rRegL dst, immL len) %{
  match(Set dst (VectorMaskGen len));
  format %{ "vector_mask_gen $len \t! vector mask generator" %}
  ins_encode %{
    // Compute (1 << len) - 1 without relying on a 6-bit shift count: the
    // previous expression ((1L << ($len$$constant & 63)) - 1) wrapped to a
    // zero mask when len == 64, instead of the required all-ones mask.
    uint64_t length = (uint64_t)$len$$constant;
    int64_t mask = (length >= 64) ? -1L : (int64_t)(((uint64_t)1 << length) - 1);
    __ mov64($dst$$Register, mask);
  %}
  ins_pipe( pipe_slow );
%}

// Masked vector store: writes only the lanes whose bit is set in the GPR
// 'mask'; masked-off memory is left untouched (merge semantics in
// Assembler::evmovdqu).  Element type and width are taken from the source
// vector operand, since this node's own type is memory.
// NOTE: clobbers k2; see the TODO about making KRegister a bound operand.
instruct vmasked_store64(memory mem, vec src, rRegL mask) %{
  match(Set mem (VectorMaskedStore mem (Binary src mask)));
  format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
  ins_encode %{
    const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
    BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
    int vector_len = vector_length_encoding(src_node);
    //TODO: KRegister to be made valid "bound" operand to promote sharing.
    __ kmovql(k2, $mask$$Register);
    __ evmovdqu($mem$$Address, k2, $src$$XMMRegister, vector_len, elmType);
  %}
  ins_pipe( pipe_slow );
%}
#endif // _LP64
2 changes: 2 additions & 0 deletions src/hotspot/share/adlc/forms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ Form::DataType Form::is_load_from_memory(const char *opType) const {
if( strcmp(opType,"LoadRange")==0 ) return Form::idealI;
if( strcmp(opType,"LoadS")==0 ) return Form::idealS;
if( strcmp(opType,"LoadVector")==0 ) return Form::idealV;
if( strcmp(opType,"VectorMaskedLoad")==0 ) return Form::idealV;
assert( strcmp(opType,"Load") != 0, "Must type Loads" );
return Form::none;
}
Expand All @@ -284,6 +285,7 @@ Form::DataType Form::is_store_to_memory(const char *opType) const {
if( strcmp(opType,"StoreN")==0) return Form::idealN;
if( strcmp(opType,"StoreNKlass")==0) return Form::idealNKlass;
if( strcmp(opType,"StoreVector")==0 ) return Form::idealV;
if( strcmp(opType,"VectorMaskedStore")==0 ) return Form::idealV;
assert( strcmp(opType,"Store") != 0, "Must type Stores" );
return Form::none;
}
Expand Down
7 changes: 4 additions & 3 deletions src/hotspot/share/adlc/formssel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,7 @@ bool InstructForm::captures_bottom_type(FormDict &globals) const {
!strcmp(_matrule->_rChild->_opType,"ShenandoahCompareAndExchangeP") ||
!strcmp(_matrule->_rChild->_opType,"ShenandoahCompareAndExchangeN") ||
#endif
!strcmp(_matrule->_rChild->_opType,"VectorMaskGen")||
!strcmp(_matrule->_rChild->_opType,"CompareAndExchangeP") ||
!strcmp(_matrule->_rChild->_opType,"CompareAndExchangeN"))) return true;
else if ( is_ideal_load() == Form::idealP ) return true;
Expand Down Expand Up @@ -3484,7 +3485,7 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const {
"StoreB","StoreC","Store" ,"StoreFP",
"LoadI", "LoadL", "LoadP" ,"LoadN", "LoadD" ,"LoadF" ,
"LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load" ,
"StoreVector", "LoadVector",
"StoreVector", "LoadVector", "VectorMaskedLoad", "VectorMaskedStore",
"LoadRange", "LoadKlass", "LoadNKlass", "LoadL_unaligned", "LoadD_unaligned",
"LoadPLocked",
"StorePConditional", "StoreIConditional", "StoreLConditional",
Expand Down Expand Up @@ -4168,8 +4169,8 @@ bool MatchRule::is_vector() const {
"RShiftVB","RShiftVS","RShiftVI","RShiftVL",
"URShiftVB","URShiftVS","URShiftVI","URShiftVL",
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
"RoundDoubleModeV","RotateLeftV" , "RotateRightV", "LoadVector","StoreVector",
"FmaVD", "FmaVF","PopCountVI",
"RoundDoubleModeV", "RotateLeftV" , "RotateRightV", "LoadVector","StoreVector",
"FmaVD", "FmaVF","PopCountVI","VectorMaskedLoad","VectorMaskedStore",
// Next are not supported currently.
"PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
"ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
Expand Down
29 changes: 29 additions & 0 deletions src/hotspot/share/opto/arraycopynode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ void ArrayCopyNode::connect_outputs(GraphKit* kit, bool deoptimize_on_exception)
kit->set_all_memory_call(this);
}


#ifndef PRODUCT
const char* ArrayCopyNode::_kind_names[] = {"arraycopy", "arraycopy, validated arguments", "clone", "oop array clone", "CopyOf", "CopyOfRange"};

Expand Down Expand Up @@ -670,13 +671,28 @@ bool ArrayCopyNode::may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTra
CallNode* call = NULL;
guarantee(c != NULL, "step_over_gc_barrier failed, there must be something to step to.");
if (c->is_Region()) {
PhiNode* phi = NULL;
for (uint i = 1; i < c->req(); i++) {
if (c->in(i) != NULL) {
Node* n = c->in(i)->in(0);
if (may_modify_helper(t_oop, n, phase, call)) {
ac = call->isa_ArrayCopy();
assert(c == mb->in(0), "only for clone");
return true;
} else if (n != NULL && n->is_Region() &&
(phi = n->as_Region()->has_phi()) &&
phi->in(1)->Opcode() == Op_VectorMaskedStore) {
return true;
} else {
for (DUIterator_Fast imax, i = c->fast_outs(imax); i < imax; i++) {
Node* phi = c->fast_out(i);
if (phi->is_Phi()) {
assert(phi->in(0) == c, "phi region validation");
if(phi->in(1) && phi->in(1)->Opcode() == Op_VectorMaskedStore) {
return true;
}
}
}
}
}
}
Expand Down Expand Up @@ -734,3 +750,16 @@ bool ArrayCopyNode::modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransf
}
return false;
}

// As an optimization, choose optimum vector size for copy length known at compile time.
int ArrayCopyNode::get_partial_inline_vector_lane_count(BasicType type, int con_len) {
int lane_count = ArrayCopyPartialInlineSize/type2aelembytes(type);
if (con_len > 0) {
int size_in_bytes = con_len * type2aelembytes(type);
if (size_in_bytes <= 16)
lane_count = 16/type2aelembytes(type);
else if (size_in_bytes > 16 && size_in_bytes <= 32)
lane_count = 32/type2aelembytes(type);
}
return lane_count;
}
3 changes: 3 additions & 0 deletions src/hotspot/share/opto/arraycopynode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,9 @@ class ArrayCopyNode : public CallNode {
bool has_negative_length_guard() const { return _has_negative_length_guard; }

static bool may_modify(const TypeOopPtr *t_oop, MemBarNode* mb, PhaseTransform *phase, ArrayCopyNode*& ac);

static int get_partial_inline_vector_lane_count(BasicType type, int con_len);

bool modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransform* phase, bool must_modify) const;

#ifndef PRODUCT
Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/share/opto/c2_globals.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@
"actual size could be less depending on elements type") \
range(0, max_jint) \
\
product(intx, ArrayCopyPartialInlineSize, -1, DIAGNOSTIC, \
"Partial inline size used for array copy acceleration.") \
range(-1, 64) \
\
product(bool, AlignVector, true, \
"Perform vector store/load alignment in loop") \
\
Expand Down
Loading

0 comments on commit 1601fba

Please sign in to comment.