Skip to content

Commit

Permalink
VECT: Apply MASK_LEN_{LOAD_LANES, STORE_LANES} into vectorizer
Browse files Browse the repository at this point in the history
Hi, Richard and Richi.

This patch is adding MASK_LEN_{LOAD_LANES,STORE_LANES} support into vectorizer.

Consider this simple case:

void __attribute__ ((noinline, noclone))
foo (int *__restrict a, int *__restrict b, int *__restrict c,
	  int *__restrict d, int *__restrict e, int *__restrict f,
	  int *__restrict g, int *__restrict h, int *__restrict j, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i] = j[i * 8];
      b[i] = j[i * 8 + 1];
      c[i] = j[i * 8 + 2];
      d[i] = j[i * 8 + 3];
      e[i] = j[i * 8 + 4];
      f[i] = j[i * 8 + 5];
      g[i] = j[i * 8 + 6];
      h[i] = j[i * 8 + 7];
    }
}

RVV Gimple IR:

  _79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]);
  ivtmp_125 = _79 * 32;
  vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0);
  vect__8.9_122 = vect_array.8[0];
  vect__8.10_121 = vect_array.8[1];
  vect__8.11_120 = vect_array.8[2];
  vect__8.12_119 = vect_array.8[3];
  vect__8.13_118 = vect_array.8[4];
  vect__8.14_117 = vect_array.8[5];
  vect__8.15_116 = vect_array.8[6];
  vect__8.16_115 = vect_array.8[7];
  vect_array.8 ={v} {CLOBBER};
  ivtmp_114 = _79 * 4;
  .MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122);
  .MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121);
  .MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120);
  .MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119);
  .MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118);
  .MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117);
  .MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116);
  .MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115);

ASM:

foo:
	lw	t4,8(sp)
	ld	t5,0(sp)
	ble	t4,zero,.L5
.L3:
	vsetvli	t1,t4,e8,mf4,ta,ma
	vlseg8e32.v	v8,(t5)
	slli	t3,t1,2
	slli	t6,t1,5
	vse32.v	v8,0(a0)
	vse32.v	v9,0(a1)
	vse32.v	v10,0(a2)
	vse32.v	v11,0(a3)
	vse32.v	v12,0(a4)
	vse32.v	v13,0(a5)
	vse32.v	v14,0(a6)
	vse32.v	v15,0(a7)
	sub	t4,t4,t1
	add	t5,t5,t6
	add	a0,a0,t3
	add	a1,a1,t3
	add	a2,a2,t3
	add	a3,a3,t3
	add	a4,a4,t3
	add	a5,a5,t3
	add	a6,a6,t3
	add	a7,a7,t3
	bne	t4,zero,.L3
.L5:
	ret

The details of the approach:

Step 1 - Modify the LANES LOAD/STORE support functions (vect_load_lanes_supported/vect_store_lanes_supported):

+/* Return FN if vec_{masked_,mask_len_,}load_lanes is available for COUNT
+   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */

-bool
+internal_fn
 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
 			   bool masked_p)
 {
-  if (masked_p)
-    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
-					 vec_mask_load_lanes_optab,
-					 vectype, count);
+  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
+				    vec_mask_len_load_lanes_optab,
+				    vectype, count))
+    return IFN_MASK_LEN_LOAD_LANES;
+  else if (masked_p)
+    {
+      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
+					vec_mask_load_lanes_optab,
+					vectype, count))
+	return IFN_MASK_LOAD_LANES;
+    }
   else
-    return vect_lanes_optab_supported_p ("vec_load_lanes",
-					 vec_load_lanes_optab,
-					 vectype, count);
+    {
+      if (vect_lanes_optab_supported_p ("vec_load_lanes",
+					vec_load_lanes_optab,
+					vectype, count))
+	return IFN_LOAD_LANES;
+    }
+  return IFN_LAST;
 }

Instead of returning TRUE or FALSE depending on whether the target supports the LANES LOAD/STORE,
I changed it to return the internal_fn of the LANES LOAD/STORE that the target supports;
if the target doesn't support any LANES LOAD/STORE optab, it returns IFN_LAST.

Step 2 - Compute the IFN for LANES LOAD/STORE (only computed once).

      if (!STMT_VINFO_STRIDED_P (first_stmt_info)
	  && (can_overrun_p || !would_overrun_p)
	  && compare_step_with_zero (vinfo, stmt_info) > 0)
	{
	  /* First cope with the degenerate case of a single-element
	     vector.  */
	  if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U))
	    ;

	  else
	    {
	      /* Otherwise try using LOAD/STORE_LANES.  */
	      *lanes_ifn
		= vls_type == VLS_LOAD
		    ? vect_load_lanes_supported (vectype, group_size, masked_p)
		    : vect_store_lanes_supported (vectype, group_size,
						  masked_p);
	      if (*lanes_ifn != IFN_LAST)
		{
		  *memory_access_type = VMAT_LOAD_STORE_LANES;
		  overrun_p = would_overrun_p;
		}

	      /* If that fails, try using permuting loads.  */
	      else if (vls_type == VLS_LOAD
			 ? vect_grouped_load_supported (vectype,
							single_element_p,
							group_size)
			 : vect_grouped_store_supported (vectype, group_size))
		{
		  *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
		  overrun_p = would_overrun_p;
		}
	    }
	}

Step 3 - Build MASK_LEN_{LANES_LOAD,LANES_STORE} Gimple IR:

+	  if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
+	    {
+	      if (loop_lens)
+		final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					       ncopies, vectype, j, 1);
+	      else
+		final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
+	      signed char biasval
+		= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+	      bias = build_int_cst (intQI_type_node, biasval);
+	      if (!final_mask)
+		{
+		  mask_vectype = truth_type_for (vectype);
+		  final_mask = build_minus_one_cst (mask_vectype);
+		}
+	    }
+
 	  gcall *call;
-	  if (final_mask)
+	  if (final_len && final_mask)
+	    {
+	      /* Emit:
+		   MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
+					 LEN, BIAS, VEC_ARRAY).  */
+	      unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
+	      tree alias_ptr = build_int_cst (ref_type, align);
+	      call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
+						 dataref_ptr, alias_ptr,
+						 final_mask, final_len, bias,
+						 vec_array);
+	    }
+	  else if (final_mask)

The LEN and MASK flow is exactly the same as for the other MASK_LEN_* loads/stores.

gcc/ChangeLog:

	* internal-fn.cc (internal_load_fn_p): Apply
	MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer.
	(internal_store_fn_p): Ditto.
	(internal_fn_len_index): Ditto.
	(internal_fn_mask_index): Ditto.
	(internal_fn_stored_value_index): Ditto.
	* tree-vect-data-refs.cc (vect_store_lanes_supported): Ditto.
	(vect_load_lanes_supported): Ditto.
	* tree-vect-loop.cc: Ditto.
	* tree-vect-slp.cc (vect_slp_prefer_store_lanes_p): Ditto.
	* tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
	(get_group_load_store_type): Ditto.
	(vectorizable_store): Ditto.
	(vectorizable_load): Ditto.
	* tree-vectorizer.h (vect_store_lanes_supported): Ditto.
	(vect_load_lanes_supported): Ditto.
  • Loading branch information
zhongjuzhe authored and Incarnation-p-lee committed Aug 16, 2023
1 parent c6f65ce commit d5acdd6
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 63 deletions.
7 changes: 7 additions & 0 deletions gcc/internal-fn.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4578,6 +4578,7 @@ internal_load_fn_p (internal_fn fn)
case IFN_MASK_LOAD:
case IFN_LOAD_LANES:
case IFN_MASK_LOAD_LANES:
case IFN_MASK_LEN_LOAD_LANES:
case IFN_GATHER_LOAD:
case IFN_MASK_GATHER_LOAD:
case IFN_MASK_LEN_GATHER_LOAD:
Expand All @@ -4600,6 +4601,7 @@ internal_store_fn_p (internal_fn fn)
case IFN_MASK_STORE:
case IFN_STORE_LANES:
case IFN_MASK_STORE_LANES:
case IFN_MASK_LEN_STORE_LANES:
case IFN_SCATTER_STORE:
case IFN_MASK_SCATTER_STORE:
case IFN_MASK_LEN_SCATTER_STORE:
Expand Down Expand Up @@ -4672,6 +4674,8 @@ internal_fn_len_index (internal_fn fn)
case IFN_COND_LEN_NEG:
case IFN_MASK_LEN_LOAD:
case IFN_MASK_LEN_STORE:
case IFN_MASK_LEN_LOAD_LANES:
case IFN_MASK_LEN_STORE_LANES:
return 3;

default:
Expand All @@ -4689,8 +4693,10 @@ internal_fn_mask_index (internal_fn fn)
{
case IFN_MASK_LOAD:
case IFN_MASK_LOAD_LANES:
case IFN_MASK_LEN_LOAD_LANES:
case IFN_MASK_STORE:
case IFN_MASK_STORE_LANES:
case IFN_MASK_LEN_STORE_LANES:
case IFN_MASK_LEN_LOAD:
case IFN_MASK_LEN_STORE:
return 2;
Expand Down Expand Up @@ -4726,6 +4732,7 @@ internal_fn_stored_value_index (internal_fn fn)
return 4;

case IFN_MASK_LEN_STORE:
case IFN_MASK_LEN_STORE_LANES:
return 5;

default:
Expand Down
61 changes: 40 additions & 21 deletions gcc/tree-vect-data-refs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5438,22 +5438,31 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
return false;
}

/* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
of type VECTYPE. MASKED_P says whether the masked form is needed. */

/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
type VECTYPE. MASKED_P says whether the masked form is needed. */

bool
internal_fn
vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
bool masked_p)
{
if (masked_p)
return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
vec_mask_store_lanes_optab,
vectype, count);
if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
vec_mask_len_store_lanes_optab, vectype,
count))
return IFN_MASK_LEN_STORE_LANES;
else if (masked_p)
{
if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
vec_mask_store_lanes_optab, vectype,
count))
return IFN_MASK_STORE_LANES;
}
else
return vect_lanes_optab_supported_p ("vec_store_lanes",
vec_store_lanes_optab,
vectype, count);
{
if (vect_lanes_optab_supported_p ("vec_store_lanes",
vec_store_lanes_optab, vectype, count))
return IFN_STORE_LANES;
}
return IFN_LAST;
}


Expand Down Expand Up @@ -6056,21 +6065,31 @@ vect_grouped_load_supported (tree vectype, bool single_element_p,
return false;
}

/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
type VECTYPE. MASKED_P says whether the masked form is needed. */
/* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
of type VECTYPE. MASKED_P says whether the masked form is needed. */

bool
internal_fn
vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
bool masked_p)
{
if (masked_p)
return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
vec_mask_load_lanes_optab,
vectype, count);
if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
vec_mask_len_load_lanes_optab, vectype,
count))
return IFN_MASK_LEN_LOAD_LANES;
else if (masked_p)
{
if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
vec_mask_load_lanes_optab, vectype,
count))
return IFN_MASK_LOAD_LANES;
}
else
return vect_lanes_optab_supported_p ("vec_load_lanes",
vec_load_lanes_optab,
vectype, count);
{
if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
vectype, count))
return IFN_LOAD_LANES;
}
return IFN_LAST;
}

/* Function vect_permute_load_chain.
Expand Down
11 changes: 6 additions & 5 deletions gcc/tree-vect-loop.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2839,7 +2839,8 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
instructions record it and move on to the next instance. */
if (loads_permuted
&& SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
&& vect_store_lanes_supported (vectype, group_size, false))
&& vect_store_lanes_supported (vectype, group_size, false)
!= IFN_LAST)
{
FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
{
Expand All @@ -2848,9 +2849,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
/* Use SLP for strided accesses (or if we can't
load-lanes). */
if (STMT_VINFO_STRIDED_P (stmt_vinfo)
|| ! vect_load_lanes_supported
|| vect_load_lanes_supported
(STMT_VINFO_VECTYPE (stmt_vinfo),
DR_GROUP_SIZE (stmt_vinfo), false))
DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
break;
}

Expand Down Expand Up @@ -3153,7 +3154,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
unsigned int size = DR_GROUP_SIZE (vinfo);
tree vectype = STMT_VINFO_VECTYPE (vinfo);
if (! vect_store_lanes_supported (vectype, size, false)
if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
&& ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
&& ! vect_grouped_store_supported (vectype, size))
return opt_result::failure_at (vinfo->stmt,
Expand All @@ -3165,7 +3166,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
size = DR_GROUP_SIZE (vinfo);
vectype = STMT_VINFO_VECTYPE (vinfo);
if (! vect_load_lanes_supported (vectype, size, false)
if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
&& ! vect_grouped_load_supported (vectype, single_element_p,
size))
return opt_result::failure_at (vinfo->stmt,
Expand Down
2 changes: 1 addition & 1 deletion gcc/tree-vect-slp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3094,7 +3094,7 @@ vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
|| multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
return false;
return vect_store_lanes_supported (vectype, group_size, false);
return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
}

/* Analyze an SLP instance starting from a group of grouped stores. Call
Expand Down
Loading

0 comments on commit d5acdd6

Please sign in to comment.