Contributors
Ben Ashbaugh, Intel
Eugene Chereshnev, Intel
Junjie Gu, Intel
Bartosz Koscielak, Intel
Mike MacPherson, Intel
Ritesh Patel, Intel
Lukasz Towarek, Intel
Dependencies
This extension is written against the OpenCL 3.0 C Language specification, V3.0.10.
This extension requires support for subgroups.
This extension depends on cl_intel_required_subgroup_size
to query the subgroup sizes supported by a device or to require a subgroup size for a kernel.
Overview
The goal of this extension is to allow programmers to access specialized hardware to compute the product of an M x K matrix with a K x N matrix and then add an M x N matrix accumulation value. This is a commonly used building block to compute the product of two large matrices. When used in an OpenCL kernel, all work items in the subgroup cooperate to perform this operation.
This is a low-level extension for expert programmers seeking to access this functionality directly in custom kernels. Most users will access this functionality via high-level libraries or frameworks.
New OpenCL C Functions
// Subgroup matrix multiply-accumulate built-ins for a subgroup size of 8.
//
// These functions are available to devices where the minimum subgroup
// size is 8. For these devices, the subgroup size must be 8 (the
// minimum supported subgroup size). Calling these functions on other
// devices or from kernels with a different subgroup size is undefined
// behavior.
//
// For every function below:
//   * a is the M x K matrix operand, b is the K x N matrix operand, and
//     acc is the M x N accumulation value; the result is also M x N.
//   * M is the number of vector components of a, acc, and the return type.
//   * N is the subgroup size.
//   * K is given by the _k32 / _k16 suffix of the function name.
//   * The two type tags in the function name give the element types of
//     matrix a and matrix b, in that order.
// 8-bit integer matrices (K = 32, signed 32-bit integer accumulation):
int intel_sub_group_i8_i8_matrix_mad_k32(int a, int8 b, int acc); // M = 1
int2 intel_sub_group_i8_i8_matrix_mad_k32(int2 a, int8 b, int2 acc); // M = 2
int4 intel_sub_group_i8_i8_matrix_mad_k32(int4 a, int8 b, int4 acc); // M = 4
int8 intel_sub_group_i8_i8_matrix_mad_k32(int8 a, int8 b, int8 acc); // M = 8
int intel_sub_group_i8_u8_matrix_mad_k32(int a, uint8 b, int acc); // ...
int2 intel_sub_group_i8_u8_matrix_mad_k32(int2 a, uint8 b, int2 acc);
int4 intel_sub_group_i8_u8_matrix_mad_k32(int4 a, uint8 b, int4 acc);
int8 intel_sub_group_i8_u8_matrix_mad_k32(int8 a, uint8 b, int8 acc);
int intel_sub_group_u8_i8_matrix_mad_k32(uint a, int8 b, int acc);
int2 intel_sub_group_u8_i8_matrix_mad_k32(uint2 a, int8 b, int2 acc);
int4 intel_sub_group_u8_i8_matrix_mad_k32(uint4 a, int8 b, int4 acc);
int8 intel_sub_group_u8_i8_matrix_mad_k32(uint8 a, int8 b, int8 acc);
int intel_sub_group_u8_u8_matrix_mad_k32(uint a, uint8 b, int acc);
int2 intel_sub_group_u8_u8_matrix_mad_k32(uint2 a, uint8 b, int2 acc);
int4 intel_sub_group_u8_u8_matrix_mad_k32(uint4 a, uint8 b, int4 acc);
int8 intel_sub_group_u8_u8_matrix_mad_k32(uint8 a, uint8 b, int8 acc);
// bfloat16 matrices (K = 16, 32-bit floating-point accumulation).
// The bfloat16 data is passed in signed integer types:
float intel_sub_group_bf16_bf16_matrix_mad_k16(int a, int8 b, float acc);
float2 intel_sub_group_bf16_bf16_matrix_mad_k16(int2 a, int8 b, float2 acc);
float4 intel_sub_group_bf16_bf16_matrix_mad_k16(int4 a, int8 b, float4 acc);
float8 intel_sub_group_bf16_bf16_matrix_mad_k16(int8 a, int8 b, float8 acc);
// fp16 matrices (K = 16, 32-bit floating-point accumulation).
// The fp16 data is passed in signed integer types:
float intel_sub_group_f16_f16_matrix_mad_k16(int a, int8 b, float acc);
float2 intel_sub_group_f16_f16_matrix_mad_k16(int2 a, int8 b, float2 acc);
float4 intel_sub_group_f16_f16_matrix_mad_k16(int4 a, int8 b, float4 acc);
float8 intel_sub_group_f16_f16_matrix_mad_k16(int8 a, int8 b, float8 acc);
// Subgroup matrix multiply-accumulate built-ins for a subgroup size of 16.
//
// These functions are available to devices where the minimum subgroup
// size is 16. For these devices, the subgroup size must be 16 (the
// minimum supported subgroup size). Calling these functions on other
// devices or from kernels with a different subgroup size is undefined
// behavior.
//
// These overloads take the a operand as short / ushort scalars or
// vectors. As for the other overloads, M is the number of vector
// components of a, acc, and the return type, N is the subgroup size,
// and K is given by the _k32 / _k16 suffix of the function name.
// NOTE(review): with 16 work items, each 16-bit component of a
// presumably carries K / 16 matrix elements per row -- confirm.
// 8-bit integer matrices (K = 32, signed 32-bit integer accumulation):
int intel_sub_group_i8_i8_matrix_mad_k32(short a, int8 b, int acc); // M = 1
int2 intel_sub_group_i8_i8_matrix_mad_k32(short2 a, int8 b, int2 acc); // M = 2
int4 intel_sub_group_i8_i8_matrix_mad_k32(short4 a, int8 b, int4 acc); // M = 4
int8 intel_sub_group_i8_i8_matrix_mad_k32(short8 a, int8 b, int8 acc); // M = 8
int intel_sub_group_i8_u8_matrix_mad_k32(short a, uint8 b, int acc); // ...
int2 intel_sub_group_i8_u8_matrix_mad_k32(short2 a, uint8 b, int2 acc);
int4 intel_sub_group_i8_u8_matrix_mad_k32(short4 a, uint8 b, int4 acc);
int8 intel_sub_group_i8_u8_matrix_mad_k32(short8 a, uint8 b, int8 acc);
int intel_sub_group_u8_i8_matrix_mad_k32(ushort a, int8 b, int acc);
int2 intel_sub_group_u8_i8_matrix_mad_k32(ushort2 a, int8 b, int2 acc);
int4 intel_sub_group_u8_i8_matrix_mad_k32(ushort4 a, int8 b, int4 acc);
int8 intel_sub_group_u8_i8_matrix_mad_k32(ushort8 a, int8 b, int8 acc);
int intel_sub_group_u8_u8_matrix_mad_k32(ushort a, uint8 b, int acc);
int2 intel_sub_group_u8_u8_matrix_mad_k32(ushort2 a, uint8 b, int2 acc);
int4 intel_sub_group_u8_u8_matrix_mad_k32(ushort4 a, uint8 b, int4 acc);
int8 intel_sub_group_u8_u8_matrix_mad_k32(ushort8 a, uint8 b, int8 acc);
// bfloat16 matrices (K = 16, 32-bit floating-point accumulation).
// The bfloat16 data is passed in signed integer types:
float intel_sub_group_bf16_bf16_matrix_mad_k16(short a, int8 b, float acc);
float2 intel_sub_group_bf16_bf16_matrix_mad_k16(short2 a, int8 b, float2 acc);
float4 intel_sub_group_bf16_bf16_matrix_mad_k16(short4 a, int8 b, float4 acc);
float8 intel_sub_group_bf16_bf16_matrix_mad_k16(short8 a, int8 b, float8 acc);
// fp16 matrices (K = 16, 32-bit floating-point accumulation).
// The fp16 data is passed in signed integer types:
float intel_sub_group_f16_f16_matrix_mad_k16(short a, int8 b, float acc);
float2 intel_sub_group_f16_f16_matrix_mad_k16(short2 a, int8 b, float2 acc);
float4 intel_sub_group_f16_f16_matrix_mad_k16(short4 a, int8 b, float4 acc);
float8 intel_sub_group_f16_f16_matrix_mad_k16(short8 a, int8 b, float8 acc);
Modifications to the OpenCL C Specification
Add a new Section 6.13.X - Subgroup Matrix Multiply Accumulate Instructions
This section describes a family of built-in functions that multiply two matrix sources `a` and `b` and then add a matrix accumulation value to produce a matrix result value.
`a` is the first matrix operand and has M rows and K columns.
`b` is the second matrix operand and has K rows and N columns.
`acc` is the matrix accumulation value and has M rows and N columns.
The result value also has M rows and N columns.
All work items in the subgroup cooperate to perform this operation.
These functions must be encountered by all work items in the subgroup executing the kernel.
The dimensions of the two source matrices and the elements of each source matrix are described by the built-in function name and its arguments.
As an example, given the function:
int2 intel_sub_group_u8_i8_matrix_mad_k32(uint2 a, int8 b, int2 acc);

`a` is the first source matrix operand and has `M` rows and `K` columns.
The value for `M` is determined by the number of vector components in the source operand `a`. In the example above, `a` is a `uint2` argument, therefore the matrix `a` operand has `M` equal to 2 rows.
The value of `K` is described by the function name. In this case, the value of `K` is 32, therefore the matrix `a` operand has `K` equal to 32 columns.
The matrix component data type is also described by the function name. In this case, the matrix `a` component data type is `u8`, indicating that the elements of the matrix `a` operand are unsigned 8-bit integers.
Each work item contributes part of this matrix. In this case, since the elements of the matrix `a` are 8-bit integers, and since each work item is contributing 32 bits (the size of a `uint`) of data per row of this matrix, each work item is contributing four 8-bit integer values per row.
Since `K` is 32, and each work item is contributing four 8-bit values per row, the number of work items in the subgroup must be equal to 8.


`b` is the second source matrix operand and has `K` rows and `N` columns.
Each work item contributes one column of this matrix. Therefore, the number of columns `N` is equivalent to the subgroup size.
As above, the value of `K` is described by the function name. In this case, the value of `K` is 32, therefore the matrix `b` operand has `K` equal to 32 rows.
As above, the matrix component data type is described by the function name. In this case, the matrix `b` component data type is `i8`, indicating that the elements of the matrix `b` operand are signed 8-bit integers.
Since `K` is 32 and the elements of the matrix `b` are 8-bit integers, each work item must contribute 256 bits of source data to contribute `K` values. The 256 bits of source data are packed and passed as the `int8` argument `b`.


`acc` specifies the accumulation value and has `M` rows and `N` columns.
As above, the value of `M` is determined by the number of components in the source operand `acc`. In the example above, `acc` is an `int2` argument, therefore the accumulation value operand has `M` equal to 2 rows.
Since both `a` and `acc` specify operands with `M` rows, and since the value of `M` is determined by the number of components in the source operand, both the `a` and `acc` operands will be vector operands with the same number of components.
As above, each work item contributes one column of accumulation values. Therefore, the number of columns `N` is equivalent to the subgroup size.
The `acc` operand is a "full precision" accumulation value. In the example above, the matrices contain integer data, therefore the `acc` operand is a vector of `int` data.


The result value returned by the function also has `M` rows and `N` columns.
As above, the value of `M` is determined by the number of components in the return type. In the example above, the return type is `int2`, therefore the result value has `M` equal to 2 rows.
Since the result value, `a`, and `acc` all specify values with `M` rows, and since the value of `M` is determined by the number of components in the source operand or return type, the return type, `a`, and `acc` will all be vectors with the same number of components.
As above, each work item will receive one column of result values. Therefore, the number of columns `N` is equivalent to the subgroup size.
Similar to the `acc` operand, the return value is a "full precision" result value. In the example above, the matrices contain integer data, therefore the return type is a vector of `int` data.

The full list of supported functions is described in the overview, above. For this list of functions:

`M` may be equal to 1, 2, 4, or 8.
`N` must be equal to 8 for some devices or 16 for other devices. In other words, the only supported subgroup sizes are 8 and 16.
Supported integer matrix types for `a` and `b` are any combination of signed or unsigned 8-bit integers. For these integer matrix types, the accumulation value `acc` and result value are signed 32-bit integers, and `K` must be equal to 32.
The supported floating-point matrix types for `a` and `b` are fp16 (half) or bfloat16. For these floating-point matrix types, the accumulation value `acc` and result value are 32-bit floating-point values, and `K` must be equal to 16.
Coding Sample
// The code below shows a functional implementation of one of the
// built-in functions added by this extension. For this built-in
// function:
// * M = 2, since the result value, a operand, and acc operand
//   are all vectors with two components.
// * N = 8, and is equal to the subgroup size.
// * K = 32, as described by the function name.
// * The elements of both matrix a and matrix b are signed 8-bit
//   integers.
// Helper: multiplies two four-component vectors of 8-bit integer data
// component by component, sums the four products, and adds the 32-bit
// integer accumulation value acc to that sum.
static int __intel_dot_product_accumulate( char4 a, char4 b, int acc )
{
    // Products are summed in x, y, z, w order; acc is added last.
    int sum = a.x * b.x;
    sum += a.y * b.y;
    sum += a.z * b.z;
    sum += a.w * b.w;
    return sum + acc;
}
// Helper: computes the product of a 1 x 32 row vector, whose pieces
// are shared across the subgroup in v, with the 32 x 1 column vector
// b held by this work item, and adds the result to the full precision
// accumulation value acc.
static int __intel_vector_matrix_multiply_accumulate_k32( int v, int8 b, int acc )
{
    // The subgroup size is 8 and K is 32, so every work item holds
    // 4 elements of the 1 x K row vector packed into v. Collect the
    // piece held by each of the work items 0 through 7.
    int8 row;
    row.s0 = sub_group_broadcast( v, 0 );
    row.s1 = sub_group_broadcast( v, 1 );
    row.s2 = sub_group_broadcast( v, 2 );
    row.s3 = sub_group_broadcast( v, 3 );
    row.s4 = sub_group_broadcast( v, 4 );
    row.s5 = sub_group_broadcast( v, 5 );
    row.s6 = sub_group_broadcast( v, 6 );
    row.s7 = sub_group_broadcast( v, 7 );

    // as_char4() reinterprets 32 bits of data as four components of
    // 8-bit data. The eight partial dot products are accumulated in
    // order, starting from acc.
    int sum = acc;
    sum = __intel_dot_product_accumulate( as_char4( row.s0 ), as_char4( b.s0 ), sum );
    sum = __intel_dot_product_accumulate( as_char4( row.s1 ), as_char4( b.s1 ), sum );
    sum = __intel_dot_product_accumulate( as_char4( row.s2 ), as_char4( b.s2 ), sum );
    sum = __intel_dot_product_accumulate( as_char4( row.s3 ), as_char4( b.s3 ), sum );
    sum = __intel_dot_product_accumulate( as_char4( row.s4 ), as_char4( b.s4 ), sum );
    sum = __intel_dot_product_accumulate( as_char4( row.s5 ), as_char4( b.s5 ), sum );
    sum = __intel_dot_product_accumulate( as_char4( row.s6 ), as_char4( b.s6 ), sum );
    sum = __intel_dot_product_accumulate( as_char4( row.s7 ), as_char4( b.s7 ), sum );
    return sum;
}
// Functional implementation of the M = 2 variant: each of the two
// components of a and acc is one matrix row, and each row is
// multiplied with b and accumulated independently.
int2 intel_sub_group_i8_i8_matrix_mad_k32(int2 a, int8 b, int2 acc)
{
    // First row, then second row.
    const int row0 = __intel_vector_matrix_multiply_accumulate_k32( a.s0, b, acc.s0 );
    const int row1 = __intel_vector_matrix_multiply_accumulate_k32( a.s1, b, acc.s1 );
    return (int2)( row0, row1 );
}
Issues
None.

Should this extension use signed or unsigned types to represent fp16 and bf16 data?
RESOLVED: This extension will use signed types to represent fp16 and bf16 data even though this is inconsistent with other extensions such as cl_intel_bfloat16_conversions. This inconsistency may be addressed in a future extension or in a future version of this extension. Applications are encouraged to use `as_type` to reinterpret unsigned data as signed data as needed to use the functions added by this extension.