Registered Extension Number




Extension and Version Dependencies


Extension Proposal

Other Extension Metadata

Last Modified Date


Interactions and External Dependencies
  • Yuchang Wang, Huawei

  • Juntao Li, Huawei

  • Pan Gao, Huawei

  • Jie Cao, Huawei

  • Yunjin Zhang, Huawei

  • Shujie Zhou, Huawei

  • Chaojun Wang, Huawei

  • Jiajun Hu, Huawei

  • Cong Zhang, Huawei


Cluster Culling Shader (CCS) is similar to the existing compute shader; its main purpose is to provide an execution environment in which coarse-level geometry culling and level-of-detail selection can be performed more efficiently on the GPU.

The traditional two-pass GPU culling solution using a compute shader needs a pipeline barrier between the compute pipeline and the graphics pipeline; sometimes, in order to optimize performance, an additional compaction process may also be required. This extension addresses these shortcomings by allowing a compute shader to emit visible clusters directly to the following graphics pipeline.

A set of new built-in output variables is used to describe a visible cluster. In addition, a new built-in function is used to emit these variables from the CCS to the input assembly (IA) stage; the IA stage can then use these variables to fetch the vertices of a visible cluster and drive the vertex shader to shade these vertices. Note that a CCS cannot be used at the same time as a geometry shader or a tessellation shader.

As stated above, both the IA stage and the vertex shader are preserved: the vertex shader is still used for vertex position shading, instead of a set of transformed vertices being output directly from the compute shader. This makes CCS more suitable for mobile GPUs.

New Commands

New Structures

New Enum Constants



  • Extending VkPipelineStageFlagBits2:


  • Extending VkQueryPipelineStatisticFlagBits:


  • Extending VkShaderStageFlagBits:


  • Extending VkStructureType:



New Built-In Variables

New SPIR-V Capability

Sample code

Example of cluster culling in a GLSL shader

#extension GL_HUAWEI_cluster_culling_shader: enable

#define GPU_WARP_SIZE                   32
#define GPU_GROUP_SIZE                  GPU_WARP_SIZE


// Number of threads per workgroup
// - 1D only
// - warpsize = 32
layout(local_size_x=GPU_GROUP_SIZE, local_size_y=1, local_size_z=1) in;

// Binding points of the storage buffers declared further down in this shader.
// NOTE(review): the cluster-descriptor and instance-descriptor bindings had no
// definition in this listing; 0 and 2 are assumed here and must match the
// application's descriptor set layout.
#define GPU_CLUSTER_DESCRIPTOR_BINDING      0
#define GPU_DRAW_BUFFER_BINDING             1
#define GPU_INSTANCE_DESCRIPTOR_BINDING     2

// pi / 2, used to clamp the cone angle in the front-face test.
const float pi_half = 1.5707963;

// Index of the instance that owns the cluster handled by this invocation.
// Written by main() and read by the culling helpers.
uint instance_id;

// Bounding sphere described by its center point and radius.
struct BoundingSphere
{
  vec3 center;
  float radius;
};

// Cone described by an axis direction (normal) and an angle; consumed by the
// front-face visibility test.
struct BoundingCone
{
  vec3 normal;
  float angle;
};

// Per-cluster culling data: bounding sphere, bounding cone and the index of the
// instance the cluster belongs to.
struct ClusterDescriptor
{
  BoundingSphere sphere;
  BoundingCone cone;
  uint instance_idx;
};

// Per-instance transform and view data used by the culling tests.
struct InstanceData
{
  mat4 mvp_matrix;                      // mvp matrix.
  vec4 frustum_planes[6];               // six frustum planes
  mat4 model_matrix_transpose_inverse;  // inverse transpose of model matrix.
  vec3 view_origin;                     // view origin
};

// Per-instance record: bookkeeping fields, the instance's bounding sphere
// (used for instance-level culling) and its InstanceData.
struct InstanceDescriptor
{
  uint begin;
  uint end;
  uint cluster_count;
  uint debug;
  BoundingSphere sphere;
  InstanceData instance_data;
};

// Per-cluster indexed draw parameters; main() copies these field by field into
// the gl_PerClusterHUAWEI outputs for clusters that survive culling.
struct DrawElementsCommand{
  uint indexcount;
  uint instanceCount;
  uint firstIndex;
  int  vertexoffset;
  uint firstInstance;
  uint cluster_id;
};

// indexed mode
// Built-in per-cluster output block: the values written here describe the
// draw emitted for a visible cluster.
out gl_PerClusterHUAWEI{
  uint gl_IndexCountHUAWEI;
  uint gl_InstanceCountHUAWEI;
  uint gl_FirstIndexHUAWEI;
  int  gl_VertexOffsetHUAWEI;
  uint gl_FirstInstanceHUAWEI;
  uint gl_ClusterIDHUAWEI;
};

// Read-only storage buffer holding one ClusterDescriptor per cluster.
layout(binding = GPU_CLUSTER_DESCRIPTOR_BINDING, std430) readonly buffer cluster_descriptor_ssbo
{
        ClusterDescriptor cluster_descriptors[];
};

// Storage buffer holding the draw parameters, indexed by cluster id.
layout(binding = GPU_DRAW_BUFFER_BINDING, std430) buffer draw_indirect_ssbo
{
        DrawElementsCommand draw_commands[];
};

// Storage buffer holding one InstanceDescriptor per instance.
layout(binding = GPU_INSTANCE_DESCRIPTOR_BINDING, std430) buffer instance_descriptor_ssbo
{
        InstanceDescriptor instance_descriptors[];
};

// Cone-based facing test for a cluster of the current instance (instance_id).
//
// sphere_center : center of the cluster's bounding sphere
// sphere_radius : radius of the bounding sphere (not used by this test)
// cone_normal   : axis of the cluster's bounding cone
// cone_angle    : cone angle; clamped to pi/2 before taking the sine
//
// Returns true when the dot product of the cone axis with the normalized
// direction towards the sphere center is below sin(cone_angle).
bool isFrontFaceVisible( vec3 sphere_center, float sphere_radius, vec3 cone_normal, float cone_angle )
{
  // NOTE(review): the second operand of this subtraction was truncated in the
  // listing; the instance's view origin is assumed - confirm.
  vec3 sphere_center_dir = normalize(sphere_center -
  instance_descriptors[instance_id].instance_data.view_origin);

  float sin_cone_angle = sin(min(cone_angle, pi_half));
  return dot(cone_normal, sphere_center_dir) < sin_cone_angle;
}

// Tests a bounding sphere against the six frustum planes of the current
// instance (instance_id).
//
// sphere_center : center of the sphere
// sphere_radius : radius of the sphere
//
// Returns true as soon as the sphere fails the test against any one plane.
bool isSphereOutsideFrustum( vec3 sphere_center, float sphere_radius )
{
  bool outside = false;

  for(int i = 0; i < 6; i++)
  {
      vec4 plane = instance_descriptors[instance_id].instance_data.frustum_planes[i];

      // Signed distance from the sphere center to the plane.
      float signed_distance = dot(plane.xyz, sphere_center) + plane.w;

      // NOTE(review): the right-hand operand of this comparison was truncated
      // in the listing. -sphere_radius assumes plane normals point into the
      // frustum (sphere entirely behind the plane) - confirm against the
      // published sample and the application's plane convention.
      outside = outside || (signed_distance < -sphere_radius);
  }
  return outside;
}

// Entry point: one invocation handles one cluster. The cluster's instance is
// frustum-culled first; if it survives, the cluster itself is frustum- and
// cone-culled, and a surviving cluster is emitted as a draw.
void main()
{
    uint cluster_id = gl_GlobalInvocationID.x;
    ClusterDescriptor desc = cluster_descriptors[cluster_id];

    // get instance description
    instance_id = desc.instance_idx;
    InstanceDescriptor inst_desc = instance_descriptors[instance_id];

    // instance based culling
    bool instance_render = !isSphereOutsideFrustum(inst_desc.sphere.center, inst_desc.sphere.radius);

    if (instance_render)
    {
        // cluster based culling
        bool render = !isSphereOutsideFrustum(desc.sphere.center, desc.sphere.radius) &&
                      isFrontFaceVisible(desc.sphere.center, desc.sphere.radius,
                                         desc.cone.normal, desc.cone.angle);

        if (render)
        {
            // this cluster passed coarse-level culling, update built-in output variable.
            // in case of indexed mode:
            gl_IndexCountHUAWEI     = draw_commands[cluster_id].indexcount;
            gl_InstanceCountHUAWEI  = draw_commands[cluster_id].instanceCount;
            gl_FirstIndexHUAWEI     = draw_commands[cluster_id].firstIndex;
            gl_VertexOffsetHUAWEI   = draw_commands[cluster_id].vertexoffset;
            gl_FirstInstanceHUAWEI  = draw_commands[cluster_id].firstInstance;
            gl_ClusterIDHUAWEI      = draw_commands[cluster_id].cluster_id;

            // emit built-in output variables as a drawing command to subsequent
            // rendering pipeline.
            dispatchClusterHUAWEI();
        }
    }
}

Example of graphics pipeline creation with cluster culling shader

// create a cluster culling shader stage info structure.
// sType and stage must be set explicitly: value-initialization leaves them zero,
// which is neither the shader-stage structure type nor a valid stage bit.
VkPipelineShaderStageCreateInfo ccsStageInfo{};
ccsStageInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
ccsStageInfo.stage = VK_SHADER_STAGE_CLUSTER_CULLING_BIT_HUAWEI;
ccsStageInfo.module = clustercullingshaderModule;
ccsStageInfo.pName = "main";

// pipeline shader stage creation
VkPipelineShaderStageCreateInfo shaderStages[] = { ccsStageInfo, vertexShaderStageInfo, fragmentShaderStageInfo };

// create graphics pipeline
VkGraphicsPipelineCreateInfo pipelineInfo{};
pipelineInfo.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO;
// Derive the count from the array so the two cannot drift apart.
pipelineInfo.stageCount = static_cast<uint32_t>(sizeof(shaderStages) / sizeof(shaderStages[0]));
pipelineInfo.pStages = shaderStages;
pipelineInfo.pVertexInputState = &vertexInputInfo;
// ...
VkPipeline graphicsPipeline = VK_NULL_HANDLE;
// The returned VkResult should be compared against VK_SUCCESS before
// graphicsPipeline is used.
vkCreateGraphicsPipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &graphicsPipeline);

Example of launching the execution of cluster culling shader

// Bind the graphics pipeline, then record the command that launches the
// cluster culling shader.
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_GRAPHICS, graphicsPipeline);

// Only the X dimension is sized by the caller; Y and Z are a single group each.
const uint32_t groupCountY = 1;
const uint32_t groupCountZ = 1;
vkCmdDrawClusterHUAWEI(commandBuffer, groupCountX, groupCountY, groupCountZ);

Version History

  • Revision 1, 2022-11-18 (YuChang Wang)

    • Internal revisions

See Also

Document Notes

For more information, see the Vulkan Specification.

This page is a generated document. Fixes and changes should be made to the generator scripts, not directly.

Copyright 2014-2023 The Khronos Group Inc.

SPDX-License-Identifier: CC-BY-4.0