Skip to content

MobileNetV4

MobileNetV4 backbone based on MobileNetV4 -- Universal Models for the Mobile Ecosystem.

Field list

Field Description
name (str) Name must be "mobilenetv4" to use MobileNetV4 backbone.
stage_params[n] case1: Conv2D (list) Build Conv2D layer under following format: ['conv', out_channels, kernel_size, stride].
stage_params[n] case2: FusedIB (list) Build FusedIB block under following format: ['fi', out_channels, hidden_channels, kernel_size, stride].
stage_params[n] case3: UniversalInvertedResidualBlock (list) Build UniversalInvertedResidualBlock block under following format: ['uir', out_channels, hidden_channels, extra_dw, extra_dw_kernel_size, middle_dw, middle_dw_kernel_size, stride].
stage_params[n] case4: MobileMultiQueryAttention2D (list) Build MobileMultiQueryAttention2D block under following format: ['mmqa', out_channels, attention_channel, num_attention_heads, query_pooling_stride, key_val_downsample, key_val_downsample_kernel_size, key_val_downsample_stride, stride].

Model configuration examples

MobileNetV4-conv-small
model:
  architecture:
    backbone:
      name: mobilenetv4
      params:
        stem_out_channel: 32
        stem_kernel_size: 3
        stem_stride: 2
        final_conv_out_channel: 960
        final_conv_kernel_size: 1
        final_conv_stride: 1
        norm_type: batch_norm
        act_type: relu
        return_stage_idx: ~
        layer_scale: 0.1
      stage_params:
        # Conv2D: ['conv', out_channels, kernel_size, stride]
        # FusedIB: ['fi', out_channels, hidden_channels, kernel_size, stride]
        # UniversalInvertedResidualBlock: ['uir', out_channels, hidden_channels, extra_dw, extra_dw_kernel_size, middle_dw, middle_dw_kernel_size, stride]
        # MobileMultiQueryAttention2D: ['mmqa', out_channels, attention_channel, num_attention_heads, query_pooling_stride, key_val_downsample, key_val_downsample_kernel_size, key_val_downsample_stride, stride]
        - 
          - ['conv', 32, 3, 2]
          - ['conv', 32, 1, 1]
        - 
          - ['conv', 96, 3, 2]
          - ['conv', 64, 1, 1]
        - 
          - ['uir', 96, 192, True, 5, True, 5, 2]
          - ['uir', 96, 192, False, ~, True, 3, 1]
          - ['uir', 96, 192, False, ~, True, 3, 1]
          - ['uir', 96, 192, False, ~, True, 3, 1]
          - ['uir', 96, 192, False, ~, True, 3, 1]
          - ['uir', 96, 384, True, 3, False, ~, 1]
        - 
          - ['uir', 128, 576, True, 3, True, 3, 2]
          - ['uir', 128, 512, True, 5, True, 5, 1]
          - ['uir', 128, 512, False, ~, True, 5, 1]
          - ['uir', 128, 384, False, ~, True, 5, 1]
          - ['uir', 128, 512, False, ~, True, 3, 1]
          - ['uir', 128, 512, False, ~, True, 3, 1]
MobileNetV4-conv-medium
model:
  architecture:
    backbone:
      name: mobilenetv4
      params:
        stem_out_channel: 32
        stem_kernel_size: 3
        stem_stride: 2
        final_conv_out_channel: 960
        final_conv_kernel_size: 1
        final_conv_stride: 1
        norm_type: batch_norm
        act_type: relu
        return_stage_idx: ~
        layer_scale: ~
      stage_params:
        # Conv2D: ['conv', out_channels, kernel_size, stride]
        # FusedIB: ['fi', out_channels, hidden_channels, kernel_size, stride]
        # UniversalInvertedResidualBlock: ['uir', out_channels, hidden_channels, extra_dw, extra_dw_kernel_size, middle_dw, middle_dw_kernel_size, stride]
        # MobileMultiQueryAttention2D: ['mmqa', out_channels, attention_channel, num_attention_heads, query_pooling_stride, key_val_downsample, key_val_downsample_kernel_size, key_val_downsample_stride, stride]
        - 
          - ['fi', 48, 128, 3, 2]
        - 
          - ['uir', 80, 192, True, 3, True, 5, 2]
          - ['uir', 80, 160, True, 3, True, 3, 1]
        - 
          - ['uir', 160, 480, True, 3, True, 5, 2]
          - ['uir', 160, 640, True, 3, True, 3, 1]
          - ['uir', 160, 640, True, 3, True, 3, 1]
          - ['uir', 160, 640, True, 3, True, 5, 1]
          - ['uir', 160, 640, True, 3, True, 3, 1]
          - ['uir', 160, 640, True, 3, False, ~, 1]
          - ['uir', 160, 320, False, ~, False, ~, 1]
          - ['uir', 160, 640, True, 3, False, ~, 1]
        - 
          - ['uir', 256, 960, True, 5, True, 5, 2]
          - ['uir', 256, 1024, True, 5, True, 5, 1]
          - ['uir', 256, 1024, True, 3, True, 5, 1]
          - ['uir', 256, 1024, True, 3, True, 5, 1]
          - ['uir', 256, 1024, False, ~, False, ~, 1]
          - ['uir', 256, 1024, True, 3, False, ~, 1]
          - ['uir', 256, 512, True, 3, True, 5, 1]
          - ['uir', 256, 1024, True, 5, True, 5, 1]
          - ['uir', 256, 1024, False, ~, False, ~, 1]
          - ['uir', 256, 1024, False, ~, False, ~, 1]
          - ['uir', 256, 512, True, 5, False, ~, 1]
MobileNetV4-conv-large
model:
  architecture:
    backbone:
      name: mobilenetv4
      params:
        stem_out_channel: 24
        stem_kernel_size: 3
        stem_stride: 2
        final_conv_out_channel: 960
        final_conv_kernel_size: 1
        final_conv_stride: 1
        norm_type: batch_norm
        act_type: relu
        return_stage_idx: ~
        layer_scale: ~
      stage_params:
        # Conv2D: ['conv', out_channels, kernel_size, stride]
        # FusedIB: ['fi', out_channels, hidden_channels, kernel_size, stride]
        # UniversalInvertedResidualBlock: ['uir', out_channels, hidden_channels, extra_dw, extra_dw_kernel_size, middle_dw, middle_dw_kernel_size, stride]
        # MobileMultiQueryAttention2D: ['mmqa', out_channels, attention_channel, num_attention_heads, query_pooling_stride, key_val_downsample, key_val_downsample_kernel_size, key_val_downsample_stride, stride]
        - 
          - ['fi', 48, 96, 3, 2]
        - 
          - ['uir', 96, 192, True, 3, True, 5, 2]
          - ['uir', 96, 384, True, 3, True, 3, 1]
        - 
          - ['uir', 192, 384, True, 3, True, 5, 2]
          - ['uir', 192, 768, True, 3, True, 3, 1]
          - ['uir', 192, 768, True, 3, True, 3, 1]
          - ['uir', 192, 768, True, 3, True, 3, 1]
          - ['uir', 192, 768, True, 3, True, 5, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['uir', 192, 768, True, 3, False, ~, 1]
        - 
          - ['uir', 512, 768, True, 5, True, 5, 2]
          - ['uir', 512, 2048, True, 5, True, 5, 1]
          - ['uir', 512, 2048, True, 5, True, 5, 1]
          - ['uir', 512, 2048, True, 5, True, 5, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['uir', 512, 2048, True, 5, True, 3, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['uir', 512, 2048, True, 5, True, 3, 1]
          - ['uir', 512, 2048, True, 5, True, 5, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
MobileNetV4-hybrid-medium
model:
  architecture:
    backbone:
      name: mobilenetv4
      params:
        stem_out_channel: 32
        stem_kernel_size: 3
        stem_stride: 2
        final_conv_out_channel: 960
        final_conv_kernel_size: 1
        final_conv_stride: 1
        norm_type: batch_norm
        act_type: relu
        return_stage_idx: ~
        layer_scale: 0.1
      stage_params:
        # Conv2D: ['conv', out_channels, kernel_size, stride]
        # FusedIB: ['fi', out_channels, hidden_channels, kernel_size, stride]
        # UniversalInvertedResidualBlock: ['uir', out_channels, hidden_channels, extra_dw, extra_dw_kernel_size, middle_dw, middle_dw_kernel_size, stride]
        # MobileMultiQueryAttention2D: ['mmqa', out_channels, attention_channel, num_attention_heads, query_pooling_stride, key_val_downsample, key_val_downsample_kernel_size, key_val_downsample_stride, stride]
        - 
          - ['fi', 48, 128, 3, 2]
        - 
          - ['uir', 80, 192, True, 3, True, 5, 2]
          - ['uir', 80, 160, True, 3, True, 3, 1]
        - 
          - ['uir', 160, 480, True, 3, True, 5, 2]
          - ['uir', 160, 320, False, ~, False, ~, 1]
          - ['uir', 160, 640, True, 3, True, 3, 1]
          - ['uir', 160, 640, True, 3, True, 5, 1]
          - ['mmqa', 160, 256, 4, ~, True, 3, 2, 1]
          - ['uir', 160, 640, True, 3, True, 3, 1]
          - ['mmqa', 160, 256, 4, ~, True, 3, 2, 1]
          - ['uir', 160, 640, True, 3, False, ~, 1]
          - ['mmqa', 160, 256, 4, ~, True, 3, 2, 1]
          - ['uir', 160, 640, True, 3, True, 3, 1]
          - ['mmqa', 160, 256, 4, ~, True, 3, 2, 1]
          - ['uir', 160, 640, True, 3, False, ~, 1]
        - 
          - ['uir', 256, 960, True, 5, True, 5, 2]
          - ['uir', 256, 1024, True, 5, True, 5, 1]
          - ['uir', 256, 1024, True, 3, True, 5, 1]
          - ['uir', 256, 1024, True, 3, True, 5, 1]
          - ['uir', 256, 512, False, ~, False, ~, 1]
          - ['uir', 256, 512, True, 3, True, 5, 1]
          - ['uir', 256, 512, False, ~, False, ~, 1]
          - ['uir', 256, 1024, False, ~, False, ~, 1]
          - ['mmqa', 256, 256, 4, ~, False, ~, ~, 1]
          - ['uir', 256, 1024, True, 3, False, ~, 1]
          - ['mmqa', 256, 256, 4, ~, False, ~, ~, 1]
          - ['uir', 256, 1024, True, 5, True, 5, 1]
          - ['mmqa', 256, 256, 4, ~, False, ~, ~, 1]
          - ['uir', 256, 1024, True, 5, False, ~, 1]
          - ['mmqa', 256, 256, 4, ~, False, ~, ~, 1]
          - ['uir', 256, 1024, True, 5, False, ~, 1]
MobileNetV4-hybrid-large
model:
  architecture:
    backbone:
      name: mobilenetv4
      params:
        stem_out_channel: 24
        stem_kernel_size: 3
        stem_stride: 2
        final_conv_out_channel: 960
        final_conv_kernel_size: 1
        final_conv_stride: 1
        norm_type: batch_norm
        act_type: gelu
        return_stage_idx: ~
        layer_scale: 0.1
      stage_params:
        # Conv2D: ['conv', out_channels, kernel_size, stride]
        # FusedIB: ['fi', out_channels, hidden_channels, kernel_size, stride]
        # UniversalInvertedResidualBlock: ['uir', out_channels, hidden_channels, extra_dw, extra_dw_kernel_size, middle_dw, middle_dw_kernel_size, stride]
        # MobileMultiQueryAttention2D: ['mmqa', out_channels, attention_channel, num_attention_heads, query_pooling_stride, key_val_downsample, key_val_downsample_kernel_size, key_val_downsample_stride, stride]
        - 
          - ['fi', 48, 96, 3, 2]
        - 
          - ['uir', 96, 192, True, 3, True, 5, 2]
          - ['uir', 96, 384, True, 3, True, 3, 1]
        - 
          - ['uir', 192, 384, True, 3, True, 5, 2]
          - ['uir', 192, 768, True, 3, True, 3, 1]
          - ['uir', 192, 768, True, 3, True, 3, 1]
          - ['uir', 192, 768, True, 3, True, 3, 1]
          - ['uir', 192, 768, True, 3, True, 5, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['mmqa', 192, 384, 8, ~, True, 3, 2, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['mmqa', 192, 384, 8, ~, True, 3, 2, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['mmqa', 192, 384, 8, ~, True, 3, 2, 1]
          - ['uir', 192, 768, True, 5, True, 3, 1]
          - ['mmqa', 192, 384, 8, ~, True, 3, 2, 1]
          - ['uir', 192, 768, True, 3, False, ~, 1]
        - 
          - ['uir', 512, 768, True, 5, True, 5, 2]
          - ['uir', 512, 2048, True, 5, True, 5, 1]
          - ['uir', 512, 2048, True, 5, True, 5, 1]
          - ['uir', 512, 2048, True, 5, True, 5, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['uir', 512, 2048, True, 5, True, 3, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['uir', 512, 2048, True, 5, True, 3, 1]
          - ['uir', 512, 2048, True, 5, True, 5, 1]
          - ['mmqa', 512, 512, 8, ~, False, ~, ~, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['mmqa', 512, 512, 8, ~, False, ~, ~, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['mmqa', 512, 512, 8, ~, False, ~, ~, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]
          - ['mmqa', 512, 512, 8, ~, False, ~, ~, 1]
          - ['uir', 512, 2048, True, 5, False, ~, 1]