from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional, Union
from netspresso.np_qai.options.common import CommonOptions
class TfliteDelegates(str, Enum):
QNN = "qnn"
QNN_GPU = "qnn-gpu"
NNAPI = "nnapi"
NNAPI_GPU = "nnapi-gpu"
GPU = "gpu"
XNNPACK = "xnnpack"
class ExecutionMode(str, Enum):
SEQUENTIAL = "SEQUENTIAL"
PARALLEL = "PARALLEL"
class GraphOptimizationLevel(str, Enum):
DISABLE_ALL = "DISABLE_ALL"
ENABLE_BASIC = "ENABLE_BASIC"
ENABLE_EXTENDED = "ENABLE_EXTENDED"
ENABLE_ALL = "ENABLE_ALL"
class OnnxExecutionProviders(str, Enum):
QNN = "qnn"
QNN_GPU = "qnn-gpu"
DIRECTML = "directml"
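# NOTE: OnnxQnnHtpPerformanceMode is referenced by OnnxQnnOptions below but its
# definition was missing from this listing. This is a minimal reconstruction,
# assuming the member values mirror the ONNX Runtime QNN execution provider's
# htp_performance_mode spellings; verify against the installed package.
class OnnxQnnHtpPerformanceMode(str, Enum):
    DEFAULT = "default"
    BURST = "burst"
    BALANCED = "balanced"
    HIGH_PERFORMANCE = "high_performance"
    SUSTAINED_HIGH_PERFORMANCE = "sustained_high_performance"
    POWER_SAVER = "power_saver"
    LOW_POWER_SAVER = "low_power_saver"
    HIGH_POWER_SAVER = "high_power_saver"
    EXTREME_POWER_SAVER = "extreme_power_saver"
    LOW_BALANCED = "low_balanced"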
class QnnLogLevel(str, Enum):
K_LOG_OFF = "kLogOff"
K_LOG_LEVEL_ERROR = "kLogLevelError"
K_LOG_LEVEL_WARN = "kLogLevelWarn"
K_LOG_LEVEL_INFO = "kLogLevelInfo"
K_LOG_LEVEL_VERBOSE = "kLogLevelVerbose"
K_LOG_LEVEL_DEBUG = "kLogLevelDebug"
class QnnGraphPriority(str, Enum):
K_QNN_PRIORITY_DEFAULT = "kQnnPriorityDefault"
K_QNN_PRIORITY_LOW = "kQnnPriorityLow"
K_QNN_PRIORITY_NORMAL = "kQnnPriorityNormal"
K_QNN_PRIORITY_NORMAL_HIGH = "kQnnPriorityNormalHigh"
K_QNN_PRIORITY_HIGH = "kQnnPriorityHigh"
K_QNN_PRIORITY_UNDEFINED = "kQnnPriorityUndefined"
class QnnGpuPrecision(str, Enum):
K_GPU_USER_PROVIDED = "kGpuUserProvided"
K_GPU_FP32 = "kGpuFp32"
K_GPU_FP16 = "kGpuFp16"
K_GPU_HYBRID = "kGpuHybrid"
class QnnDspEncoding(str, Enum):
K_DSP_STATIC = "kDspStatic"
K_DSP_DYNAMIC = "kDspDynamic"
class QnnHtpPrecision(str, Enum):
K_HTP_QUANTIZED = "kHtpQuantized"
K_HTP_FP16 = "kHtpFp16"
class QnnHtpOptimizationStrategy(str, Enum):
K_HTP_OPTIMIZE_FOR_INFERENCE = "kHtpOptimizeForInference"
K_HTP_OPTIMIZE_FOR_PREPARE = "kHtpOptimizeForPrepare"
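# NOTE: The three performance-mode enums below are referenced by TfliteQnnOptions
# but were missing from this listing. They are minimal reconstructions, assuming
# the member values follow the TFLite QNN delegate's option spellings
# (kGpu*/kDsp*/kHtp*); verify against the installed package.
class QnnGpuPerformanceMode(str, Enum):
    K_GPU_DEFAULT = "kGpuDefault"
    K_GPU_HIGH = "kGpuHigh"
    K_GPU_NORMAL = "kGpuNormal"
    K_GPU_LOW = "kGpuLow"
class QnnDspPerformanceMode(str, Enum):
    K_DSP_BURST = "kDspBurst"
    K_DSP_SUSTAINED_HIGH_PERFORMANCE = "kDspSustainedHighPerformance"
    K_DSP_HIGH_PERFORMANCE = "kDspHighPerformance"
    K_DSP_BALANCED = "kDspBalanced"
    K_DSP_LOW_BALANCED = "kDspLowBalanced"
    K_DSP_HIGH_POWER_SAVER = "kDspHighPowerSaver"
    K_DSP_POWER_SAVER = "kDspPowerSaver"
    K_DSP_LOW_POWER_SAVER = "kDspLowPowerSaver"
class TfliteQnnHtpPerformanceMode(str, Enum):
    K_HTP_BURST = "kHtpBurst"
    K_HTP_SUSTAINED_HIGH_PERFORMANCE = "kHtpSustainedHighPerformance"
    K_HTP_HIGH_PERFORMANCE = "kHtpHighPerformance"
    K_HTP_BALANCED = "kHtpBalanced"
    K_HTP_LOW_BALANCED = "kHtpLowBalanced"
    K_HTP_HIGH_POWER_SAVER = "kHtpHighPowerSaver"
    K_HTP_POWER_SAVER = "kHtpPowerSaver"
    K_HTP_LOW_POWER_SAVER = "kHtpLowPowerSaver"
    K_HTP_DEFAULT = "kHtpDefault"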
class GpuInferencePreference(str, Enum):
TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER = "TFLITE_GPU_INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER"
TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED = "TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED"
TFLITE_GPU_INFERENCE_PREFERENCE_BALANCED = "TFLITE_GPU_INFERENCE_PREFERENCE_BALANCED"
class GpuInferencePriority(str, Enum):
TFLITE_GPU_INFERENCE_PREFERENCE_BALANCED = "TFLITE_GPU_INFERENCE_PREFERENCE_BALANCED"
TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION = "TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION"
TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY = "TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY"
TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE = "TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE"
class NnapiExecutionPreference(str, Enum):
K_LOW_POWER = "kLowPower"
K_FAST_SINGLE_ANSWER = "kFastSingleAnswer"
K_SUSTAINED_SPEED = "kSustainedSpeed"
class ContextErrorReportingOptionsLevel(str, Enum):
BRIEF = "BRIEF"
DETAILED = "DETAILED"
class Priority(str, Enum):
LOW = "LOW"
NORMAL = "NORMAL"
NORMAL_HIGH = "NORMAL_HIGH"
HIGH = "HIGH"
class ContextGpuPerformanceHint(str, Enum):
LOW = "LOW"
NORMAL = "NORMAL"
HIGH = "HIGH"
class ContextHtpPerformanceMode(str, Enum):
EXTREME_POWER_SAVER = "EXTREME_POWER_SAVER"
LOW_POWER_SAVER = "LOW_POWER_SAVER"
POWER_SAVER = "POWER_SAVER"
HIGH_POWER_SAVER = "HIGH_POWER_SAVER"
LOW_BALANCED = "LOW_BALANCED"
BALANCED = "BALANCED"
HIGH_PERFORMANCE = "HIGH_PERFORMANCE"
SUSTAINED_HIGH_PERFORMANCE = "SUSTAINED_HIGH_PERFORMANCE"
BURST = "BURST"
class DefaultGraphGpuPrecision(str, Enum):
FLOAT32 = "FLOAT32"
FLOAT16 = "FLOAT16"
HYBRID = "HYBRID"
USER_PROVIDED = "USER_PROVIDED"
class DefaultGraphHtpOptimizationType(str, Enum):
FINALIZE_OPTIMIZATION_FLAG = "FINALIZE_OPTIMIZATION_FLAG"
class DefaultGraphHtpPrecision(str, Enum):
FLOAT16 = "FLOAT16"
@dataclass
class OnnxOptions:
execution_mode: Optional[ExecutionMode] = ExecutionMode.SEQUENTIAL
intra_op_num_threads: Optional[int] = 0
inter_op_num_threads: Optional[int] = 0
enable_memory_pattern: Optional[bool] = False
enable_cpu_memory_arena: Optional[bool] = False
graph_optimization_level: Optional[GraphOptimizationLevel] = GraphOptimizationLevel.ENABLE_ALL
    def to_cli_string(self) -> str:
args = []
if self.execution_mode is not None:
args.append(f"execution_mode={self.execution_mode}")
if self.intra_op_num_threads is not None:
args.append(f"intra_op_num_threads={self.intra_op_num_threads}")
if self.inter_op_num_threads is not None:
args.append(f"inter_op_num_threads={self.inter_op_num_threads}")
if self.enable_memory_pattern is not None:
args.append(f"enable_memory_pattern={'true' if self.enable_memory_pattern else 'false'}")
if self.enable_cpu_memory_arena is not None:
args.append(f"enable_cpu_memory_arena={'true' if self.enable_cpu_memory_arena else 'false'}")
if self.graph_optimization_level is not None:
args.append(f"graph_optimization_level={self.graph_optimization_level}")
return f"--onnx_options {';'.join(args)}"
@dataclass
class OnnxQnnOptions(OnnxOptions):
qnn_htp_performance_mode: Optional[OnnxQnnHtpPerformanceMode] = OnnxQnnHtpPerformanceMode.BURST
    qnn_htp_graph_optimization_mode: Optional[int] = 3
    qnn_enable_htp_fp16_precision: Optional[int] = 1
    def to_cli_string(self) -> str:
        base_string = super().to_cli_string().split(" ")[1]  # Get base OnnxOptions part
args = [base_string]
if self.qnn_htp_performance_mode is not None:
args.append(f"qnn_htp_performance_mode={self.qnn_htp_performance_mode}")
if self.qnn_htp_graph_optimization_mode is not None:
args.append(f"qnn_htp_graph_optimization_mode={self.qnn_htp_graph_optimization_mode}")
if self.qnn_enable_htp_fp16_precision is not None:
args.append(f"qnn_enable_htp_fp16_precision={self.qnn_enable_htp_fp16_precision}")
return f"--onnx_options {';'.join(args)}"
@dataclass
class TfliteOptions:
enable_fallback: Optional[bool] = True
invoke_interpreter_on_cold_load: Optional[bool] = False
allow_fp32_as_fp16: Optional[bool] = True
force_opengl: Optional[bool] = False
number_of_threads: Optional[int] = -1
release_dynamic_tensors: Optional[bool] = False
    def to_cli_string(self) -> str:
args = []
if self.enable_fallback is not None:
args.append(f"enable_fallback={'true' if self.enable_fallback else 'false'}")
if self.invoke_interpreter_on_cold_load is not None:
args.append(
f"invoke_interpreter_on_cold_load={'true' if self.invoke_interpreter_on_cold_load else 'false'}"
)
if self.allow_fp32_as_fp16 is not None:
args.append(f"allow_fp32_as_fp16={'true' if self.allow_fp32_as_fp16 else 'false'}")
if self.force_opengl is not None:
args.append(f"force_opengl={'true' if self.force_opengl else 'false'}")
if self.number_of_threads is not None:
args.append(f"number_of_threads={self.number_of_threads}")
if self.release_dynamic_tensors is not None:
args.append(f"release_dynamic_tensors={'true' if self.release_dynamic_tensors else 'false'}")
return f"--tflite_options {';'.join(args)}"
@dataclass
class TfliteQnnOptions(TfliteOptions):
qnn_log_level: Optional[QnnLogLevel] = QnnLogLevel.K_LOG_LEVEL_WARN
qnn_graph_priority: Optional[QnnGraphPriority] = QnnGraphPriority.K_QNN_PRIORITY_DEFAULT
qnn_gpu_precision: Optional[QnnGpuPrecision] = QnnGpuPrecision.K_GPU_FP16
qnn_gpu_performance_mode: Optional[QnnGpuPerformanceMode] = QnnGpuPerformanceMode.K_GPU_HIGH
qnn_dsp_performance_mode: Optional[QnnDspPerformanceMode] = QnnDspPerformanceMode.K_DSP_BURST
qnn_dsp_encoding: Optional[QnnDspEncoding] = QnnDspEncoding.K_DSP_STATIC
qnn_htp_performance_mode: Optional[TfliteQnnHtpPerformanceMode] = TfliteQnnHtpPerformanceMode.K_HTP_BURST
qnn_htp_precision: Optional[QnnHtpPrecision] = QnnHtpPrecision.K_HTP_FP16
qnn_htp_optimization_strategy: Optional[QnnHtpOptimizationStrategy] = (
QnnHtpOptimizationStrategy.K_HTP_OPTIMIZE_FOR_INFERENCE
)
qnn_htp_use_conv_hmx: Optional[bool] = True
qnn_htp_use_fold_relu: Optional[bool] = False
qnn_htp_vtcm_size: Optional[int] = None
qnn_htp_num_hvx_threads: Optional[int] = None
    def to_cli_string(self) -> str:
base_string = super().to_cli_string().split(" ")[1] # Get base TfliteOptions part
args = [base_string]
if self.qnn_log_level is not None:
args.append(f"qnn_log_level={self.qnn_log_level.value}")
if self.qnn_graph_priority is not None:
args.append(f"qnn_graph_priority={self.qnn_graph_priority.value}")
if self.qnn_gpu_precision is not None:
args.append(f"qnn_gpu_precision={self.qnn_gpu_precision.value}")
if self.qnn_gpu_performance_mode is not None:
args.append(f"qnn_gpu_performance_mode={self.qnn_gpu_performance_mode.value}")
if self.qnn_dsp_performance_mode is not None:
args.append(f"qnn_dsp_performance_mode={self.qnn_dsp_performance_mode.value}")
if self.qnn_dsp_encoding is not None:
args.append(f"qnn_dsp_encoding={self.qnn_dsp_encoding.value}")
if self.qnn_htp_performance_mode is not None:
args.append(f"qnn_htp_performance_mode={self.qnn_htp_performance_mode.value}")
if self.qnn_htp_precision is not None:
args.append(f"qnn_htp_precision={self.qnn_htp_precision.value}")
if self.qnn_htp_optimization_strategy is not None:
args.append(f"qnn_htp_optimization_strategy={self.qnn_htp_optimization_strategy.value}")
if self.qnn_htp_use_conv_hmx is not None:
args.append(f"qnn_htp_use_conv_hmx={'true' if self.qnn_htp_use_conv_hmx else 'false'}")
if self.qnn_htp_use_fold_relu is not None:
args.append(f"qnn_htp_use_fold_relu={'true' if self.qnn_htp_use_fold_relu else 'false'}")
if self.qnn_htp_vtcm_size is not None:
args.append(f"qnn_htp_vtcm_size={self.qnn_htp_vtcm_size}")
if self.qnn_htp_num_hvx_threads is not None:
args.append(f"qnn_htp_num_hvx_threads={self.qnn_htp_num_hvx_threads}")
return f"--tflite_options {';'.join(args)}"
@dataclass
class TfliteGpuv2Options(TfliteOptions):
gpu_inference_preference: Optional[GpuInferencePreference] = (
GpuInferencePreference.TFLITE_GPU_INFERENCE_PREFERENCE_SUSTAINED_SPEED
)
gpu_inference_priority1: Optional[GpuInferencePriority] = (
GpuInferencePriority.TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY
)
gpu_inference_priority2: Optional[GpuInferencePriority] = (
GpuInferencePriority.TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE
)
gpu_inference_priority3: Optional[GpuInferencePriority] = (
GpuInferencePriority.TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION
)
gpu_max_delegated_partitions: Optional[int] = 1
    def to_cli_string(self) -> str:
base_string = super().to_cli_string().split(" ")[1] # Get base TfliteOptions part
args = [base_string]
if self.gpu_inference_preference is not None:
args.append(f"gpu_inference_preference={self.gpu_inference_preference.value}")
if self.gpu_inference_priority1 is not None:
args.append(f"gpu_inference_priority1={self.gpu_inference_priority1.value}")
if self.gpu_inference_priority2 is not None:
args.append(f"gpu_inference_priority2={self.gpu_inference_priority2.value}")
if self.gpu_inference_priority3 is not None:
args.append(f"gpu_inference_priority3={self.gpu_inference_priority3.value}")
if self.gpu_max_delegated_partitions is not None:
args.append(f"gpu_max_delegated_partitions={self.gpu_max_delegated_partitions}")
return f"--tflite_options {';'.join(args)}"
@dataclass
class TfliteNnapiOptions(TfliteOptions):
nnapi_execution_preference: Optional[NnapiExecutionPreference] = NnapiExecutionPreference.K_SUSTAINED_SPEED
nnapi_max_number_delegated_partitions: Optional[int] = 3
nnapi_allow_fp16: Optional[bool] = True
    def to_cli_string(self) -> str:
base_string = super().to_cli_string().split(" ")[1] # Get base TfliteOptions part
args = [base_string]
if self.nnapi_execution_preference is not None:
args.append(f"nnapi_execution_preference={self.nnapi_execution_preference.value}")
if self.nnapi_max_number_delegated_partitions is not None:
args.append(f"nnapi_max_number_delegated_partitions={self.nnapi_max_number_delegated_partitions}")
if self.nnapi_allow_fp16 is not None:
args.append(f"nnapi_allow_fp16={'true' if self.nnapi_allow_fp16 else 'false'}")
return f"--tflite_options {';'.join(args)}"
@dataclass
class QnnOptions:
context_async_execution_queue_depth_numeric: Optional[int] = None
context_enable_graphs: Optional[List[str]] = None
context_error_reporting_options_level: Optional[ContextErrorReportingOptionsLevel] = None
context_error_reporting_options_storage_limit: Optional[int] = None
context_memory_limit_hint: Optional[int] = None
context_priority: Optional[Priority] = None
context_gpu_performance_hint: Optional[ContextGpuPerformanceHint] = ContextGpuPerformanceHint.HIGH
context_gpu_use_gl_buffers: Optional[bool] = None
context_htp_performance_mode: Optional[ContextHtpPerformanceMode] = ContextHtpPerformanceMode.BURST
    default_graph_priority: Optional[Priority] = None
default_graph_gpu_precision: Optional[DefaultGraphGpuPrecision] = DefaultGraphGpuPrecision.USER_PROVIDED
default_graph_gpu_disable_memory_optimizations: Optional[bool] = None
default_graph_gpu_disable_node_optimizations: Optional[bool] = None
default_graph_gpu_disable_queue_recording: Optional[bool] = None
default_graph_htp_disable_fold_relu_activation_into_conv: Optional[bool] = False
default_graph_htp_num_hvx_threads: Optional[int] = 4
default_graph_htp_optimization_type: Optional[DefaultGraphHtpOptimizationType] = (
DefaultGraphHtpOptimizationType.FINALIZE_OPTIMIZATION_FLAG
)
default_graph_htp_optimization_value: Optional[int] = field(default=None, metadata={"valid_values": [1, 2, 3]})
default_graph_htp_precision: Optional[DefaultGraphHtpPrecision] = DefaultGraphHtpPrecision.FLOAT16
default_graph_htp_disable_short_depth_conv_on_hmx: Optional[bool] = False
default_graph_htp_vtcm_size: Optional[int] = 4
def __post_init__(self):
valid_values = self.__dataclass_fields__["default_graph_htp_optimization_value"].metadata["valid_values"]
if (
self.default_graph_htp_optimization_value is not None
and self.default_graph_htp_optimization_value not in valid_values
):
raise ValueError(
f"default_graph_htp_optimization_value must be one of {valid_values}, "
f"got {self.default_graph_htp_optimization_value}"
)
    def to_cli_string(self) -> str:
args = []
if self.default_graph_htp_optimization_value is not None:
args.append(f"default_graph_htp_optimization_value={self.default_graph_htp_optimization_value}")
if self.context_async_execution_queue_depth_numeric is not None:
args.append(
f"context_async_execution_queue_depth_numeric={self.context_async_execution_queue_depth_numeric}"
)
if self.context_enable_graphs is not None:
args.append(f"context_enable_graphs={','.join(self.context_enable_graphs)}")
if self.context_error_reporting_options_level is not None:
args.append(f"context_error_reporting_options_level={self.context_error_reporting_options_level}")
if self.context_error_reporting_options_storage_limit is not None:
args.append(
f"context_error_reporting_options_storage_limit={self.context_error_reporting_options_storage_limit}"
)
if self.context_memory_limit_hint is not None:
args.append(f"context_memory_limit_hint={self.context_memory_limit_hint}")
if self.context_priority is not None:
args.append(f"context_priority={self.context_priority}")
if self.context_gpu_performance_hint is not None:
args.append(f"context_gpu_performance_hint={self.context_gpu_performance_hint}")
if self.context_gpu_use_gl_buffers is not None:
args.append(f"context_gpu_use_gl_buffers={'true' if self.context_gpu_use_gl_buffers else 'false'}")
if self.context_htp_performance_mode is not None:
args.append(f"context_htp_performance_mode={self.context_htp_performance_mode}")
if self.default_graph_priority is not None:
args.append(f"default_graph_priority={self.default_graph_priority}")
if self.default_graph_gpu_precision is not None:
args.append(f"default_graph_gpu_precision={self.default_graph_gpu_precision}")
if self.default_graph_gpu_disable_memory_optimizations is not None:
args.append(
f"default_graph_gpu_disable_memory_optimizations={'true' if self.default_graph_gpu_disable_memory_optimizations else 'false'}"
)
if self.default_graph_gpu_disable_node_optimizations is not None:
args.append(
f"default_graph_gpu_disable_node_optimizations={'true' if self.default_graph_gpu_disable_node_optimizations else 'false'}"
)
if self.default_graph_gpu_disable_queue_recording is not None:
args.append(
f"default_graph_gpu_disable_queue_recording={'true' if self.default_graph_gpu_disable_queue_recording else 'false'}"
)
if self.default_graph_htp_disable_fold_relu_activation_into_conv is not None:
args.append(
f"default_graph_htp_disable_fold_relu_activation_into_conv={'true' if self.default_graph_htp_disable_fold_relu_activation_into_conv else 'false'}"
)
if self.default_graph_htp_num_hvx_threads is not None:
args.append(f"default_graph_htp_num_hvx_threads={self.default_graph_htp_num_hvx_threads}")
if self.default_graph_htp_optimization_type is not None:
args.append(f"default_graph_htp_optimization_type={self.default_graph_htp_optimization_type}")
if self.default_graph_htp_precision is not None:
args.append(f"default_graph_htp_precision={self.default_graph_htp_precision}")
if self.default_graph_htp_disable_short_depth_conv_on_hmx is not None:
args.append(
f"default_graph_htp_disable_short_depth_conv_on_hmx={'true' if self.default_graph_htp_disable_short_depth_conv_on_hmx else 'false'}"
)
if self.default_graph_htp_vtcm_size is not None:
args.append(f"default_graph_htp_vtcm_size={self.default_graph_htp_vtcm_size}")
return f"--qnn_options {';'.join(args)}"
@dataclass
class ProfileCommonOptions(CommonOptions):
dequantize_outputs: Optional[bool] = True
tflite_delegates: Optional[List[TfliteDelegates]] = None
tflite_options: Optional[Union[TfliteOptions, TfliteQnnOptions, TfliteGpuv2Options, TfliteNnapiOptions]] = None
qnn_options: Optional[QnnOptions] = None
onnx_options: Optional[Union[OnnxOptions, OnnxQnnOptions]] = None
onnx_execution_providers: Optional[List[OnnxExecutionProviders]] = None
max_profiler_iterations: Optional[int] = 100
max_profiler_time: Optional[int] = 600
    def handle_tflite_options(self) -> str:
if isinstance(self.tflite_options, (TfliteOptions, TfliteQnnOptions, TfliteGpuv2Options, TfliteNnapiOptions)):
return self.tflite_options.to_cli_string()
else:
return str(self.tflite_options)
    def handle_onnx_options(self) -> str:
if isinstance(self.onnx_options, (OnnxOptions, OnnxQnnOptions)):
return self.onnx_options.to_cli_string()
else:
return str(self.onnx_options)
    def handle_qnn_options(self) -> str:
if isinstance(self.qnn_options, QnnOptions):
return self.qnn_options.to_cli_string()
else:
return str(self.qnn_options)
    def handle_common_options(self) -> List[str]:
args = []
if self.compute_unit is not None:
compute_units = ",".join(list(self.compute_unit))
args.append(f"--compute_unit {compute_units}")
if self.dequantize_outputs:
args.append("--dequantize_outputs")
if self.tflite_delegates is not None:
tflite_delegates = ",".join(list(self.tflite_delegates))
args.append(f"--tflite_delegates {tflite_delegates}")
if self.tflite_options is not None:
args.append(self.handle_tflite_options())
if self.onnx_options is not None:
args.append(self.handle_onnx_options())
if self.qnn_options is not None:
args.append(self.handle_qnn_options())
if self.onnx_execution_providers is not None:
onnx_execution_providers = ",".join((self.onnx_execution_providers))
args.append(f"--onnx_execution_providers {onnx_execution_providers}")
if self.max_profiler_iterations is not None:
args.append(f"--max_profiler_iterations {self.max_profiler_iterations}")
if self.max_profiler_time is not None:
args.append(f"--max_profiler_time {self.max_profiler_time}")
return args
    def to_cli_string(self) -> str:
args = self.handle_common_options()
return " ".join(args)
@dataclass
class ProfileOptions(ProfileCommonOptions):
"""
Profile options for the model.
Note:
For details, see `ProfileOptions in QAI Hub API <https://app.aihub.qualcomm.com/docs/hub/api.html#profile-inference-options>`_.
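
    Example:
        A minimal sketch; the delegate and option choices are illustrative only::

            options = ProfileOptions(
                tflite_delegates=[TfliteDelegates.QNN],
                tflite_options=TfliteQnnOptions(qnn_htp_precision=QnnHtpPrecision.K_HTP_FP16),
            )
            cli_args = options.to_cli_string()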
"""
pass
@dataclass
class InferenceOptions(ProfileCommonOptions):
"""
Inference options for the model.
Note:
For details, see `InferenceOptions in QAI Hub API <https://app.aihub.qualcomm.com/docs/hub/api.html#profile-inference-options>`_.
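
    Example:
        A minimal sketch for the ONNX Runtime path; the provider and option
        choices are illustrative only::

            options = InferenceOptions(
                onnx_execution_providers=[OnnxExecutionProviders.QNN],
                onnx_options=OnnxQnnOptions(),
            )
            cli_args = options.to_cli_string()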
"""
pass