/**
  ******************************************************************************
  * @file    kpm32xx_ddl_dnu.c
  * @author  Kiwi Software Team
  * @brief   DNU(Deep Neural Network Unit) DDL module driver.
  *          This file provides firmware functions to manage the following
  *          functionalities of the DNU peripheral:
  *           + Peripheral initialization and deinitialization
  *           + Convolutional operator
  *           + Full connect operator
  *           + Max pooling operator
  *           + Matrix/Vector add operator
  *           + Relu Batch normal operator
  * @note
  *          V1.0.0, 2024/12/20.
  *
  * Copyright (c) 2024, Kiwi Instruments Co,. Ltd.
  *
  * Redistribution and use in source and binary forms, with or without modification,
  * are permitted provided that the following conditions are met:
  *
  *   1. Redistributions of source code must retain the above copyright notice,
  *      this list of conditions and the following disclaimer.
  *
  *   2. Redistributions in binary form must reproduce the above copyright notice,
  *      this list of conditions and the following disclaimer in the documentation
  *      and/or other materials provided with the distribution.
  *
  *   3. Neither the name of the copyright holder nor the names of its contributors
  *      may be used to endorse or promote products derived from this software without
  *      specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  ******************************************************************************
  */



#include <string.h>
#include "kpm32xx_ddl.h"



#ifdef DDL_DNU_MODULE_ENABLED
/*
 * DMA channel descriptors shared by the DNU operators in this file.
 * Each operator re-initializes the descriptors it needs before starting a transfer:
 *   - pDMAStruct0 is used with DMA_CHANNEL_0 (memory -> DNU->CH0),
 *   - pDMAStruct1 is used with DMA_CHANNEL_1 (memory -> DNU->CH1),
 *   - pDMAStruct2 is used with DMA_CHANNEL_2 (DNU->CH2 -> memory).
 * NOTE(review): these objects have external linkage; confirm whether any other
 * translation unit references them before narrowing them to 'static'.
 */
DMA_ChannelConfig_T pDMAStruct0;
DMA_ChannelConfig_T pDMAStruct1;
DMA_ChannelConfig_T pDMAStruct2;


/**
  * @brief  Populate a DMA channel descriptor for a DNU transfer and apply it to the DMA controller.
  * @param  pDMAStruct    Descriptor to fill. It is reset with DDL_DMA_ChannelStructInit()
  *                       before the fields below are written.
  * @param  channel       DMA channel the descriptor is applied to.
  * @param  increment     Address increment mode stored in the descriptor.
  * @param  dataAlignment Data alignment stored in the descriptor.
  * @param  src           Source address of the transfer.
  * @param  dst           Destination address of the transfer.
  * @param  len           Value stored in the descriptor's dataLength field.
  * @retval Always DDL_OK.
  */
static DDL_Status_T DNU_Alloc_DMA_Channel(DMA_ChannelConfig_T *pDMAStruct, DMA_ChannelIndex_T channel, DMA_AddrIncrement_T increment, uint32_t dataAlignment, uint32_t src, uint32_t dst, uint32_t len)
{
	/* Reset the descriptor, then override only the fields this driver cares about. */
	DDL_DMA_ChannelStructInit(pDMAStruct);

	/* Where the data comes from, where it goes and how much of it there is. */
	pDMAStruct->srcAddress    = src;
	pDMAStruct->dstAddress    = dst;
	pDMAStruct->dataLength    = len;

	/* How the transfer is carried out. */
	pDMAStruct->transmitType  = DMA_TRANSMIT_NORMAL;
	pDMAStruct->increment     = increment;
	pDMAStruct->dataAlignment = dataAlignment;

	/* Hand the finished descriptor to the DMA controller for the requested channel. */
	DDL_DMA_SetChannelConfig(DMA, channel, pDMAStruct);

	return DDL_OK;
}


/**
  * @brief  Make specified DNU active.
  *         Invokes __DDL_RCC_DNU_ACTIVE() followed by __DDL_RCC_DNU_CLK_RELEASE().
  * @param  DNUx        DNU instance (not referenced by this function).
  * @retval Always DDL_OK.
  */
DDL_Status_T DDL_DNU_Instance_Active(DNU_Type *DNUx)
{
	__DDL_RCC_DNU_ACTIVE();
	__DDL_RCC_DNU_CLK_RELEASE();

	return DDL_OK;
}


/**
  * @brief  Make specified DNU inactive.
  *         Invokes __DDL_RCC_DNU_DEACTIVE() followed by __DDL_RCC_DNU_CLK_HOLD().
  * @param  DNUx        DNU instance (not referenced by this function).
  * @retval Always DDL_OK.
  */
DDL_Status_T DDL_DNU_Instance_Deactive(DNU_Type *DNUx)
{
	__DDL_RCC_DNU_DEACTIVE();
	__DDL_RCC_DNU_CLK_HOLD();

	return DDL_OK;
}


/**
  * @brief  Enable DNU interrupt.
  *         Enables the interrupt at the DNU through __DDL_DNU_ENABLE_IT(), then sets the
  *         priority of DNU_IRQn in the NVIC and enables that IRQ line.
  * @param  DNUx        DNU instance (not referenced by this function).
  * @param  priority    Interrupt priority passed to NVIC_SetPriority() for DNU_IRQn.
  * @retval None.
  */
void DDL_DNU_IntEnable(DNU_Type *DNUx, uint32_t priority)
{
	__DDL_DNU_ENABLE_IT();
	/* The priority is programmed before the IRQ line is enabled in the NVIC. */
	NVIC_SetPriority(DNU_IRQn, priority);
	NVIC_EnableIRQ(DNU_IRQn);
}


/**
  * @brief  Disable DNU interrupt.
  *         Disables the interrupt at the DNU through __DDL_DNU_DISABLE_IT(), then disables
  *         DNU_IRQn in the NVIC.
  * @param  DNUx        DNU instance (not referenced by this function).
  * @retval None.
  */
void DDL_DNU_IntDisable(DNU_Type *DNUx)
{
	__DDL_DNU_DISABLE_IT();
	NVIC_DisableIRQ(DNU_IRQn);
}


/**
  * @brief  Start DNU by invoking __DDL_DNU_ENABLE().
  * @param  DNUx        DNU instance (not referenced by this function).
  * @retval None.
  */
void DDL_DNU_Start(DNU_Type *DNUx)
{
	__DDL_DNU_ENABLE();
}


/**
  * @brief  Stop DNU by invoking __DDL_DNU_DISABLE().
  * @param  DNUx        DNU instance (not referenced by this function).
  * @retval None.
  */
void DDL_DNU_Stop(DNU_Type *DNUx)
{
	__DDL_DNU_DISABLE();
}


/**
  * @brief  Set up and launch a max pooling operation on INT8 data
  *         (only 2*2 max pooling is supported at present).
  *         The function programs DNU->CTRL, DNU->IN and DNU->OUT, configures DMA channels
  *         0 and 1 as inputs (src and src advanced by one row) and DMA channel 2 as output,
  *         then starts the three channels. This function itself does not wait for completion.
  * @param  inputDims   input feature map dimensions (fields w and h are read).
  * @param  src         input feature map.
  * @param  outputDims  output feature map dimensions (fields w and h are read).
  * @param  dst         buffer receiving the output feature map.
  * @param  poolParam   pooling parameters (strideStep, pos and paddingNUM are read).
  * @retval DDL_OK once all channels are configured and started,
  *         DDL_ERROR if DNU_Alloc_DMA_Channel() reports a failure.
  */
DDL_Status_T DDL_DNU_MaxPool_INT8(DNU_Dims_T * inputDims, const int8_t * src, DNU_Dims_T * outputDims, int8_t * dst,
							DNU_PoolParams_T * poolParam)
{
	uint32_t inXferLen = 0;
	uint32_t outXferLen = 0;

	/* CTRL register: stride, accuracy, padding and command, in that order */
	if (poolParam->strideStep != STRIDE_STEP_TWO)
	{
		CLEAR_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}
	else
	{
		SET_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT8 << DNU_CTRL_ACCURACY_Pos);

	/* Only the padding field selected by poolParam->pos is written; the other one is left as it is */
	if (poolParam->pos != PADDING_POS_BEGIN)
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADEND_Mask, poolParam->paddingNUM << DNU_CTRL_PADEND_Pos);
	}
	else
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADBGN_Mask, poolParam->paddingNUM << DNU_CTRL_PADBGN_Pos);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_MAXPOOL << DNU_CTRL_CMD_Pos);

	/* IN register: width, and height minus one */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, (inputDims->h - 1) << DNU_IN_HEIGHT_Pos);

	/* OUT register: width and height */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/*
	 * Each input channel carries (h - 1) rows. The element count is divided by 4 and
	 * rounded up (presumably four INT8 elements per 32-bit DMA word - confirm).
	 */
	inXferLen = (inputDims->h - 1) * inputDims->w / 4;
	if (((inputDims->h - 1) * inputDims->w % 4) != 0)
	{
		inXferLen++;
	}

	/* Same round-up for the complete output feature map */
	outXferLen = outputDims->h * outputDims->w / 4;
	if ((outputDims->h * outputDims->w % 4) != 0)
	{
		outXferLen++;
	}

	/*
	 * Channel 0 reads from src, channel 1 from src advanced by one row, channel 2 writes dst.
	 * Evaluation stops at the first channel that fails to configure.
	 */
	if ((DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)src, (uint32_t)(&DNU->CH0), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)(src + inputDims->w), (uint32_t)(&DNU->CH1), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
			(uint32_t)(&DNU->CH2), (uint32_t)dst, outXferLen) != DDL_OK))
	{
		return DDL_ERROR;
	}

	/* Start both input channels, then the channel collecting the output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Set up and launch a max pooling operation on INT16 data
  *         (only 2*2 max pooling is supported at present).
  *         The function programs DNU->CTRL, DNU->IN and DNU->OUT, configures DMA channels
  *         0 and 1 as inputs (src and src advanced by one row) and DMA channel 2 as output,
  *         then starts the three channels. This function itself does not wait for completion.
  * @param  inputDims   input feature map dimensions (fields w and h are read).
  * @param  src         input feature map.
  * @param  outputDims  output feature map dimensions (fields w and h are read).
  * @param  dst         buffer receiving the output feature map.
  * @param  poolParam   pooling parameters (strideStep, pos and paddingNUM are read).
  * @retval DDL_OK once all channels are configured and started,
  *         DDL_ERROR if DNU_Alloc_DMA_Channel() reports a failure.
  */
DDL_Status_T DDL_DNU_MaxPool_INT16(DNU_Dims_T * inputDims, const int16_t * src, DNU_Dims_T * outputDims, int16_t * dst,
							DNU_PoolParams_T * poolParam)
{
	uint32_t inXferLen = 0;
	uint32_t outXferLen = 0;

	/* CTRL register: stride, accuracy, padding and command, in that order */
	if (poolParam->strideStep != STRIDE_STEP_TWO)
	{
		CLEAR_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}
	else
	{
		SET_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT16 << DNU_CTRL_ACCURACY_Pos);

	/* Only the padding field selected by poolParam->pos is written; the other one is left as it is */
	if (poolParam->pos != PADDING_POS_BEGIN)
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADEND_Mask, poolParam->paddingNUM << DNU_CTRL_PADEND_Pos);
	}
	else
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADBGN_Mask, poolParam->paddingNUM << DNU_CTRL_PADBGN_Pos);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_MAXPOOL << DNU_CTRL_CMD_Pos);

	/* IN register: width, and height minus one */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, (inputDims->h - 1) << DNU_IN_HEIGHT_Pos);

	/* OUT register: width and height */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/*
	 * Each input channel carries (h - 1) rows. The element count is divided by 2 and
	 * rounded up (presumably two INT16 elements per 32-bit DMA word - confirm).
	 */
	inXferLen = (inputDims->h - 1) * inputDims->w / 2;
	if (((inputDims->h - 1) * inputDims->w % 2) != 0)
	{
		inXferLen++;
	}

	/* Same round-up for the complete output feature map */
	outXferLen = outputDims->h * outputDims->w / 2;
	if ((outputDims->h * outputDims->w % 2) != 0)
	{
		outXferLen++;
	}

	/*
	 * Channel 0 reads from src, channel 1 from src advanced by one row, channel 2 writes dst.
	 * Evaluation stops at the first channel that fails to configure.
	 */
	if ((DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)src, (uint32_t)(&DNU->CH0), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)(src + inputDims->w), (uint32_t)(&DNU->CH1), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
			(uint32_t)(&DNU->CH2), (uint32_t)dst, outXferLen) != DDL_OK))
	{
		return DDL_ERROR;
	}

	/* Start both input channels, then the channel collecting the output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Set up and launch a max pooling operation on float data
  *         (only 2*2 max pooling is supported at present).
  *         The function programs DNU->CTRL, DNU->IN and DNU->OUT, configures DMA channels
  *         0 and 1 as inputs (src and src advanced by one row) and DMA channel 2 as output,
  *         then starts the three channels. This function itself does not wait for completion.
  * @param  inputDims   input feature map dimensions (fields w and h are read).
  * @param  src         input feature map.
  * @param  outputDims  output feature map dimensions (fields w and h are read).
  * @param  dst         buffer receiving the output feature map.
  * @param  poolParam   pooling parameters (strideStep, pos and paddingNUM are read).
  * @retval DDL_OK once all channels are configured and started,
  *         DDL_ERROR if DNU_Alloc_DMA_Channel() reports a failure.
  */
DDL_Status_T DDL_DNU_MaxPool_Float(DNU_Dims_T * inputDims, const float * src, DNU_Dims_T * outputDims, float * dst,
							DNU_PoolParams_T * poolParam)
{
	uint32_t inXferLen = 0;
	uint32_t outXferLen = 0;

	/* CTRL register: stride, accuracy, padding and command, in that order */
	if (poolParam->strideStep != STRIDE_STEP_TWO)
	{
		CLEAR_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}
	else
	{
		SET_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_SINGLE<< DNU_CTRL_ACCURACY_Pos);

	/* Only the padding field selected by poolParam->pos is written; the other one is left as it is */
	if (poolParam->pos != PADDING_POS_BEGIN)
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADEND_Mask, poolParam->paddingNUM << DNU_CTRL_PADEND_Pos);
	}
	else
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADBGN_Mask, poolParam->paddingNUM << DNU_CTRL_PADBGN_Pos);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_MAXPOOL << DNU_CTRL_CMD_Pos);

	/* IN register: width, and height minus one */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, (inputDims->h - 1) << DNU_IN_HEIGHT_Pos);

	/* OUT register: width and height */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/* The element counts are used directly as DMA lengths: (h - 1) rows in, the whole map out */
	inXferLen = (inputDims->h - 1) * inputDims->w;
	outXferLen = outputDims->h * outputDims->w;

	/*
	 * Channel 0 reads from src, channel 1 from src advanced by one row, channel 2 writes dst.
	 * Evaluation stops at the first channel that fails to configure.
	 */
	if ((DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)src, (uint32_t)(&DNU->CH0), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)(src + inputDims->w), (uint32_t)(&DNU->CH1), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
			(uint32_t)(&DNU->CH2), (uint32_t)dst, outXferLen) != DDL_OK))
	{
		return DDL_ERROR;
	}

	/* Start both input channels, then the channel collecting the output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Set up and launch a convolution on INT8 data (only a 1*1 filter is supported at present).
  *         The function programs DNU->CTRL, DNU->IN, DNU->OUT and DNU->COEFMUL, configures
  *         DMA channel 0 as input and DMA channel 2 as output, then starts both channels.
  *         This function itself does not wait for completion.
  * @param  inputDims   input feature map dimensions (fields w and h are read).
  * @param  src         input feature map.
  * @param  filterData  kernel value written to DNU->COEFMUL.
  * @param  outputDims  output feature map dimensions (fields w and h are read).
  * @param  dst         buffer receiving the output feature map.
  * @param  convParam   convolution parameters (strideStep, pos and paddingNUM are read).
  * @retval DDL_OK once both channels are configured and started,
  *         DDL_ERROR if DNU_Alloc_DMA_Channel() reports a failure.
  */
DDL_Status_T DDL_DNU_Conv_INT8(DNU_Dims_T * inputDims, const int8_t * src, const int8_t filterData,
	                                   DNU_Dims_T * outputDims, int8_t * dst, DNU_ConvParams_T * convParam)
{
	uint32_t inXferLen = 0;
	uint32_t outXferLen = 0;

	/* CTRL register: stride, accuracy, padding and command, in that order */
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		CLEAR_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}
	else
	{
		SET_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT8 << DNU_CTRL_ACCURACY_Pos);

	/* Only the padding field selected by convParam->pos is written; the other one is left as it is */
	if (convParam->pos != PADDING_POS_BEGIN)
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADEND_Mask, convParam->paddingNUM << DNU_CTRL_PADEND_Pos);
	}
	else
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADBGN_Mask, convParam->paddingNUM << DNU_CTRL_PADBGN_Pos);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_CONV << DNU_CTRL_CMD_Pos);

	/* IN register: width, then height (one row less when the stride step is two) */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, inputDims->h << DNU_IN_HEIGHT_Pos);
	}
	else
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, (inputDims->h - 1) << DNU_IN_HEIGHT_Pos);
	}

	/* OUT register: width and height */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/* Kernel value */
	WRITE_REG(DNU->COEFMUL, filterData);

	/*
	 * Input length: element count divided by 4 and rounded up (presumably four INT8
	 * elements per 32-bit DMA word - confirm). The row count follows the height written to DNU->IN.
	 */
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		inXferLen = inputDims->h * inputDims->w / 4;
		if ((inputDims->h * inputDims->w % 4) != 0)
		{
			inXferLen++;
		}
	}
	else
	{
		inXferLen = (inputDims->h - 1) * inputDims->w / 4;
		if (((inputDims->h - 1) * inputDims->w % 4) != 0)
		{
			inXferLen++;
		}
	}

	/* Output length: same round-up over the complete output feature map */
	outXferLen = outputDims->h * outputDims->w / 4;
	if ((outputDims->h * outputDims->w % 4) != 0)
	{
		outXferLen++;
	}

	/* Channel 0 feeds DNU->CH0 from src, channel 2 drains DNU->CH2 into dst */
	if ((DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)src, (uint32_t)(&DNU->CH0), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
			(uint32_t)(&DNU->CH2), (uint32_t)dst, outXferLen) != DDL_OK))
	{
		return DDL_ERROR;
	}

	/* Start the input channel, then the channel collecting the output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Set up and launch a convolution on INT16 data (only a 1*1 filter is supported at present).
  *         The function programs DNU->CTRL, DNU->IN, DNU->OUT and DNU->COEFMUL, configures
  *         DMA channel 0 as input and DMA channel 2 as output, then starts both channels.
  *         This function itself does not wait for completion.
  * @param  inputDims   input feature map dimensions (fields w and h are read).
  * @param  src         input feature map.
  * @param  filterData  kernel value written to DNU->COEFMUL.
  * @param  outputDims  output feature map dimensions (fields w and h are read).
  * @param  dst         buffer receiving the output feature map.
  * @param  convParam   convolution parameters (strideStep, pos and paddingNUM are read).
  * @retval DDL_OK once both channels are configured and started,
  *         DDL_ERROR if DNU_Alloc_DMA_Channel() reports a failure.
  */
DDL_Status_T DDL_DNU_Conv_INT16(DNU_Dims_T * inputDims, const int16_t * src, const int16_t filterData,
	                                   DNU_Dims_T * outputDims, int16_t * dst, DNU_ConvParams_T * convParam)
{
	uint32_t inXferLen = 0;
	uint32_t outXferLen = 0;

	/* CTRL register: stride, accuracy, padding and command, in that order */
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		CLEAR_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}
	else
	{
		SET_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT16 << DNU_CTRL_ACCURACY_Pos);

	/* Only the padding field selected by convParam->pos is written; the other one is left as it is */
	if (convParam->pos != PADDING_POS_BEGIN)
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADEND_Mask, convParam->paddingNUM << DNU_CTRL_PADEND_Pos);
	}
	else
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADBGN_Mask, convParam->paddingNUM << DNU_CTRL_PADBGN_Pos);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_CONV << DNU_CTRL_CMD_Pos);

	/* IN register: width, then height (one row less when the stride step is two) */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, inputDims->h << DNU_IN_HEIGHT_Pos);
	}
	else
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, (inputDims->h - 1) << DNU_IN_HEIGHT_Pos);
	}

	/* OUT register: width and height */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/* Kernel value */
	WRITE_REG(DNU->COEFMUL, filterData);

	/*
	 * Input length: element count divided by 2 and rounded up (presumably two INT16
	 * elements per 32-bit DMA word - confirm). The row count follows the height written to DNU->IN.
	 */
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		inXferLen = inputDims->h * inputDims->w / 2;
		if ((inputDims->h * inputDims->w % 2) != 0)
		{
			inXferLen++;
		}
	}
	else
	{
		inXferLen = (inputDims->h - 1) * inputDims->w / 2;
		if (((inputDims->h - 1) * inputDims->w % 2) != 0)
		{
			inXferLen++;
		}
	}

	/* Output length: same round-up over the complete output feature map */
	outXferLen = outputDims->h * outputDims->w / 2;
	if ((outputDims->h * outputDims->w % 2) != 0)
	{
		outXferLen++;
	}

	/* Channel 0 feeds DNU->CH0 from src, channel 2 drains DNU->CH2 into dst */
	if ((DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)src, (uint32_t)(&DNU->CH0), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
			(uint32_t)(&DNU->CH2), (uint32_t)dst, outXferLen) != DDL_OK))
	{
		return DDL_ERROR;
	}

	/* Start the input channel, then the channel collecting the output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Set up and launch a convolution on float data (only a 1*1 filter is supported at present).
  *         The function programs DNU->CTRL, DNU->IN, DNU->OUT and DNU->COEFMUL, configures
  *         DMA channel 0 as input and DMA channel 2 as output, then starts both channels.
  *         This function itself does not wait for completion.
  * @param  inputDims   input feature map dimensions (fields w and h are read).
  * @param  src         input feature map.
  * @param  filterData  kernel value; its raw 32-bit representation is written to DNU->COEFMUL.
  * @param  outputDims  output feature map dimensions (fields w and h are read).
  * @param  dst         buffer receiving the output feature map.
  * @param  convParam   convolution parameters (strideStep, pos and paddingNUM are read).
  * @retval DDL_OK once both channels are configured and started,
  *         DDL_ERROR if DNU_Alloc_DMA_Channel() reports a failure.
  */
DDL_Status_T DDL_DNU_Conv_Float(DNU_Dims_T * inputDims, const float * src, const float filterData,
	                                   DNU_Dims_T * outputDims, float * dst, DNU_ConvParams_T * convParam)
{
	uint32_t length = 0;
	uint32_t coefBits = 0;

	/* CTRL register: stride, accuracy, padding and command, in that order */
	if (convParam->strideStep == STRIDE_STEP_TWO)
	{
		SET_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}
	else
	{
		CLEAR_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_SINGLE << DNU_CTRL_ACCURACY_Pos);
	/* Only the padding field selected by convParam->pos is written; the other one is left as it is */
	if (convParam->pos == PADDING_POS_BEGIN)
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADBGN_Mask, convParam->paddingNUM << DNU_CTRL_PADBGN_Pos);
	}
	else
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADEND_Mask, convParam->paddingNUM << DNU_CTRL_PADEND_Pos);
	}
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_CONV << DNU_CTRL_CMD_Pos);

	/* IN register: width, then height (one row less when the stride step is two) */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	if (convParam->strideStep == STRIDE_STEP_TWO)
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, (inputDims->h - 1) << DNU_IN_HEIGHT_Pos);
	}
	else
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, inputDims->h << DNU_IN_HEIGHT_Pos);
	}
	/* OUT register: width and height */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/*
	 * Kernel value: copy the float's object representation into an integer with memcpy().
	 * Reading the float through a uint32_t pointer would violate the strict aliasing
	 * rules (undefined behavior) and would also cast away the const qualifier.
	 */
	memcpy(&coefBits, &filterData, sizeof(coefBits));
	DNU->COEFMUL = coefBits;

	/* Input length in elements; the row count follows the height written to DNU->IN */
	if (convParam->strideStep == STRIDE_STEP_TWO)
	{
		length = (inputDims->h - 1) * inputDims->w;
	}
	else
	{
		length = inputDims->h * inputDims->w;
	}
	/* Channel 0 feeds DNU->CH0 from src */
	if (DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD, (uint32_t)src, (uint32_t)(&DNU->CH0), length) != DDL_OK)
	{
		return DDL_ERROR;
	}
	/* Channel 2 drains DNU->CH2 into dst, one element per output position */
	if (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD, (uint32_t)(&DNU->CH2), (uint32_t)dst, outputDims->h * outputDims->w) != DDL_OK)
	{
		return DDL_ERROR;
	}

	/* Start the input channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);

	/* Start the channel collecting the output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Set up and launch a convolution with partial sum (psum) on INT8 data
  *         (only a 1*1 filter is supported at present).
  *         The function programs DNU->CTRL, DNU->IN, DNU->OUT and DNU->COEFMUL, configures
  *         DMA channel 0 (src) and DMA channel 1 (psum) as inputs and DMA channel 2 as output,
  *         then starts the three channels. This function itself does not wait for completion.
  * @note   The operation is documented as supporting stride step 1 only; the code nevertheless
  *         programs the stride-two settings when convParam->strideStep requests them.
  * @param  inputDims   input feature map dimensions (fields w and h are read).
  * @param  src         input feature map.
  * @param  psumDims    psum dimensions (fields w and h are read).
  * @param  psum        buffer holding the psum, documented as the same size as the input feature map.
  * @param  filterData  kernel value written to DNU->COEFMUL.
  * @param  outputDims  output feature map dimensions (fields w and h are read).
  * @param  dst         buffer receiving the output feature map.
  * @param  convParam   convolution parameters (strideStep, pos and paddingNUM are read).
  * @retval DDL_OK once all channels are configured and started,
  *         DDL_ERROR if DNU_Alloc_DMA_Channel() reports a failure.
  */
DDL_Status_T DDL_DNU_ConvPsum_INT8(DNU_Dims_T * inputDims, const int8_t * src, DNU_Dims_T * psumDims, const int8_t * psum, const int8_t filterData,
	                                   DNU_Dims_T * outputDims, int8_t * dst, DNU_ConvParams_T * convParam)
{
	uint32_t inXferLen = 0;
	uint32_t psumXferLen = 0;
	uint32_t outXferLen = 0;

	/* CTRL register: stride, accuracy, padding and command, in that order */
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		CLEAR_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}
	else
	{
		SET_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT8 << DNU_CTRL_ACCURACY_Pos);

	/* Only the padding field selected by convParam->pos is written; the other one is left as it is */
	if (convParam->pos != PADDING_POS_BEGIN)
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADEND_Mask, convParam->paddingNUM << DNU_CTRL_PADEND_Pos);
	}
	else
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADBGN_Mask, convParam->paddingNUM << DNU_CTRL_PADBGN_Pos);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_CONV_PSUM << DNU_CTRL_CMD_Pos);

	/* IN register: width, then height (one row less when the stride step is two) */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, inputDims->h << DNU_IN_HEIGHT_Pos);
	}
	else
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, (inputDims->h - 1) << DNU_IN_HEIGHT_Pos);
	}

	/* OUT register: width and height */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/* Kernel value */
	WRITE_REG(DNU->COEFMUL, filterData);

	/*
	 * Input and psum lengths: element count divided by 4 and rounded up (presumably four
	 * INT8 elements per 32-bit DMA word - confirm). One row is dropped when the stride step is two.
	 */
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		inXferLen = inputDims->h * inputDims->w / 4;
		if ((inputDims->h * inputDims->w % 4) != 0)
		{
			inXferLen++;
		}

		psumXferLen = psumDims->h * psumDims->w / 4;
		if ((psumDims->h * psumDims->w % 4) != 0)
		{
			psumXferLen++;
		}
	}
	else
	{
		inXferLen = (inputDims->h - 1) * inputDims->w / 4;
		if (((inputDims->h - 1) * inputDims->w % 4) != 0)
		{
			inXferLen++;
		}

		psumXferLen = (psumDims->h - 1) * psumDims->w / 4;
		if (((psumDims->h - 1) * psumDims->w % 4) != 0)
		{
			psumXferLen++;
		}
	}

	/* Output length: same round-up over the complete output feature map */
	outXferLen = outputDims->h * outputDims->w / 4;
	if ((outputDims->h * outputDims->w % 4) != 0)
	{
		outXferLen++;
	}

	/*
	 * Channel 0 feeds DNU->CH0 from src, channel 1 feeds DNU->CH1 from psum and
	 * channel 2 drains DNU->CH2 into dst. Evaluation stops at the first failure.
	 */
	if ((DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)src, (uint32_t)(&DNU->CH0), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)psum, (uint32_t)(&DNU->CH1), psumXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
			(uint32_t)(&DNU->CH2), (uint32_t)dst, outXferLen) != DDL_OK))
	{
		return DDL_ERROR;
	}

	/* Start both input channels, then the channel collecting the output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Set up and launch a convolution with partial sum (psum) on INT16 data
  *         (only a 1*1 filter is supported at present).
  *         The function programs DNU->CTRL, DNU->IN, DNU->OUT and DNU->COEFMUL, configures
  *         DMA channel 0 (src) and DMA channel 1 (psum) as inputs and DMA channel 2 as output,
  *         then starts the three channels. This function itself does not wait for completion.
  * @note   The operation is documented as supporting stride step 1 only; the code nevertheless
  *         programs the stride-two settings when convParam->strideStep requests them.
  * @param  inputDims   input feature map dimensions (fields w and h are read).
  * @param  src         input feature map.
  * @param  psumDims    psum dimensions (fields w and h are read).
  * @param  psum        buffer holding the psum, documented as the same size as the input feature map.
  * @param  filterData  kernel value written to DNU->COEFMUL.
  * @param  outputDims  output feature map dimensions (fields w and h are read).
  * @param  dst         buffer receiving the output feature map.
  * @param  convParam   convolution parameters (strideStep, pos and paddingNUM are read).
  * @retval DDL_OK once all channels are configured and started,
  *         DDL_ERROR if DNU_Alloc_DMA_Channel() reports a failure.
  */
DDL_Status_T DDL_DNU_ConvPsum_INT16(DNU_Dims_T * inputDims, const int16_t * src, DNU_Dims_T * psumDims, const int16_t * psum, const int16_t filterData,
	                                   DNU_Dims_T * outputDims, int16_t * dst, DNU_ConvParams_T * convParam)
{
	uint32_t inXferLen = 0;
	uint32_t psumXferLen = 0;
	uint32_t outXferLen = 0;

	/* CTRL register: stride, accuracy, padding and command, in that order */
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		CLEAR_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}
	else
	{
		SET_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT16 << DNU_CTRL_ACCURACY_Pos);

	/* Only the padding field selected by convParam->pos is written; the other one is left as it is */
	if (convParam->pos != PADDING_POS_BEGIN)
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADEND_Mask, convParam->paddingNUM << DNU_CTRL_PADEND_Pos);
	}
	else
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADBGN_Mask, convParam->paddingNUM << DNU_CTRL_PADBGN_Pos);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_CONV_PSUM << DNU_CTRL_CMD_Pos);

	/* IN register: width, then height (one row less when the stride step is two) */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, inputDims->h << DNU_IN_HEIGHT_Pos);
	}
	else
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, (inputDims->h - 1) << DNU_IN_HEIGHT_Pos);
	}

	/* OUT register: width and height */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/* Kernel value */
	WRITE_REG(DNU->COEFMUL, filterData);

	/*
	 * Input and psum lengths: element count divided by 2 and rounded up (presumably two
	 * INT16 elements per 32-bit DMA word - confirm). One row is dropped when the stride step is two.
	 */
	if (convParam->strideStep != STRIDE_STEP_TWO)
	{
		inXferLen = inputDims->h * inputDims->w / 2;
		if ((inputDims->h * inputDims->w % 2) != 0)
		{
			inXferLen++;
		}

		psumXferLen = psumDims->h * psumDims->w / 2;
		if ((psumDims->h * psumDims->w % 2) != 0)
		{
			psumXferLen++;
		}
	}
	else
	{
		inXferLen = (inputDims->h - 1) * inputDims->w / 2;
		if (((inputDims->h - 1) * inputDims->w % 2) != 0)
		{
			inXferLen++;
		}

		psumXferLen = (psumDims->h - 1) * psumDims->w / 2;
		if (((psumDims->h - 1) * psumDims->w % 2) != 0)
		{
			psumXferLen++;
		}
	}

	/* Output length: same round-up over the complete output feature map */
	outXferLen = outputDims->h * outputDims->w / 2;
	if ((outputDims->h * outputDims->w % 2) != 0)
	{
		outXferLen++;
	}

	/*
	 * Channel 0 feeds DNU->CH0 from src, channel 1 feeds DNU->CH1 from psum and
	 * channel 2 drains DNU->CH2 into dst. Evaluation stops at the first failure.
	 */
	if ((DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)src, (uint32_t)(&DNU->CH0), inXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
			(uint32_t)psum, (uint32_t)(&DNU->CH1), psumXferLen) != DDL_OK)
		|| (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
			(uint32_t)(&DNU->CH2), (uint32_t)dst, outXferLen) != DDL_OK))
	{
		return DDL_ERROR;
	}

	/* Start both input channels, then the channel collecting the output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Set up and launch a convolution with partial sum (psum) on float data
  *         (only a 1*1 filter is supported at present).
  *         The function programs DNU->CTRL, DNU->IN, DNU->OUT and DNU->COEFMUL, configures
  *         DMA channel 0 (src) and DMA channel 1 (psum) as inputs and DMA channel 2 as output,
  *         then starts the three channels. This function itself does not wait for completion.
  * @note   The operation is documented as supporting stride step 1 only; the code nevertheless
  *         programs the stride-two settings when convParam->strideStep requests them.
  * @param  inputDims   input feature map dimensions (fields w and h are read).
  * @param  src         input feature map.
  * @param  psumDims    psum dimensions (fields w and h are read).
  * @param  psum        buffer holding the psum, documented as the same size as the input feature map.
  * @param  filterData  kernel value; its raw 32-bit representation is written to DNU->COEFMUL.
  * @param  outputDims  output feature map dimensions (fields w and h are read).
  * @param  dst         buffer receiving the output feature map.
  * @param  convParam   convolution parameters (strideStep, pos and paddingNUM are read).
  * @retval DDL_OK once all channels are configured and started,
  *         DDL_ERROR if DNU_Alloc_DMA_Channel() reports a failure.
  */
DDL_Status_T DDL_DNU_ConvPsum_Float(DNU_Dims_T * inputDims, const float * src, DNU_Dims_T * psumDims, const float * psum, const float filterData,
	                                   DNU_Dims_T * outputDims, float * dst, DNU_ConvParams_T * convParam)
{
	uint32_t inLength = 0;
	uint32_t psumLength = 0;
	uint32_t coefBits = 0;

	/* CTRL register: stride, accuracy, padding and command, in that order */
	if (convParam->strideStep == STRIDE_STEP_TWO)
	{
		SET_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}
	else
	{
		CLEAR_BIT(DNU->CTRL, DNU_CTRL_STRIDE);
	}

	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_SINGLE << DNU_CTRL_ACCURACY_Pos);
	/* Only the padding field selected by convParam->pos is written; the other one is left as it is */
	if (convParam->pos == PADDING_POS_BEGIN)
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADBGN_Mask, convParam->paddingNUM << DNU_CTRL_PADBGN_Pos);
	}
	else
	{
		SET_BITMASK(DNU->CTRL, DNU_CTRL_PADEND_Mask, convParam->paddingNUM << DNU_CTRL_PADEND_Pos);
	}
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_CONV_PSUM << DNU_CTRL_CMD_Pos);

	/* IN register: width, then height (one row less when the stride step is two) */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	if (convParam->strideStep == STRIDE_STEP_TWO)
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, (inputDims->h - 1) << DNU_IN_HEIGHT_Pos);
	}
	else
	{
		SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, inputDims->h << DNU_IN_HEIGHT_Pos);
	}
	/* OUT register: width and height */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/*
	 * Kernel value: copy the float's object representation into an integer with memcpy().
	 * Reading the float through a uint32_t pointer would violate the strict aliasing
	 * rules (undefined behavior) and would also cast away the const qualifier.
	 */
	memcpy(&coefBits, &filterData, sizeof(coefBits));
	DNU->COEFMUL = coefBits;

	/* Input length in elements; the row count follows the height written to DNU->IN */
	if (convParam->strideStep == STRIDE_STEP_TWO)
	{
		inLength = (inputDims->h - 1) * inputDims->w;
	}
	else
	{
		inLength = inputDims->h * inputDims->w;
	}
	/* Channel 0 feeds DNU->CH0 from src */
	if (DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD, (uint32_t)src, (uint32_t)(&DNU->CH0), inLength) != DDL_OK)
	{
		return DDL_ERROR;
	}

	/* Psum length in elements, with the same stride-dependent row count */
	if (convParam->strideStep == STRIDE_STEP_TWO)
	{
		psumLength = (psumDims->h - 1) * psumDims->w;
	}
	else
	{
		psumLength = psumDims->h * psumDims->w;
	}
	/* Channel 1 feeds DNU->CH1 from psum */
	if (DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD, (uint32_t)psum, (uint32_t)(&DNU->CH1), psumLength) != DDL_OK)
	{
		return DDL_ERROR;
	}
	/* Channel 2 drains DNU->CH2 into dst, one element per output position */
	if (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD, (uint32_t)(&DNU->CH2), (uint32_t)dst, outputDims->h * outputDims->w) != DDL_OK)
	{
		return DDL_ERROR;
	}

	/* Start both input channels */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);

	/* Start the channel collecting the output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Implement the full connect operation, accuracy is INT8.
  * @param  pInput     point to the input vector.
  * @param  dimInput   the dimension of input.
  * @param  pOut       store the output data.
  * @param  dimOutput  the dimension of output.
  * @param  pM         point to the weight matrix, dimInput ROW dimOutput Column
  * @retval DDL status
  */
DDL_Status_T DDL_DNU_FullConnect_INT8(const int8_t * pInput, uint32_t dimInput, int8_t * pOut, uint32_t dimOutput, const int8_t * pM)
{
	uint32_t inLength = 0;

	/* Config full connect parameters for DNU */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT8 << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_FC << DNU_CTRL_CMD_Pos);

	/* Config input channels */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, dimInput << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, 0x1 << DNU_IN_HEIGHT_Pos);
	/* Config output channels */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, dimOutput << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, 0x1 << DNU_OUT_HEIGHT_Pos);

	/* Config input channel */
	inLength = (dimInput % 4) ? (dimInput / 4 + 1) : (dimInput / 4);
	if (DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD, (uint32_t)pInput, (uint32_t)(&DNU->CH0), inLength) != DDL_OK)
	{
		return DDL_ERROR;
	}
	DDL_DMA_EnableChannelCfgHold(DMA, DMA_CHANNEL_0);

	inLength = (dimInput * dimOutput % 4) ? (dimInput * dimOutput / 4 + 1) : (dimInput * dimOutput / 4);
	if (DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD, (uint32_t)pM, (uint32_t)(&DNU->CH1), inLength) != DDL_OK)
	{
		return DDL_ERROR;
	}

	/* Config output channel */
	if (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_BYTE, (uint32_t)(&DNU->CH2), (uint32_t)pOut, dimOutput) != DDL_OK)
	{
		return DDL_ERROR;
	}

	/* transmit data */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);

	/* Get output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Implement the full connect operation, accuracy is INT16.
  * @param  pInput     point to the input vector.
  * @param  dimInput   the dimension of input.
  * @param  pOut       store the output data.
  * @param  dimOutput  the dimension of input.
  * @param  pM         point to the weight matrix, dimInput ROW dimOutput Column
  * @retval DDL status
  */
DDL_Status_T DDL_DNU_FullConnect_INT16(const int16_t * pInput, uint32_t dimInput, int16_t * pOut, uint32_t dimOutput, const int16_t * pM)
{
	uint32_t inLength = 0;

	/* Config full connect parameters for DNU */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT16 << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_FC << DNU_CTRL_CMD_Pos);

	/* Config input channels */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, dimInput << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, 0x1 << DNU_IN_HEIGHT_Pos);
	/* Config output channels */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, dimOutput << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, 0x1 << DNU_OUT_HEIGHT_Pos);

	/* Config input channel */
	inLength = (dimInput % 2) ? (dimInput / 2 + 1) : (dimInput / 2);
	if (DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD, (uint32_t)pInput, (uint32_t)(&DNU->CH0), inLength) != DDL_OK)
	{
		return DDL_ERROR;
	}
	DDL_DMA_EnableChannelCfgHold(DMA, DMA_CHANNEL_0);

	inLength = (dimInput * dimOutput % 2) ? (dimInput * dimOutput / 2 + 1) : (dimInput * dimOutput / 2);
	if (DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD, (uint32_t)pM, (uint32_t)(&DNU->CH1), inLength) != DDL_OK)
	{
		return DDL_ERROR;
	}

	/* Config output channel */
	if (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_HALFWORD, (uint32_t)(&DNU->CH2), (uint32_t)pOut, dimOutput) != DDL_OK)
	{
		return DDL_ERROR;
	}

	/* transmit data */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);

	/* Get output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Implement the full connect operation, accuracy is flaot.
  * @param  pInput     point to the input vector.
  * @param  dimInput   the dimension of input.
  * @param  pOut       store the output data.
  * @param  dimOutput  the dimension of input.
  * @param  pM         point to the weight matrix, dimInput ROW dimOutput Column
  * @retval DDL status
  */
DDL_Status_T DDL_DNU_FullConnect_Float(const float * pInput, uint32_t dimInput, float * pOut, uint32_t dimOutput, const float * pM)
{
	/* Config full connect parameters for DNU */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_SINGLE << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_FC << DNU_CTRL_CMD_Pos);

	/* Config input channels */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, dimInput << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, 0x1 << DNU_IN_HEIGHT_Pos);
	/* Config output channels */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, dimOutput << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, 0x1 << DNU_OUT_HEIGHT_Pos);

	/* Config input channel */
	if (DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD, (uint32_t)pInput, (uint32_t)(&DNU->CH0), dimInput) != DDL_OK)
	{
		return DDL_ERROR;
	}
	DDL_DMA_EnableChannelCfgHold(DMA, DMA_CHANNEL_0);

	if (DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD, (uint32_t)pM, (uint32_t)(&DNU->CH1), dimInput * dimOutput) != DDL_OK)
	{
		return DDL_ERROR;
	}
	/* Config output channel */
	if (DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD, (uint32_t)(&DNU->CH2), (uint32_t)pOut, dimOutput) != DDL_OK)
	{
		return DDL_ERROR;
	}

	/* transmit data */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);

	/* Get output feature map */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Run the DNU add operator on two INT8 matrices of identical dimension.
  * @param  pM0        first operand matrix.
  * @param  pM1        second operand matrix.
  * @param  matrixDim  width and height used for both operands and the result.
  * @param  pOut       buffer receiving the result matrix.
  * @note   All three DMA transfers are sized in 32-bit words (four INT8 values
  *         per word, rounded up).
  *         NOTE(review): when w * h is not a multiple of 4 the last word
  *         presumably covers bytes past the logical end of the buffers -
  *         confirm callers pad them.
  * @retval DDL_OK after the DMA channels have been started, DDL_ERROR if
  *         DNU_Alloc_DMA_Channel() fails for any channel.
  */
DDL_Status_T DDL_DNU_Add_INT8(const int8_t * pM0, const int8_t * pM1, DNU_Dims_T * matrixDim, int8_t * pOut)
{
	uint32_t elemCount = 0;
	uint32_t wordCount = 0;

	/* Select INT8 precision and the ADD command */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT8 << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_ADD << DNU_CTRL_CMD_Pos);

	/* Input geometry */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, matrixDim->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, matrixDim->h << DNU_IN_HEIGHT_Pos);
	/* Output geometry (same as the input) */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, matrixDim->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, matrixDim->h << DNU_OUT_HEIGHT_Pos);

	/* Four INT8 elements per DMA word, rounded up */
	elemCount = matrixDim->h * matrixDim->w;
	wordCount = elemCount / 4;
	if ((elemCount % 4) != 0)
	{
		wordCount += 1;
	}

	/* Operand 0 -> CH0 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM0, (uint32_t)(&DNU->CH0), wordCount))
	{
		return DDL_ERROR;
	}
	/* Operand 1 -> CH1 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM1, (uint32_t)(&DNU->CH1), wordCount))
	{
		return DDL_ERROR;
	}
	/* CH2 -> result buffer */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
	                                    (uint32_t)(&DNU->CH2), (uint32_t)pOut, wordCount))
	{
		return DDL_ERROR;
	}

	/* Start the two input channels, then the output channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Run the DNU add operator on two INT16 matrices of identical dimension.
  * @param  pM0        first operand matrix.
  * @param  pM1        second operand matrix.
  * @param  matrixDim  width and height used for both operands and the result.
  * @param  pOut       buffer receiving the result matrix.
  * @note   All three DMA transfers are sized in 32-bit words (two INT16 values
  *         per word, rounded up).
  *         NOTE(review): when w * h is odd the last word presumably covers a
  *         halfword past the logical end of the buffers - confirm callers pad them.
  * @retval DDL_OK after the DMA channels have been started, DDL_ERROR if
  *         DNU_Alloc_DMA_Channel() fails for any channel.
  */
DDL_Status_T DDL_DNU_Add_INT16(const int16_t * pM0, const int16_t * pM1, DNU_Dims_T * matrixDim, int16_t * pOut)
{
	uint32_t elemCount = 0;
	uint32_t wordCount = 0;

	/* Select INT16 precision and the ADD command */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT16 << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_ADD << DNU_CTRL_CMD_Pos);

	/* Input geometry */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, matrixDim->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, matrixDim->h << DNU_IN_HEIGHT_Pos);
	/* Output geometry (same as the input) */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, matrixDim->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, matrixDim->h << DNU_OUT_HEIGHT_Pos);

	/* Two INT16 elements per DMA word, rounded up */
	elemCount = matrixDim->h * matrixDim->w;
	wordCount = elemCount / 2;
	if ((elemCount % 2) != 0)
	{
		wordCount += 1;
	}

	/* Operand 0 -> CH0 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM0, (uint32_t)(&DNU->CH0), wordCount))
	{
		return DDL_ERROR;
	}
	/* Operand 1 -> CH1 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM1, (uint32_t)(&DNU->CH1), wordCount))
	{
		return DDL_ERROR;
	}
	/* CH2 -> result buffer */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
	                                    (uint32_t)(&DNU->CH2), (uint32_t)pOut, wordCount))
	{
		return DDL_ERROR;
	}

	/* Start the two input channels, then the output channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Run the DNU add operator on two single-precision float matrices of
  *         identical dimension.
  * @param  pM0        first operand matrix.
  * @param  pM1        second operand matrix.
  * @param  matrixDim  width and height used for both operands and the result.
  * @param  pOut       buffer receiving the result matrix.
  * @note   Every element is transferred as one 32-bit word.
  * @retval DDL_OK after the DMA channels have been started, DDL_ERROR if
  *         DNU_Alloc_DMA_Channel() fails for any channel.
  */
DDL_Status_T DDL_DNU_Add_Float(const float * pM0, const float * pM1, DNU_Dims_T * matrixDim, float * pOut)
{
	uint32_t elemCount = 0;

	/* Select single precision and the ADD command */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_SINGLE << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_ADD << DNU_CTRL_CMD_Pos);

	/* Input geometry */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, matrixDim->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, matrixDim->h << DNU_IN_HEIGHT_Pos);
	/* Output geometry (same as the input) */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, matrixDim->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, matrixDim->h << DNU_OUT_HEIGHT_Pos);

	/* One DMA word per float element */
	elemCount = matrixDim->h * matrixDim->w;

	/* Operand 0 -> CH0 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM0, (uint32_t)(&DNU->CH0), elemCount))
	{
		return DDL_ERROR;
	}
	/* Operand 1 -> CH1 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM1, (uint32_t)(&DNU->CH1), elemCount))
	{
		return DDL_ERROR;
	}
	/* CH2 -> result buffer */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
	                                    (uint32_t)(&DNU->CH2), (uint32_t)pOut, elemCount))
	{
		return DDL_ERROR;
	}

	/* Start the two input channels, then the output channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Run the relu / batch normal operator on INT8 data.
  * @param  inputDims    width and height of the input feature map.
  * @param  data         input feature map.
  * @param  outputDims   width and height of the output feature map.
  * @param  out          buffer receiving the output feature map.
  * @param  normalParam  supplies gamma (written to COEFMUL) and beta (written to COEFADD).
  * @note   Both DMA transfers are sized in 32-bit words (four INT8 values per
  *         word, rounded up).
  *         NOTE(review): when w * h is not a multiple of 4 the last word
  *         presumably covers bytes past the logical end of the buffers -
  *         confirm callers pad them.
  * @retval DDL_OK after the DMA channels have been started, DDL_ERROR if
  *         DNU_Alloc_DMA_Channel() fails for either channel.
  */
DDL_Status_T DDL_DNU_ReluBN_INT8(DNU_Dims_T * inputDims, const int8_t * data, DNU_Dims_T * outputDims,
                                 int8_t * out, DNU_BatchNormalParams_T * normalParam)
{
	uint32_t elemCount = 0;
	uint32_t inWords = 0;
	uint32_t outWords = 0;

	/* Select INT8 precision and the RELU_BN command */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT8 << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_RELU_BN << DNU_CTRL_CMD_Pos);

	/* Input geometry */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, inputDims->h << DNU_IN_HEIGHT_Pos);
	/* Output geometry */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/* Scale (gamma) and offset (beta) coefficients */
	WRITE_REG(DNU->COEFMUL, normalParam->gamma.gamma_int8);
	WRITE_REG(DNU->COEFADD, normalParam->beta.beta_int8);

	/* Input data -> CH0, four elements per word, rounded up */
	elemCount = inputDims->h * inputDims->w;
	inWords = elemCount / 4;
	if ((elemCount % 4) != 0)
	{
		inWords += 1;
	}
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)data, (uint32_t)(&DNU->CH0), inWords))
	{
		return DDL_ERROR;
	}

	/* CH2 -> output buffer, four elements per word, rounded up */
	elemCount = outputDims->h * outputDims->w;
	outWords = elemCount / 4;
	if ((elemCount % 4) != 0)
	{
		outWords += 1;
	}
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
	                                    (uint32_t)(&DNU->CH2), (uint32_t)out, outWords))
	{
		return DDL_ERROR;
	}

	/* Start the input channel, then the output channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Run the relu / batch normal operator on INT16 data.
  * @param  inputDims    width and height of the input feature map.
  * @param  data         input feature map.
  * @param  outputDims   width and height of the output feature map.
  * @param  out          buffer receiving the output feature map.
  * @param  normalParam  supplies gamma (written to COEFMUL) and beta (written to COEFADD).
  * @note   Both DMA transfers are sized in 32-bit words (two INT16 values per
  *         word, rounded up).
  *         NOTE(review): when w * h is odd the last word presumably covers a
  *         halfword past the logical end of the buffers - confirm callers pad them.
  * @retval DDL_OK after the DMA channels have been started, DDL_ERROR if
  *         DNU_Alloc_DMA_Channel() fails for either channel.
  */
DDL_Status_T DDL_DNU_ReluBN_INT16(DNU_Dims_T * inputDims, const int16_t * data, DNU_Dims_T * outputDims,
                                  int16_t * out, DNU_BatchNormalParams_T * normalParam)
{
	uint32_t elemCount = 0;
	uint32_t inWords = 0;
	uint32_t outWords = 0;

	/* Select INT16 precision and the RELU_BN command */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT16 << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_RELU_BN << DNU_CTRL_CMD_Pos);

	/* Input geometry */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, inputDims->h << DNU_IN_HEIGHT_Pos);
	/* Output geometry */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/* Scale (gamma) and offset (beta) coefficients */
	WRITE_REG(DNU->COEFMUL, normalParam->gamma.gamma_int16);
	WRITE_REG(DNU->COEFADD, normalParam->beta.beta_int16);

	/* Input data -> CH0, two elements per word, rounded up */
	elemCount = inputDims->h * inputDims->w;
	inWords = elemCount / 2;
	if ((elemCount % 2) != 0)
	{
		inWords += 1;
	}
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)data, (uint32_t)(&DNU->CH0), inWords))
	{
		return DDL_ERROR;
	}

	/* CH2 -> output buffer, two elements per word, rounded up */
	elemCount = outputDims->h * outputDims->w;
	outWords = elemCount / 2;
	if ((elemCount % 2) != 0)
	{
		outWords += 1;
	}
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
	                                    (uint32_t)(&DNU->CH2), (uint32_t)out, outWords))
	{
		return DDL_ERROR;
	}

	/* Start the input channel, then the output channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Run the relu / batch normal operator on single-precision float data.
  * @param  inputDims    width and height of the input feature map.
  * @param  data         input feature map.
  * @param  outputDims   width and height of the output feature map.
  * @param  out          buffer receiving the output feature map.
  * @param  normalParam  supplies gamma (written to COEFMUL) and beta (written to COEFADD).
  * @note   Every element is transferred as one 32-bit word.
  * @retval DDL_OK after the DMA channels have been started, DDL_ERROR if
  *         DNU_Alloc_DMA_Channel() fails for either channel.
  */
DDL_Status_T DDL_DNU_ReluBN_Float(DNU_Dims_T * inputDims, const float * data, DNU_Dims_T * outputDims,
                                  float * out, DNU_BatchNormalParams_T * normalParam)
{
	uint32_t inWords = 0;
	uint32_t outWords = 0;

	/* Select single precision and the RELU_BN command */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_SINGLE << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_RELU_BN << DNU_CTRL_CMD_Pos);

	/* Input geometry */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, inputDims->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, inputDims->h << DNU_IN_HEIGHT_Pos);
	/* Output geometry */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, outputDims->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, outputDims->h << DNU_OUT_HEIGHT_Pos);

	/* Scale (gamma) and offset (beta): the raw 32 bits stored in the
	 * coefficient members are copied into the registers.
	 * NOTE(review): this reads the members through a uint32_t pointer cast;
	 * presumably the members are 32-bit floats and the build tolerates the
	 * aliasing - confirm. */
	DNU->COEFMUL = *(volatile uint32_t *)(&normalParam->gamma.gamma_float);
	DNU->COEFADD = *(volatile uint32_t *)(&normalParam->beta.beta_float);

	/* Input data -> CH0, one word per element */
	inWords = inputDims->h * inputDims->w;
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)data, (uint32_t)(&DNU->CH0), inWords))
	{
		return DDL_ERROR;
	}
	/* CH2 -> output buffer, one word per element */
	outWords = outputDims->h * outputDims->w;
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
	                                    (uint32_t)(&DNU->CH2), (uint32_t)out, outWords))
	{
		return DDL_ERROR;
	}

	/* Start the input channel, then the output channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Run the DNU multiply operator on two INT8 matrices of identical dimension.
  * @param  pM0        first operand matrix.
  * @param  pM1        second operand matrix.
  * @param  matrixDim  width and height used for both operands and the result.
  * @param  pOut       buffer receiving the result matrix.
  * @note   All three DMA transfers are sized in 32-bit words (four INT8 values
  *         per word, rounded up).
  *         NOTE(review): when w * h is not a multiple of 4 the last word
  *         presumably covers bytes past the logical end of the buffers -
  *         confirm callers pad them.
  * @retval DDL_OK after the DMA channels have been started, DDL_ERROR if
  *         DNU_Alloc_DMA_Channel() fails for any channel.
  */
DDL_Status_T DDL_DNU_Mul_INT8(const int8_t * pM0, const int8_t * pM1, DNU_Dims_T * matrixDim, int8_t * pOut)
{
	uint32_t elemCount = 0;
	uint32_t wordCount = 0;

	/* Select INT8 precision and the MUL command */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT8 << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_MUL << DNU_CTRL_CMD_Pos);

	/* Input geometry */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, matrixDim->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, matrixDim->h << DNU_IN_HEIGHT_Pos);
	/* Output geometry (same as the input) */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, matrixDim->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, matrixDim->h << DNU_OUT_HEIGHT_Pos);

	/* Four INT8 elements per DMA word, rounded up */
	elemCount = matrixDim->h * matrixDim->w;
	wordCount = elemCount / 4;
	if ((elemCount % 4) != 0)
	{
		wordCount += 1;
	}

	/* Operand 0 -> CH0 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM0, (uint32_t)(&DNU->CH0), wordCount))
	{
		return DDL_ERROR;
	}
	/* Operand 1 -> CH1 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM1, (uint32_t)(&DNU->CH1), wordCount))
	{
		return DDL_ERROR;
	}
	/* CH2 -> result buffer */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
	                                    (uint32_t)(&DNU->CH2), (uint32_t)pOut, wordCount))
	{
		return DDL_ERROR;
	}

	/* Start the two input channels, then the output channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Run the DNU multiply operator on two INT16 matrices of identical dimension.
  * @param  pM0        first operand matrix.
  * @param  pM1        second operand matrix.
  * @param  matrixDim  width and height used for both operands and the result.
  * @param  pOut       buffer receiving the result matrix.
  * @note   All three DMA transfers are sized in 32-bit words (two INT16 values
  *         per word, rounded up).
  *         NOTE(review): when w * h is odd the last word presumably covers a
  *         halfword past the logical end of the buffers - confirm callers pad them.
  * @retval DDL_OK after the DMA channels have been started, DDL_ERROR if
  *         DNU_Alloc_DMA_Channel() fails for any channel.
  */
DDL_Status_T DDL_DNU_Mul_INT16(const int16_t * pM0, const int16_t * pM1, DNU_Dims_T * matrixDim, int16_t * pOut)
{
	uint32_t elemCount = 0;
	uint32_t wordCount = 0;

	/* Select INT16 precision and the MUL command */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_INT16 << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_MUL << DNU_CTRL_CMD_Pos);

	/* Input geometry */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, matrixDim->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, matrixDim->h << DNU_IN_HEIGHT_Pos);
	/* Output geometry (same as the input) */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, matrixDim->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, matrixDim->h << DNU_OUT_HEIGHT_Pos);

	/* Two INT16 elements per DMA word, rounded up */
	elemCount = matrixDim->h * matrixDim->w;
	wordCount = elemCount / 2;
	if ((elemCount % 2) != 0)
	{
		wordCount += 1;
	}

	/* Operand 0 -> CH0 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM0, (uint32_t)(&DNU->CH0), wordCount))
	{
		return DDL_ERROR;
	}
	/* Operand 1 -> CH1 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM1, (uint32_t)(&DNU->CH1), wordCount))
	{
		return DDL_ERROR;
	}
	/* CH2 -> result buffer */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
	                                    (uint32_t)(&DNU->CH2), (uint32_t)pOut, wordCount))
	{
		return DDL_ERROR;
	}

	/* Start the two input channels, then the output channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Run the DNU multiply operator on two single-precision float
  *         matrices of identical dimension.
  * @param  pM0        first operand matrix.
  * @param  pM1        second operand matrix.
  * @param  matrixDim  width and height used for both operands and the result.
  * @param  pOut       buffer receiving the result matrix.
  * @note   Every element is transferred as one 32-bit word.
  * @retval DDL_OK after the DMA channels have been started, DDL_ERROR if
  *         DNU_Alloc_DMA_Channel() fails for any channel.
  */
DDL_Status_T DDL_DNU_Mul_Float(const float * pM0, const float * pM1, DNU_Dims_T * matrixDim, float * pOut)
{
	uint32_t elemCount = 0;

	/* Select single precision and the MUL command */
	SET_BITMASK(DNU->CTRL, DNU_CTRL_ACCURACY_Mask, ACCURACY_SINGLE << DNU_CTRL_ACCURACY_Pos);
	SET_BITMASK(DNU->CTRL, DNU_CTRL_CMD_Mask, OPERATION_MUL << DNU_CTRL_CMD_Pos);

	/* Input geometry */
	SET_BITMASK(DNU->IN, DNU_IN_WIDTH_Mask, matrixDim->w << DNU_IN_WIDTH_Pos);
	SET_BITMASK(DNU->IN, DNU_IN_HEIGHT_Mask, matrixDim->h << DNU_IN_HEIGHT_Pos);
	/* Output geometry (same as the input) */
	SET_BITMASK(DNU->OUT, DNU_OUT_WIDTH_Mask, matrixDim->w << DNU_OUT_WIDTH_Pos);
	SET_BITMASK(DNU->OUT, DNU_OUT_HEIGHT_Mask, matrixDim->h << DNU_OUT_HEIGHT_Pos);

	/* One DMA word per float element */
	elemCount = matrixDim->h * matrixDim->w;

	/* Operand 0 -> CH0 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct0, DMA_CHANNEL_0, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM0, (uint32_t)(&DNU->CH0), elemCount))
	{
		return DDL_ERROR;
	}
	/* Operand 1 -> CH1 */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct1, DMA_CHANNEL_1, DMA_SRC_INC_DST_KEEP, DMA_DATAALIGN_WORD,
	                                    (uint32_t)pM1, (uint32_t)(&DNU->CH1), elemCount))
	{
		return DDL_ERROR;
	}
	/* CH2 -> result buffer */
	if (DDL_OK != DNU_Alloc_DMA_Channel(&pDMAStruct2, DMA_CHANNEL_2, DMA_SRC_KEEP_DST_INC, DMA_DATAALIGN_WORD,
	                                    (uint32_t)(&DNU->CH2), (uint32_t)pOut, elemCount))
	{
		return DDL_ERROR;
	}

	/* Start the two input channels, then the output channel */
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_0, DMA_DEVICE_7, &pDMAStruct0);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_1, DMA_DEVICE_7, &pDMAStruct1);
	DDL_DMA_ChannelStart(DMA, DMA_CHANNEL_2, DMA_DEVICE_7, &pDMAStruct2);

	return DDL_OK;
}


/**
  * @brief  Free DMA channels: disable the configuration hold on DMA channel 0,
  *         then stop DMA channels 0, 1 and 2.
  * @note   Channels 0 and 1 are the ones the operators in this file use to
  *         feed DNU CH0/CH1, and channel 2 is the one used to read DNU CH2.
  * @param  None.
  * @retval None.
  */
void DDL_DNU_FreeDMAChannels(void)
{
	/* The full connect operators enable config hold on channel 0; undo it here */
	DDL_DMA_DisableChannelCfgHold(DMA, DMA_CHANNEL_0);
	/* Stop both input channels and the output channel */
	DDL_DMA_ChannelStop(DMA, DMA_CHANNEL_0);
	DDL_DMA_ChannelStop(DMA, DMA_CHANNEL_1);
	DDL_DMA_ChannelStop(DMA, DMA_CHANNEL_2);
}

#endif

