/*******************************************************************************
 *
 * MIT License
 *
 * Copyright (c) 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 *******************************************************************************/

#include <cstddef>
#include <miopen/conv/solvers.hpp>
#include <miopen/env.hpp>
#include <miopen/handle.hpp>
#include <miopen/generic_search.hpp>
#include <miopen/conv/wrw_invoke_params.hpp>
#include <miopen/solver/implicitgemm_util.hpp>
#include <miopen/gcn_asm_utils.hpp>
#include <miopen/tensor_ops.hpp>
#include <miopen/conv/asm_implicit_gemm.hpp>
#include <miopen/batched_transpose_sol.hpp>
#include <miopen/buffer_info.hpp>

// Boolean environment-variable declarations for this translation unit.
// NOTE(review): the declaring macro is not defined in this file (presumably it
// comes from <miopen/env.hpp>), and the code that reads these variables is not
// visible in this section. Judging by the names, the first toggles this
// ASM WRW GTC XDLOPS NHWC solver and the second controls packed atomic-add
// for fp16 -- confirm against the use sites.
MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC)
MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16)

// Limit related to GEMM-K splitting for the WRW kernels.
// NOTE(review): the use site is not visible in this section; it is unclear
// whether 10 is a split count or a log2 exponent -- confirm before changing.
#define WRW_MAX_GEMM_K_SPLITS 10

namespace miopen {
namespace solver {
namespace conv {

// Local shorthand: lets code in this namespace name
// miopen::conv::ProblemDescription without qualification.
using ProblemDescription = miopen::conv::ProblemDescription;

/// Maps a precision name to the element size reported by miopen::GetTypeSize.
///
/// \param s precision name; "fp32" selects miopenFloat and "fp16" selects
///          miopenHalf. Every other string (no validation is performed)
///          falls through to miopenBFloat16.
/// \return  the value of miopen::GetTypeSize for the selected data type.
static inline std::size_t GetTypeSize(const std::string& s)
{
    const bool is_fp32 = (s == "fp32");
    const bool is_fp16 = (s == "fp16");

    // Precedence mirrors the lookup order: fp32 first, then fp16, bf16 as the catch-all.
    return miopen::GetTypeSize(is_fp32 ? miopenFloat : (is_fp16 ? miopenHalf : miopenBFloat16));
}

static const inline std::vector<PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC>&
GetWrwXdlopsNHWCConfigList()
{
    // clang-format off
    static const  std::vector<PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC> kernel_param_list {
        {"wrw", "nhwc", miopenFloat,  0, 0, 256, 128,  16, 32, 32,  2, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256, 128,  16, 32, 32,  2, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256, 128,  16, 32, 32,  2, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256, 128,  16, 32, 32,  2, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128, 256,  16, 32, 32,  2, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128, 256,  16, 32, 32,  2, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128, 256,  16, 32, 32,  2, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128, 256,  16, 32, 32,  2, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128, 128,  16, 32, 32,  2, 1, 2, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128, 128,  16, 32, 32,  2, 1, 2, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128, 128,  16, 32, 32,  2, 1, 2, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128, 128,  16, 32, 32,  2, 1, 2, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256,  64,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64, 256,  16, 32, 32,  2, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128,  64,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128,  64,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128,  64,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128,  64,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64, 128,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 256,  32,  16, 32, 32,  2, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1,16}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32, 256,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32, 256,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32, 256,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32, 256,  16, 32, 32,  2, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1,16}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64,  64,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64,  64,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64,  64,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64,  64,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128,  32,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0, 128,  32,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128,  32,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1, 128,  32,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 8}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32, 128,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32, 128,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32, 128,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32, 128,  16, 32, 32,  2, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 8}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64,  32,  16, 16, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  64,  32,  16, 16, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64,  32,  16, 16, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  64,  32,  16, 16, 16,  4, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 16,  1, 16}, { 1, 1, 1, 2}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32,  64,  16, 16, 16,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32,  64,  16, 16, 16,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32,  64,  16, 16, 16,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32,  64,  16, 16, 16,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 1, 1, 2}, {  1, 16,  1, 16}, { 1, 1, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32,  32,  32, 16, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 32,  1,  8}, { 1, 1, 1, 4}, {  1, 32,  1,  8}},
        {"wrw", "nhwc", miopenFloat,  0, 0,  32,  32,  32, 16, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 32,  1,  8}, { 1, 1, 1, 4}, {  1, 32,  1,  8}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32,  32,  32, 16, 16,  4, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 1, 1, 4}, {  1, 32,  1,  8}, { 1, 1, 1, 4}, {  1, 32,  1,  8}},
        {"wrw", "nhwc", miopenFloat,  0, 1,  32,  32,  32, 16, 16,  4, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 1, 1, 4}, {  1, 32,  1,  8}, { 1, 1, 1, 4}, {  1, 32,  1,  8}},

        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  64,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  64,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  32, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  16, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 1,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenHalf,  0, 0,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},

        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 256,  32, 32, 32,  8, 2, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  32, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256, 128,  16, 32, 32,  8, 2, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  32, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 256,  16, 32, 32,  8, 1, 2, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128, 128,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  64,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  64,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  64,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  32, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 256,  16, 32, 32,  8, 1, 1, 2, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 4}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 128,  64,  32, 32, 32,  8, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1,  8,  1, 32}, { 1, 4, 1, 2}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  32, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 4}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64, 128,  16, 32, 32,  8, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 2}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0, 256,  32,  32, 64, 16,  4, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 8}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 0, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 0, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  32, 256,  32, 16, 64,  4, 1, 1, 1, 2, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  8,  1, 32}, { 1, 4, 1, 8}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  32, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 8, 1, 1}, {  1,  4,  1, 64}, { 1, 8, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  16, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 1}, {  1,  4,  1, 64}, { 1, 4, 1, 1}, {  1,  4,  1, 64}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 0, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  64,  64, 32, 32,  8, 1, 1, 1, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 4}, {  1, 16,  1, 16}, { 1, 4, 1, 4}, {  1, 16,  1, 16}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 1,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 0, 0, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
        {"wrw", "nhwc", miopenBFloat16,  0, 0,  64,  32,  32, 16, 16, 16, 1, 1, 2, 1, 0, 1, 1, 0, 0, { 1, 4, 1, 2}, {  1,  8,  1, 32}, { 1, 4, 1, 1}, {  1,  8,  1, 32}},
    };
    // clang-format on
    return kernel_param_list;
}

// clang-format off
/// Builds the fixed miopenFloat ("fp32") configuration that this file designates
/// as its largest tile. The positional arguments are forwarded unchanged to the
/// performance-config constructor (presumably a 256x128x16 macro-tile, judging by
/// the tile lists used in HeuristicInit - confirm against the constructor).
static inline PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
GetWrwXdlopsNHWCConfigLargestTileFp32()
{
    return PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC{
        "wrw", "nhwc", miopenFloat,
        0, 0, 256, 128, 16, 32, 32, 2, 2, 1, 2, 2, 0, 0, 0, 0, 0,
        {1, 1, 1, 16}, {1, 16, 1, 16}, {1, 1, 1, 8}, {1, 16, 1, 16}};
}

/// Builds the fixed miopenHalf ("fp16") configuration that this file designates
/// as its largest tile. The positional arguments are forwarded unchanged to the
/// performance-config constructor (presumably a 256x256x32 macro-tile, which is
/// the first entry of the fp16 tile list in HeuristicInit - confirm).
static inline PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
GetWrwXdlopsNHWCConfigLargestTileFp16()
{
    return PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC{
        "wrw", "nhwc", miopenHalf,
        0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 0, 0, 0, 0, 0,
        {1, 4, 1, 8}, {1, 8, 1, 32}, {1, 4, 1, 8}, {1, 8, 1, 32}};
}

/// Builds the fixed miopenBFloat16 ("bf16") configuration that this file
/// designates as its largest tile. Apart from the data type the arguments are the
/// same as the fp16 variant; they are forwarded unchanged to the
/// performance-config constructor.
static inline PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
GetWrwXdlopsNHWCConfigLargestTileBf16()
{
    return PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC{
        "wrw", "nhwc", miopenBFloat16,
        0, 1, 256, 256, 32, 32, 32, 8, 2, 2, 2, 2, 0, 0, 0, 0, 0,
        {1, 4, 1, 8}, {1, 8, 1, 32}, {1, 4, 1, 8}, {1, 8, 1, 32}};
}
// clang-format on

/// Computes the launch figures for running \p config on \p problem.
///
/// gemm_m is k / group. gemm_n is built from c / group rounded up to the vector
/// length tensor_b_thread_lengths[3], multiplied by the filter size (y * x) and
/// rounded up again to a whole number of gemm_n_per_block tiles.
///
/// \return (block_size, grid_size, occupancy), where grid_size is
///         group * ceil(gemm_m / gemm_m_per_block) * ceil(gemm_n / gemm_n_per_block)
///         and occupancy comes from config.ComputeKernelOccupancy().
static std::tuple<size_t, // block_size
                  size_t, // grid_size
                  size_t> // occupancy
GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(
    const ProblemDescription& problem,
    const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config)
{
    // Rounds `value` up to the nearest multiple of `multiple`.
    const auto round_up = [](auto value, auto multiple) {
        return (value + multiple - 1) / multiple * multiple;
    };

    const int k        = problem.GetInChannels();
    const int c        = problem.GetOutChannels();
    const int filter_h = problem.GetWeightsHeight();
    const int filter_w = problem.GetWeightsWidth();
    const auto group   = problem.GetGroupCount();

    // c need to be carefully padded
    const auto c_per_group_padded = round_up(c / group, config.tensor_b_thread_lengths[3]);
    const auto gemm_n =
        round_up(c_per_group_padded * filter_h * filter_w, config.gemm_n_per_block);
    const auto gemm_m = k / group;

    const size_t block_size = config.BlockSize();

    const auto m_blocks    = integer_divide_ceil(gemm_m, config.gemm_m_per_block);
    const auto n_blocks    = integer_divide_ceil(gemm_n, config.gemm_n_per_block);
    const size_t grid_size = static_cast<size_t>(group) * m_blocks * n_blocks;

    const size_t occupancy = config.ComputeKernelOccupancy();

    return std::make_tuple(block_size, grid_size, occupancy);
}

/// Estimates how many instances of this kernel fit on one unit of hardware by
/// dividing three fixed budgets (LDS bytes, accumulator registers, VGPRs) by the
/// usage this configuration is estimated to need, and taking the smallest
/// quotient.
///
/// \return min(lds_budget / lds_usage, acc_budget / acc_usage, vgpr_budget / vgpr_usage).
size_t PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::ComputeKernelOccupancy() const
{
    // Budgets each usage figure is divided into.
    constexpr size_t lds_budget  = 64 * 1024;
    constexpr size_t vgpr_budget = 256;
    constexpr size_t acc_budget  = 256;

    // Accumulator usage: the macro-tile area spread over the block's threads.
    const size_t acc_usage = gemm_m_per_block * gemm_n_per_block / BlockSize();

    const auto type_size = GetTypeSize(precision);
    const bool is_16bit  = type_size == 2;

    // LDS footprint of the A and B tiles; the larger of the two is counted twice.
    const size_t lds_a      = type_size * gemm_m_per_block * gemm_k_per_block;
    const size_t lds_b      = type_size * gemm_n_per_block * gemm_k_per_block;
    const size_t lds_single = std::max(lds_a, lds_b) * 2;

    // Fixed VGPR overhead; kernels with nxe != 0 are charged a few more.
    const size_t aux_vgpr_usage = (nxe == 0) ? 36 : 42;

    // 2-byte element types with a vector length above 1 are counted two per VGPR.
    const size_t a_elements_per_vgpr = (is_16bit && tensor_a_thread_lengths[3] > 1) ? 2 : 1;
    const size_t b_elements_per_vgpr = (is_16bit && tensor_b_thread_lengths[3] > 1) ? 2 : 1;

    // NOTE(review): only "fp16" gets a divisor of 2 here; "bf16" (also a 2-byte
    // type according to the checks above) is treated like fp32 - confirm this is intended.
    const size_t sz_per_element = precision == "fp16" ? 2 : 1;

    const size_t a_thread_elems =
        static_cast<size_t>(tensor_a_thread_lengths[1]) * tensor_a_thread_lengths[3];
    const size_t b_thread_elems =
        static_cast<size_t>(tensor_b_thread_lengths[1]) * tensor_b_thread_lengths[3];

    const size_t vgpr_usage = a_thread_elems / a_elements_per_vgpr +
                              b_thread_elems / b_elements_per_vgpr +
                              a_thread_elems / sz_per_element +
                              b_thread_elems / sz_per_element + aux_vgpr_usage;

    size_t lds_usage = lds_single;
    if(is_16bit)
    {
        // The single-buffer figure is kept either when it already takes at least
        // half of the LDS budget, or when it lies in (1/8, 1/4] of the budget while
        // accumulator and VGPR usage are both low.
        const bool keep_single_buffer =
            lds_single >= lds_budget / 2 ||
            (lds_single <= lds_budget / 4 && lds_single > lds_budget / 8 &&
             acc_usage < acc_budget / 2 && vgpr_usage < vgpr_budget / 3);
        if(!keep_single_buffer)
        {
            // use lds double buffer
            lds_usage = lds_single * 2;
        }
    }

    MIOPEN_LOG_T("lds_usage=" << lds_usage << ", acc_usage=" << acc_usage
                              << ", vgpr_usage=" << vgpr_usage);

    return std::min(
        {lds_budget / lds_usage, acc_budget / acc_usage, vgpr_budget / vgpr_usage});
}

void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::SetParamsForKSplit(
    const ProblemDescription& problem, const size_t& occupancy)
{
    if(problem.IsFp16())
    {
        if(tensor_b_thread_lengths[3] == 1 ||
           env::disabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16))
            vector_store = 1;
    }
    else if(problem.IsBfp16() && tensor_b_thread_lengths[3] == 1)
    {
        vector_store = 1;
    }
    gemm_k_global_split = occupancy;
}

void PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::HeuristicInit(
    const ExecutionContext& ctx, const ProblemDescription& problem)
{
    static const std::vector<std::tuple<int, int, int>> tile_list_fp32 = {
        std::make_tuple(128, 128, 16),
        std::make_tuple(128, 64, 16),
        std::make_tuple(64, 128, 16),
        std::make_tuple(128, 32, 16),
        std::make_tuple(256, 64, 16),
        std::make_tuple(64, 256, 16),
        std::make_tuple(64, 64, 16),
        std::make_tuple(64, 32, 16),
        std::make_tuple(32, 64, 16),
        std::make_tuple(32, 32, 32),
    };

    static const std::vector<std::tuple<int, int, int>> tile_list_fp16 = {

        std::make_tuple(256, 256, 32), std::make_tuple(256, 128, 16), std::make_tuple(256, 128, 32),
        std::make_tuple(128, 256, 16), std::make_tuple(128, 256, 32), std::make_tuple(128, 128, 16),
        std::make_tuple(128, 128, 32), std::make_tuple(256, 64, 16),  std::make_tuple(256, 64, 32),
        std::make_tuple(64, 256, 16),  std::make_tuple(64, 256, 32),  std::make_tuple(128, 64, 32),
        std::make_tuple(64, 128, 16),  std::make_tuple(64, 128, 32),  std::make_tuple(64, 64, 64),
        std::make_tuple(64, 64, 32),   std::make_tuple(256, 32, 32),  std::make_tuple(32, 256, 32),
        std::make_tuple(64, 32, 32),   std::make_tuple(64, 64, 16),
    };

    static const std::vector<std::tuple<int, int, int>> tile_list_bfp16 = {

        std::make_tuple(256, 128, 16), std::make_tuple(256, 128, 32), std::make_tuple(128, 256, 16),
        std::make_tuple(128, 256, 32), std::make_tuple(128, 128, 16), std::make_tuple(128, 128, 32),
        std::make_tuple(256, 64, 16),  std::make_tuple(256, 64, 32),  std::make_tuple(64, 256, 16),
        std::make_tuple(64, 256, 32),  std::make_tuple(128, 64, 32),  std::make_tuple(64, 128, 16),
        std::make_tuple(64, 128, 32),  std::make_tuple(64, 64, 64),   std::make_tuple(64, 64, 32),
        std::make_tuple(256, 32, 32),  std::make_tuple(32, 256, 32),  std::make_tuple(64, 32, 32),
        std::make_tuple(64, 64, 16),
    };

#ifndef NDEBUG
    const auto& c_list = GetWrwXdlopsNHWCConfigList();
    for(const auto& tile : tile_list_fp16)
    {
        int mp, np, kp;
        std::tie(mp, np, kp) = tile;
        bool found           = false;
        for(const auto& config : c_list)
        {
            if(config.precision == "fp32" || config.precision == "bf16")
                continue;
            if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np &&
               config.gemm_k_per_block == kp)
            {
                found = true;
                break;
            }
        }
        if(!found)
        {
            MIOPEN_LOG_E("fp16 list can't find " << mp << "x" << np << "x" << kp);
            MIOPEN_THROW(miopenStatusInternalError);
        }
    }
    for(const auto& tile : tile_list_fp32)
    {
        int mp, np, kp;
        std::tie(mp, np, kp) = tile;
        bool found           = false;
        for(const auto& config : c_list)
        {
            if(config.precision == "fp16" || config.precision == "bf16")
                continue;
            if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np &&
               config.gemm_k_per_block == kp)
            {
                found = true;
                break;
            }
        }
        if(!found)
        {
            MIOPEN_LOG_E("fp32 list can't find " << mp << "x" << np << "x" << kp);
            MIOPEN_THROW(miopenStatusInternalError);
        }
    }
    for(const auto& tile : tile_list_bfp16)
    {
        int mp, np, kp;
        std::tie(mp, np, kp) = tile;
        bool found           = false;
        for(const auto& config : c_list)
        {
            if(config.precision == "fp16" || config.precision == "fp32")
                continue;
            if(config.gemm_m_per_block == mp && config.gemm_n_per_block == np &&
               config.gemm_k_per_block == kp)
            {
                found = true;
                break;
            }
        }
        if(!found)
        {
            MIOPEN_LOG_E("fp32 list can't find " << mp << "x" << np << "x" << kp);
            MIOPEN_THROW(miopenStatusInternalError);
        }
    }
#endif

    const int k           = problem.GetInChannels();
    const int c           = problem.GetOutChannels();
    const int y           = problem.GetWeightsHeight();
    const int x           = problem.GetWeightsWidth();
    const auto stride_h   = problem.GetKernelStrideH();
    const auto stride_w   = problem.GetKernelStrideW();
    const auto dilation_h = problem.GetWeightsHeight() > 1 ? problem.GetDilationH() : 1;
    const auto dilation_w = problem.GetWeightsWidth() > 1 ? problem.GetDilationW() : 1;
    const auto pad_h      = problem.GetPadH();
    const auto pad_w      = problem.GetPadW();
    const auto group      = problem.GetGroupCount();

    const auto num_cu             = ctx.GetStream().GetMaxComputeUnits();
    const auto non_split_gridsize = 600;

    auto gemm_n        = (c / group) * y * x;
    const auto& gemm_m = k / group;

    bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) &&
                     (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0);
    bool not_support_vector_store =
        (problem.IsFp16() || problem.IsBfp16()) && ((c / group) % 2 != 0);
    int m_per_block, n_per_block, k_per_block;

    std::tie(m_per_block, n_per_block, k_per_block) = HeuristicInitMacroTileNoPadGemmK(
        gemm_m,
        gemm_n,
        0,
        problem.IsFp32() ? tile_list_fp32 : (problem.IsFp16() ? tile_list_fp16 : tile_list_bfp16));

    auto find_with_gemm_k_pad = [&]() {
        // not found, let's try  gemm_k pad now.
        const auto& config_list = GetWrwXdlopsNHWCConfigList();
        size_t min_pad_pixel    = std::numeric_limits<std::size_t>::max();
        size_t selected_index   = 0;
        for(size_t i = 0; i < config_list.size(); i++)
        {
            const auto& config = config_list[i];
            if(!((problem.IsFp16() && config.precision == "fp16") ||
                 (problem.IsBfp16() && config.precision == "bf16") ||
                 (problem.IsFp32() && config.precision == "fp32")))
                continue;

            if(problem.IsFp16() || problem.IsBfp16())
            {
                if((c / group) % config.tensor_b_thread_lengths[3] != 0)
                {
                    continue;
                }
                if((k / group) % config.tensor_a_thread_lengths[3] != 0)
                {
                    continue;
                }
            }

            if(problem.IsFp32())
            {
                // c need to be carefully padded
                const auto c_vec_min = config.tensor_b_thread_lengths[3];
                const auto c_padded  = ((c / group) + c_vec_min - 1) / c_vec_min * c_vec_min;
                gemm_n               = (c_padded * y * x + config.gemm_n_per_block - 1) /
                         config.gemm_n_per_block * config.gemm_n_per_block;
            }

            size_t cur_pad_pixel =
                ComputeMatrixPadSize(gemm_m, config.gemm_m_per_block, 0, config.gemm_k_per_block) +
                ComputeMatrixPadSize(gemm_n, config.gemm_n_per_block, 0, config.gemm_k_per_block) +
                ComputeMatrixPadSize(
                    gemm_m, config.gemm_m_per_block, gemm_n, config.gemm_n_per_block);
            if(cur_pad_pixel < min_pad_pixel)
            {
                min_pad_pixel  = cur_pad_pixel;
                selected_index = i;
            }
        }

        size_t current_grid_size;
        size_t occupancy;
        std::tie(std::ignore, current_grid_size, occupancy) =
            GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, config_list[selected_index]);
        bool need_k_split = current_grid_size <= non_split_gridsize;
        size_t gks = ComputeGemmKGlobalSplitsWith2DMerge(current_grid_size, occupancy, num_cu);
        need_k_split |= gks != 0;

        CopyParameters(config_list[selected_index]);
        if(need_k_split)
        {
            SetParamsForKSplit(problem, occupancy);
        }
    };

    if((m_per_block == 0 && n_per_block == 0 && k_per_block == 0) || not_support_vector_store)
    {
        // not found, let's try gemm_k pad now.
        find_with_gemm_k_pad();
    }
    else
    {
        /// \todo Fix this
        /// clang-tidy: DIV/0 in GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel()
        if(n_per_block == 0)
            MIOPEN_THROW(miopenStatusInternalError);

        // found a suitable m/n/k, now let's prepare other parmater and initialize one
        const auto& config_list = GetWrwXdlopsNHWCConfigList();
        for(const auto& config : config_list)
        {
            if(!((problem.IsFp16() && config.precision == "fp16") ||
                 (problem.IsBfp16() && config.precision == "bf16") ||
                 (problem.IsFp32() && config.precision == "fp32")))
                continue;

            if(m_per_block == config.gemm_m_per_block && n_per_block == config.gemm_n_per_block &&
               k_per_block == config.gemm_k_per_block)
            {
                size_t current_grid_size;
                size_t occupancy;
                std::tie(std::ignore, current_grid_size, occupancy) =
                    GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, config);
                bool need_k_split = current_grid_size <= non_split_gridsize;
                size_t gks =
                    ComputeGemmKGlobalSplitsWith2DMerge(current_grid_size, occupancy, num_cu);
                need_k_split |= gks != 0;

                if((unit_conv && config.nxe == 0) || (!unit_conv && config.nxe != 0))
                {
                    if(!config.IsValid(problem)) // last check before assigning a heuristic value
                        continue;
                    CopyParameters(config);
                    if(need_k_split)
                    {
                        SetParamsForKSplit(problem, occupancy);
                    }
                    return;
                }
                else
                    continue;
            }
        }
        // last try
        find_with_gemm_k_pad();
    }
}

/// Advances this configuration to the next candidate of the generic search.
///
/// Only the spare set is enumerated; with use_spare_set == false the function
/// reports exhaustion immediately. Within the spare set:
///  - a default-constructed object is loaded with config_list[index];
///  - otherwise, when gemm_k_global_split is non-zero it is stepped with
///    NextLinear<1, WRW_MAX_GEMM_K_SPLITS>; a false result from NextLinear means
///    the current list entry is kept with the stepped split value;
///  - in the remaining cases index is incremented and the next list entry loaded.
///
/// \return true if a new candidate was produced, false when the search is over.
bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::SetNextValue(const ProblemDescription&)
{
    // always break generic search of main set (no spare), make sure we can use spare set
    if(!use_spare_set)
        return false;

    const auto& candidates = GetWrwXdlopsNHWCConfigList();

    if(!IsDefaultConstructed())
    {
        // Still more split values to try for the current list entry.
        if(gemm_k_global_split != 0 &&
           !NextLinear<1, WRW_MAX_GEMM_K_SPLITS>(gemm_k_global_split))
            return true;

        index++;
        if(index >= candidates.size())
            return false;
    }

    CopyParameters(candidates[index]);
    return true;
}

/// Tells whether this object holds a value the search is allowed to produce.
///
/// A default-constructed object counts as valid. Otherwise the object must
/// compare equal to some entry of GetWrwXdlopsNHWCConfigList(); the entry at
/// `index` is checked first as a shortcut before scanning the whole list.
///
/// \return true if default-constructed or equal to a list entry.
bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::IsValidValue() const
{
    if(IsDefaultConstructed())
        return true;
    const auto& config_list = GetWrwXdlopsNHWCConfigList();
    if(index < config_list.size() && *this == config_list[index])
        return true;
    // Compare by const reference: taking the candidate by value would copy every
    // config in the list just to compare it.
    return miopen::any_of(config_list,
                          [this](const auto& candidate) { return *this == candidate; });
}

bool PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC::IsValid(
    const ProblemDescription& problem) const
{
    if(IsDefaultConstructed())
        return false;

    if(!((problem.IsFp16() && precision == "fp16") || (problem.IsFp32() && precision == "fp32") ||
         (problem.IsBfp16() && precision == "bf16")))
    {
        return false;
    }

    if(env::disabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_PK_ATOMIC_ADD_FP16))
    {
        if(problem.IsFp16() && tensor_b_thread_lengths[3] != 1 && gemm_k_global_split != 0 &&
           vector_store != 1)
        {
            return false;
        }
    }

    const int k           = problem.GetInChannels();
    const int c           = problem.GetOutChannels();
    const int y           = problem.GetWeightsHeight();
    const int x           = problem.GetWeightsWidth();
    const auto stride_h   = problem.GetKernelStrideH();
    const auto stride_w   = problem.GetKernelStrideW();
    const auto dilation_h = problem.GetWeightsHeight() > 1 ? problem.GetDilationH() : 1;
    const auto dilation_w = problem.GetWeightsWidth() > 1 ? problem.GetDilationW() : 1;
    const auto pad_h      = problem.GetPadH();
    const auto pad_w      = problem.GetPadW();
    const auto precision =
        problem.IsFp16() ? miopenHalf : (problem.IsBfp16() ? miopenBFloat16 : miopenFloat);
    const auto group = problem.GetGroupCount();

    {
        size_t current_block_size, current_grid_size, current_splits_4G;
        std::tie(current_block_size, current_grid_size, current_splits_4G) =
            GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, *this);

        if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL)
            return false;

        if(current_splits_4G == 0)
            return false;
    }

    bool unit_conv = (x == 1) && (y == 1) && (stride_h == 1) && (stride_w == 1) &&
                     (dilation_h == 1) && (dilation_w == 1) && (pad_h == 0) && (pad_w == 0);

    if((nxe == 0) && !unit_conv)
    {
        return false;
    }

    if(precision != miopenFloat)
    {
        if((c / group) % tensor_b_thread_lengths[3] != 0)
        {
            return false;
        }
        if((k / group) % tensor_a_thread_lengths[3] != 0)
        {
            return false;
        }
    }

    // add more restriction for spare
    if(use_spare_set)
    {
        // non 1x1 kernel(except padding gemm_k) can't run 1x1 case
        if(unit_conv && nxe != 0)
            return false;
    }

    return true;
}

/// Produces the untuned configuration for \p problem: a fresh performance
/// config initialized through HeuristicInit(). The result is logged at info
/// level before being returned.
PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetDefaultPerformanceConfig(
    const ExecutionContext& ctx, const ProblemDescription& problem) const
{
    PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC heuristic_config;
    heuristic_config.HeuristicInit(ctx, problem);
    MIOPEN_LOG_I(heuristic_config.ToString());
    return heuristic_config;
}
/// A config is accepted when it is a value the search may produce
/// (IsValidValue) and it is usable for \p problem (IsValid). The problem check
/// is skipped when the value check already fails.
bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsValidPerformanceConfig(
    const ExecutionContext&,
    const ProblemDescription& problem,
    const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config) const
{
    if(!config.IsValidValue())
        return false;
    return config.IsValid(problem);
}

/// Tunes this solver for \p problem by delegating to GenericSearch(), passing
/// the solver itself together with the execution context, the problem and the
/// invoke parameters.
///
/// \return the configuration returned by GenericSearch().
PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC
ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::Search(const ExecutionContext& ctx,
                                                   const ProblemDescription& problem,
                                                   const AnyInvokeParams& invoke_ctx) const
{
    return GenericSearch(*this, ctx, problem, invoke_ctx);
}

bool ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::IsApplicable(
    const ExecutionContext& ctx, const ProblemDescription& problem) const
{
    if(env::disabled(MIOPEN_DEBUG_CONV_IMPLICIT_GEMM_ASM_WRW_GTC_XDLOPS_NHWC))
        return false;

    if(problem.GetConv().attribute.deterministic)
        return false;

#if WORKAROUND_ISSUE_1979
    if(problem.GetGroupCount() > 1)
        return false;
#endif

#if WORKAROUND_ISSUE_2867
    {
        const int hi        = problem.GetOutHeight();
        const int wi        = problem.GetOutWidth();
        const int k         = problem.GetInChannels();
        const int c         = problem.GetOutChannels();
        const int y         = problem.GetWeightsHeight();
        const int x         = problem.GetWeightsWidth();
        const auto stride_h = problem.GetKernelStrideH();
        const auto stride_w = problem.GetKernelStrideW();
        const auto pad_h    = problem.GetPadH();
        const auto pad_w    = problem.GetPadW();

        if((c == 1 && k == 1 && hi == 1 && wi == 1 && y == 3 && x == 3 && pad_h == 2 &&
            pad_w == 2 && stride_h == 2 && stride_w == 2) ||
           (c == 1 && k == 1 && hi == 1 && wi == 5 && y == 3 && x == 6 && pad_h == 2 &&
            pad_w == 1 && stride_h == 2 && stride_w == 1) ||
           (c == 1 && k == 1 && hi == 1 && wi == 1 && y == 2 && x == 3 && pad_h == 1 &&
            pad_w == 2 && stride_h == 1 && stride_w == 2))
            return false;
    }
#endif

    const auto device_name = ctx.GetStream().GetDeviceName();
    if((device_name != "gfx908") && (device_name != "gfx90a") && (device_name != "gfx942") &&
       (!StartsWith(device_name, "gfx95")))
        return false;

    if(!ctx.use_asm_kernels)
        return false;

    if(!problem.IsDirectionBackwardWrW())
        return false;

    if(!problem.Is2d())
        return false;

    if(problem.HasNonPackedTensors())
        return false;

    if(!problem.AllTensorsDimsFitIntoInt())
        return false;

    if(!problem.IsFp32() && !problem.IsFp16() &&
       !(problem.IsBfp16() &&
         (device_name == "gfx90a" || device_name == "gfx942" || StartsWith(device_name, "gfx95"))))
        return false;

    if(problem.IsTensorsCasted())
        return false;

    if(!ctx.rmv.IsV3())
        return false;

    const auto& target = ctx.GetStream().GetTargetProperties();
    if(target.Xnack() && *target.Xnack())
        return false; // NOLINT (readability-simplify-boolean-expr)

    if(0 == igemm_split_batch_size(problem.GetOutHeight(),
                                   problem.GetOutWidth(),
                                   problem.GetInHeight(),
                                   problem.GetInWidth(),
                                   problem.GetBatchSize(),
                                   problem.GetInChannels(),
                                   problem.GetOutChannels(),
                                   miopen::GetTypeSize(problem.GetInDataType())))
        return false;

    {
        auto largest_config = problem.IsFp32()
                                  ? GetWrwXdlopsNHWCConfigLargestTileFp32()
                                  : (problem.IsFp16() ? GetWrwXdlopsNHWCConfigLargestTileFp16()
                                                      : GetWrwXdlopsNHWCConfigLargestTileBf16());
        size_t current_block_size, current_grid_size, current_splits_4G;
        std::tie(current_block_size, current_grid_size, current_splits_4G) =
            GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, largest_config);

        if(current_block_size * current_grid_size * current_splits_4G > 0xffffffffULL)
            return false;
    }

    return true;
}

static std::vector<OpKernelArg>
ComputeDynamicIGemmWrwKernelArgsNHWC(const ProblemDescription& problem,
                                     const int gemm_k_global_splits,
                                     const int gemm_k_per_wg,
                                     const int splits_4G)
{
    int hi         = problem.GetOutHeight();
    int wi         = problem.GetOutWidth();
    int n          = problem.GetInBatchSize();
    int k          = problem.GetInChannels();
    int c          = problem.GetOutChannels();
    int ho         = problem.GetInHeight();
    int wo         = problem.GetInWidth();
    int stride_h   = problem.GetOutHeight() > 1 ? problem.GetKernelStrideH() : 1;
    int stride_w   = problem.GetOutWidth() > 1 ? problem.GetKernelStrideW() : 1;
    int dilation_h = problem.GetWeightsHeight() > 1 ? problem.GetDilationH() : 1;
    int dilation_w = problem.GetWeightsWidth() > 1 ? problem.GetDilationW() : 1;
    int pad_h      = problem.GetPadH();
    int pad_w      = problem.GetPadW();
    int y          = problem.GetWeightsHeight();
    int x          = problem.GetWeightsWidth();
    int group      = problem.GetGroupCount();

    std::vector<OpKernelArg> opArgs;
    opArgs.emplace_back(0); // placeholder
    opArgs.emplace_back(0); // placeholder
    opArgs.emplace_back(0); // placeholder
    opArgs.emplace_back(hi);
    opArgs.emplace_back(wi);
    opArgs.emplace_back(n / splits_4G);
    opArgs.emplace_back(k / group);
    opArgs.emplace_back(c / group);
    opArgs.emplace_back(ho);
    opArgs.emplace_back(wo);
    opArgs.emplace_back(stride_h);
    opArgs.emplace_back(stride_w);
    opArgs.emplace_back(dilation_h);
    opArgs.emplace_back(dilation_w);
    opArgs.emplace_back(pad_h);
    opArgs.emplace_back(pad_w);
    opArgs.emplace_back(y);
    opArgs.emplace_back(x);
    opArgs.emplace_back(gemm_k_global_splits);
    opArgs.emplace_back(group);
    opArgs.emplace_back(gemm_k_per_wg);

    return opArgs;
}

size_t ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetWorkspaceSize(
    const ExecutionContext& ctx, const ProblemDescription& problem) const
{
    const int hi       = problem.GetOutHeight();
    const int wi       = problem.GetOutWidth();
    const int n        = problem.GetBatchSize();
    const int k        = problem.GetInChannels();
    const int c        = problem.GetOutChannels();
    const int ho       = problem.GetInHeight();
    const int wo       = problem.GetInWidth();
    const int y        = problem.GetWeightsHeight();
    const int x        = problem.GetWeightsWidth();
    const auto group   = problem.GetGroupCount();
    const auto is_nchw = problem.IsLayoutDefault();

    size_t size_trans_input  = 0;
    size_t size_trans_weight = 0;
    size_t size_trans_output = 0;
    size_t size_tensor_cast  = 0;

    size_t workspace_size = 0;
    if(is_nchw)
    {
        TransposeSolutionDefault2Nhwc trans_input(ctx, problem.GetOutDataType(), n, c, hi, wi);
        TransposeSolutionNhwc2Default trans_weight(ctx,
                                                   problem.GetWeightsDataType(),
                                                   k,
                                                   c / group,
                                                   y,
                                                   x); // group * k_per_group as batch for weight
        TransposeSolutionDefault2Nhwc trans_output(ctx, problem.GetInDataType(), n, k, ho, wo);
        if(!trans_input.IsSkippable())
            size_trans_input = trans_input.GetOutputTensorSize();
        if(!trans_weight.IsSkippable())
            size_trans_weight = trans_weight.GetOutputTensorSize();
        if(!trans_output.IsSkippable())
            size_trans_output = trans_output.GetOutputTensorSize();
    }

    if(!problem.IsFp32())
    {
        size_tensor_cast =
            miopen::GetTypeSize(miopenFloat) // The intermediate output of the 1st
                                             // kernel is FP32, when using FP32 atomic
            * (k / group) * c * y * x;
    }

    MultiBufferWorkspaceTraits wt(
        {size_trans_input, size_trans_weight, size_trans_output, size_tensor_cast});
    workspace_size = wt.GetSize();

    return workspace_size;
}

// Builds the ConvSolution (kernels to compile + invoker) for the given config.
//
// Kernels placed into result.construction_params, in order:
//   [0]  the asm WrW kernel named by config.ToKernelName(ctx);
//   [1]  only on gfx90a with fp16: the same kernel built with
//        igemm_wrw_fp16_alt_impl=1 (selected at run time via
//        WrWInvokeParams::gfx90aFp16alt);
//   then, only for the default layout, the non-skippable transpose kernels in
//   the order input, weight, output.
//
// Two invokers exist:
//   - need_cast: the main kernel writes into an FP32 buffer carved out of the
//     workspace (zero-filled first); the result is converted into dw (or into
//     the weight staging buffer) with CastTensor().
//   - otherwise: dw (or the weight staging buffer) is zero-filled and the main
//     kernel writes into it directly.
// In both cases the default layout adds input/output transposes before the main
// kernel and a weight transpose after it. When profiling is enabled the times of
// all launched kernels are summed and stored back into the handle.
//
// The need_cast invoker throws if the workspace is null or smaller than
// GetWorkspaceSize().
ConvSolution ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC::GetSolution(
    const ExecutionContext& ctx,
    const ProblemDescription& problem,
    const PerformanceConfigAsmImplicitGemmGTCWrwXdlopsNHWC& config) const
{
    ConvSolution result;
    KernelInfo kernel;

    size_t block_size;
    size_t grid_size;

    std::tie(block_size, grid_size, std::ignore) =
        GetImplicitGemmGtcDynamicWrwXdlopsNHWCKernel(problem, config);

    std::string kernel_name = config.ToKernelName(ctx);

    const int hi     = problem.GetOutHeight();
    const int wi     = problem.GetOutWidth();
    const int n      = problem.GetBatchSize();
    const int k      = problem.GetInChannels();
    const int c      = problem.GetOutChannels();
    const int ho     = problem.GetInHeight();
    const int wo     = problem.GetInWidth();
    const int y      = problem.GetWeightsHeight();
    const int x      = problem.GetWeightsWidth();
    const auto group = problem.GetGroupCount();

    auto splits_4G = igemm_split_batch_size(
        hi, wi, ho, wo, n, k, c, miopen::GetTypeSize(problem.GetInDataType()));

    size_t gemm_k_global_splits =
        config.gemm_k_global_split >= 1
            ? ComputeGemmKGlobalSplitsWith2DMerge(
                  grid_size, config.gemm_k_global_split, ctx.GetStream().GetMaxComputeUnits())
            : 1;
    size_t min_n_per_block = config.nxe == 1 ? config.tensor_a_thread_lengths[1] : 1;
    size_t nb_per_block =
        config.nxe == 1 ? config.tensor_a_cluster_lengths[1] : config.gemm_k_per_block;

    if(gemm_k_global_splits == 0)
        gemm_k_global_splits = 1;

    // compute workload for 1 workgroup and update gemmk splits (remove the ones compute 0 data)
    size_t gemmk = integer_divide_ceil(problem.GetBatchSize() / splits_4G, min_n_per_block) *
                   problem.GetInHeight() * problem.GetInWidth();
    size_t gemmk_per_wg = integer_divide_ceil(gemmk, gemm_k_global_splits);

    // Round the per-workgroup share up to a multiple of nb_per_block, then
    // recompute how many splits are actually needed to cover gemmk.
    gemmk_per_wg         = (gemmk_per_wg + nb_per_block - 1) / nb_per_block * nb_per_block;
    gemm_k_global_splits = integer_divide_ceil(gemmk, gemmk_per_wg);

    const auto required_workspace_size = GetWorkspaceSize(ctx, problem);
    result.workspace_sz                = required_workspace_size;

    kernel.kernel_file = kernel_name + ".s";
    kernel.kernel_name = kernel_name;
    kernel.g_wk.clear();
    kernel.g_wk.push_back(grid_size * block_size);
    kernel.g_wk.push_back(splits_4G);
    kernel.g_wk.push_back(gemm_k_global_splits);
    kernel.l_wk.clear();
    kernel.l_wk.push_back(block_size);
    kernel.l_wk.push_back(1);
    kernel.l_wk.push_back(1);

    const auto isFp16                 = problem.IsFp16();
    const auto isGfx90aFp16altSupport = (ctx.GetStream().GetDeviceName() == "gfx90a") && isFp16;
    // The FP32 intermediate buffer (and the cast afterwards) is used for bf16
    // whenever gemm-k global split is on, and for fp16 when it is on and the
    // config has tensor_b_thread_lengths[3] == 1 or vector_store == 1.
    const bool need_cast              = (problem.IsBfp16() && config.gemm_k_global_split >= 1) ||
                           (isFp16 && config.gemm_k_global_split >= 1 &&
                            (config.tensor_b_thread_lengths[3] == 1 || config.vector_store == 1));

    const auto is_nchw = problem.IsLayoutDefault();

    result.construction_params.push_back(kernel); // Intentionally without options.
    std::ostringstream options;                   // Common options for both kernels.
    std::ostringstream msg;
    GenerateClangDefsym(options, "ROCM_METADATA_VERSION", ctx.rmv.UseV3() ? 5 : 4);
    if(ctx.GetStream().GetDeviceName() == "gfx942")
    {
        GenerateClangDefsym(options, "force_sc0_sc1", 0);
        GenerateClangDefsym(options, "atomic_add_using_cas", 0);
        if(miopen::IsLogging(LoggingLevel::Info2))
            msg << ", force_sc0_sc1:0, atomic_add_using_cas:0 (gfx942)";
    }

    std::ostringstream opts_0(options.str(), std::ios_base::ate); // Options for normal kernel.
    if(isGfx90aFp16altSupport)
        GenerateClangDefsym(opts_0, "igemm_wrw_fp16_alt_impl", 0);
    result.construction_params[0].comp_options = opts_0.str();

    if(isGfx90aFp16altSupport)
    {
        result.construction_params.push_back(kernel);
        std::ostringstream opts_1(options.str(), std::ios_base::ate); // Options for alt kernel.
        GenerateClangDefsym(opts_1, "igemm_wrw_fp16_alt_impl", 1);
        result.construction_params[1].comp_options = opts_1.str();
        if(miopen::IsLogging(LoggingLevel::Info2))
            msg << ", fp16_alt:" << problem.GetConv().attribute.gfx90aFp16alt.GetWrW();
    }

    const auto lowp_quant = problem.GetConv().lowp_quant;

    auto opArgs = ComputeDynamicIGemmWrwKernelArgsNHWC(
        problem, gemm_k_global_splits, gemmk_per_wg, splits_4G);
    std::vector<std::vector<OpKernelArg>> opArgsTrans;
    size_t trans_input_offset = 0;
    size_t trans_input_size   = 0;

    size_t trans_weight_offset = 0;
    size_t trans_weight_size   = 0;

    size_t trans_output_offset = 0;
    size_t trans_output_size   = 0;

    bool trans_input_skippable  = false;
    bool trans_weight_skippable = false;
    bool trans_output_skippable = false;

    // Position of each transpose inside opArgsTrans (and, offset by
    // kID_trans_start, inside the kernel list); -1 when that transpose is absent.
    int trans_input_idx  = -1;
    int trans_weight_idx = -1;
    int trans_output_idx = -1;

    if(is_nchw)
    {
        TransposeSolutionDefault2Nhwc trans_input(ctx, problem.GetOutDataType(), n, c, hi, wi);
        TransposeSolutionNhwc2Default trans_weight(ctx,
                                                   problem.GetWeightsDataType(),
                                                   k,
                                                   c / group,
                                                   y,
                                                   x); // group * k_per_group as batch for weight
        TransposeSolutionDefault2Nhwc trans_output(ctx, problem.GetInDataType(), n, k, ho, wo);

        trans_input_skippable  = trans_input.IsSkippable();
        trans_weight_skippable = trans_weight.IsSkippable();
        trans_output_skippable = trans_output.IsSkippable();

        if(!trans_input_skippable)
        {
            result.construction_params.push_back(trans_input.GetKernelInfo());
            opArgsTrans.emplace_back(trans_input.GetKernelArg());
            if(miopen::IsLogging(LoggingLevel::Info2))
                msg << ", inp trans:" << trans_input.GetKernelName();
        }
        if(!trans_weight_skippable)
        {
            result.construction_params.push_back(trans_weight.GetKernelInfo());
            opArgsTrans.emplace_back(trans_weight.GetKernelArg());
            if(miopen::IsLogging(LoggingLevel::Info2))
                msg << ", wei trans:" << trans_weight.GetKernelName();
        }
        if(!trans_output_skippable)
        {
            result.construction_params.push_back(trans_output.GetKernelInfo());
            opArgsTrans.emplace_back(trans_output.GetKernelArg());
            if(miopen::IsLogging(LoggingLevel::Info2))
                msg << ", out trans:" << trans_output.GetKernelName();
        }

        trans_input_size  = trans_input_skippable ? 0 : trans_input.GetOutputTensorSize();
        trans_weight_size = trans_weight_skippable ? 0 : trans_weight.GetOutputTensorSize();
        trans_output_size = trans_output_skippable ? 0 : trans_output.GetOutputTensorSize();

        // Indices follow the push order above: input, weight, output.
        int idx = 0;
        if(!trans_input_skippable)
            trans_input_idx = idx++;
        if(!trans_weight_skippable)
            trans_weight_idx = idx++;
        if(!trans_output_skippable)
            trans_output_idx = idx++;
    }

    MIOPEN_LOG_I2(SolverDbId() << ": " << config.ToString() << msg.str());

    const size_t cast_size =
        need_cast ? miopen::GetTypeSize(miopenFloat) * k * (c / group) * y * x : 0;

    MultiBufferWorkspaceTraits wt(
        {trans_input_size, trans_weight_size, trans_output_size, cast_size});

    trans_input_offset  = wt.GetOffset(0);
    trans_weight_offset = wt.GetOffset(1);
    trans_output_offset = wt.GetOffset(2);

    const size_t cast_offset = wt.GetOffset(3);

    // Transpose kernels come after the main kernel(s): one main kernel normally,
    // two when the gfx90a fp16 alt build is present.
    const int kID_trans_start = isGfx90aFp16altSupport ? 2 : 1;

    const TensorDescriptor cast_desc(
        miopenFloat, problem.GetWeights().GetLengths(), problem.GetWeights().GetStrides());
    auto null_buf = shared<Data_t>{};

    if(need_cast)
    {
        result.invoker_factory = [=](const std::vector<Kernel>& kernels) mutable {
            return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) mutable {
                decltype(auto) wrw_invoke_params =
                    primitive_parameters.CastTo<miopen::conv::WrWInvokeParams>();
                const auto& tensors = wrw_invoke_params.tensors;
                const auto ker      = handle.Run(
                    kernels[(isGfx90aFp16altSupport && wrw_invoke_params.gfx90aFp16alt) ? 1 : 0]);
                const auto& workSpace     = wrw_invoke_params.workSpace;
                const auto& workSpaceSize = wrw_invoke_params.workSpaceSize;
                float elapsed             = 0;
                float zero                = 0.f;

                if(workSpace == nullptr || workSpaceSize < required_workspace_size)
                {
                    MIOPEN_THROW("Not enough workspace has been provided for "
                                 "ConvAsmImplicitGemmGTCDynamicWrwXdlopsNHWC with fp16 and atomic "
                                 "add.");
                }
                auto trans_input_buf =
                    trans_input_size == 0
                        ? null_buf
                        : handle.CreateSubBuffer(workSpace, trans_input_offset, trans_input_size);
                auto trans_weight_buf =
                    trans_weight_size == 0
                        ? null_buf
                        : handle.CreateSubBuffer(workSpace, trans_weight_offset, trans_weight_size);
                auto trans_output_buf =
                    trans_output_size == 0
                        ? null_buf
                        : handle.CreateSubBuffer(workSpace, trans_output_offset, trans_output_size);
                auto cast_buf = cast_size == 0
                                    ? null_buf
                                    : handle.CreateSubBuffer(workSpace, cast_offset, cast_size);

                // Zero the FP32 buffer the main kernel writes into.
                SetTensor(handle, cast_desc, cast_buf.get(), &zero);
                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                if(is_nchw)
                {
                    if(!trans_input_skippable)
                    {
                        auto& karg_input = opArgsTrans[trans_input_idx];
                        karg_input[0]    = OpKernelArg(trans_input_buf.get());
                        karg_input[1]    = OpKernelArg(tensors.x);
                        handle.Run(kernels[kID_trans_start + trans_input_idx])(karg_input);
                        if(handle.IsProfilingEnabled())
                            elapsed += handle.GetKernelTime();
                    }
                    if(!trans_output_skippable)
                    {
                        auto& karg_output = opArgsTrans[trans_output_idx];
                        karg_output[0]    = OpKernelArg(trans_output_buf.get());
                        karg_output[1]    = OpKernelArg(tensors.dy);
                        handle.Run(kernels[kID_trans_start + trans_output_idx])(karg_output);
                        if(handle.IsProfilingEnabled())
                            elapsed += handle.GetKernelTime();
                    }
                }

                // Fill the three placeholder slots of the main kernel's arguments.
                opArgs[0] = (is_nchw && !trans_input_skippable) ? OpKernelArg(trans_input_buf.get())
                                                                : OpKernelArg(tensors.x);
                opArgs[1] = OpKernelArg(cast_buf.get());
                opArgs[2] = (is_nchw && !trans_output_skippable)
                                ? OpKernelArg(trans_output_buf.get())
                                : OpKernelArg(tensors.dy);

                ker(opArgs);
                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                // Convert the FP32 result into the weight-gradient data type.
                CastTensor(handle,
                           &lowp_quant,
                           false,
                           cast_desc,
                           cast_buf.get(),
                           tensors.dwDesc,
                           (is_nchw && !trans_weight_skippable) ? trans_weight_buf.get()
                                                                : tensors.dw,
                           0,
                           0);
                // Sample the cast time immediately, as is done after every other
                // launch in this invoker. Sampling only after the weight transpose
                // below would add the transpose time twice and drop the cast time.
                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                if(is_nchw && !trans_weight_skippable)
                {
                    auto& karg_weight = opArgsTrans[trans_weight_idx];
                    karg_weight[0]    = OpKernelArg(tensors.dw);
                    karg_weight[1]    = OpKernelArg(trans_weight_buf.get());
                    handle.Run(kernels[kID_trans_start + trans_weight_idx])(karg_weight);
                    if(handle.IsProfilingEnabled())
                        elapsed += handle.GetKernelTime();
                }

                if(handle.IsProfilingEnabled())
                {
                    handle.ResetKernelTime();
                    handle.AccumKernelTime(elapsed);
                }
            };
        };
    }
    else
    {
        result.invoker_factory = [=](const std::vector<Kernel>& kernels) mutable {
            return [=](const Handle& handle, const AnyInvokeParams& primitive_parameters) mutable {
                decltype(auto) wrw_invoke_params =
                    primitive_parameters.CastTo<miopen::conv::WrWInvokeParams>();
                const auto& tensors = wrw_invoke_params.tensors;
                const auto ker      = handle.Run(
                    kernels[(isGfx90aFp16altSupport && wrw_invoke_params.gfx90aFp16alt) ? 1 : 0]);
                const auto& workSpace = wrw_invoke_params.workSpace;
                float elapsed         = 0;
                float zero            = 0.f;

                auto trans_input_buf =
                    trans_input_size == 0
                        ? null_buf
                        : handle.CreateSubBuffer(workSpace, trans_input_offset, trans_input_size);
                auto trans_weight_buf =
                    trans_weight_size == 0
                        ? null_buf
                        : handle.CreateSubBuffer(workSpace, trans_weight_offset, trans_weight_size);
                auto trans_output_buf =
                    trans_output_size == 0
                        ? null_buf
                        : handle.CreateSubBuffer(workSpace, trans_output_offset, trans_output_size);

                // Fill the three placeholder slots of the main kernel's arguments.
                opArgs[0] = (is_nchw && !trans_input_skippable) ? OpKernelArg(trans_input_buf.get())
                                                                : OpKernelArg(tensors.x);
                opArgs[1] = (is_nchw && !trans_weight_skippable)
                                ? OpKernelArg(trans_weight_buf.get())
                                : OpKernelArg(tensors.dw);
                opArgs[2] = (is_nchw && !trans_output_skippable)
                                ? OpKernelArg(trans_output_buf.get())
                                : OpKernelArg(tensors.dy);

                // Zero the buffer the main kernel writes into.
                SetTensor(handle,
                          tensors.dwDesc,
                          (is_nchw && !trans_weight_skippable) ? trans_weight_buf.get()
                                                               : tensors.dw,
                          &zero);
                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                if(is_nchw)
                {
                    if(!trans_input_skippable)
                    {

                        auto& karg_input = opArgsTrans[trans_input_idx];
                        karg_input[0]    = OpKernelArg(trans_input_buf.get());
                        karg_input[1]    = OpKernelArg(tensors.x);
                        handle.Run(kernels[kID_trans_start + trans_input_idx])(karg_input);
                        if(handle.IsProfilingEnabled())
                            elapsed += handle.GetKernelTime();
                    }
                    if(!trans_output_skippable)
                    {

                        auto& karg_output = opArgsTrans[trans_output_idx];
                        karg_output[0]    = OpKernelArg(trans_output_buf.get());
                        karg_output[1]    = OpKernelArg(tensors.dy);
                        handle.Run(kernels[kID_trans_start + trans_output_idx])(karg_output);
                        if(handle.IsProfilingEnabled())
                            elapsed += handle.GetKernelTime();
                    }
                }

                ker(opArgs);
                if(handle.IsProfilingEnabled())
                    elapsed += handle.GetKernelTime();

                if(is_nchw && !trans_weight_skippable)
                {
                    auto& karg_weight = opArgsTrans[trans_weight_idx];
                    karg_weight[0]    = OpKernelArg(tensors.dw);
                    karg_weight[1]    = OpKernelArg(trans_weight_buf.get());
                    handle.Run(kernels[kID_trans_start + trans_weight_idx])(karg_weight);
                    if(handle.IsProfilingEnabled())
                        elapsed += handle.GetKernelTime();
                }

                if(handle.IsProfilingEnabled())
                {
                    handle.ResetKernelTime();
                    handle.AccumKernelTime(elapsed);
                }
            };
        };
    }

    return result;
}

} // namespace conv
} // namespace solver
} // namespace miopen
