diff --git a/tools/aie-chess-simulation/mixed_bfp16_bf16/Makefile b/tools/aie-chess-simulation/mixed_bfp16_bf16/Makefile
new file mode 100644
index 00000000000..532b5b1ab49
--- /dev/null
+++ b/tools/aie-chess-simulation/mixed_bfp16_bf16/Makefile
@@ -0,0 +1,15 @@
+all: build sim
+.PHONY : all build sim clean
+# NOTE(review): make reads `$.` in the build recipe as a reference to a variable named `.`; if that is unset, `-I $../../../..` becomes `-I ./../../..` - confirm the intended include root.
+build: 
+	rm -rf output && mkdir output && xchesscc --aiearch aie2p -p me -C Release_LLVM -D__AIENGINE__ - -I ${AIETOOLS_ROOT}/include -I ${AIETOOLS_ROOT}/include/aie_api -P ${AIETOOLS_ROOT}/data/aie2p/lib -d -f -g +s +w work +o work -I. -I $../../../.. test.cc
+	
+sim: 
+	xca_udm_dbg --aiearch aie2p -qf -T -P ${AIETOOLS_ROOT}/data/aie2p/lib -t "sim.tcl work/a.out"
+# build recreates output/ and compiles test.cc with xchesscc; sim runs work/a.out through sim.tcl; clean removes work/, output/ and files matching *txt, *mem, *.output.
+clean:
+	rm -rf work *txt *mem *.output output 
+
+
+
+
diff --git a/tools/aie-chess-simulation/mixed_bfp16_bf16/aie_kernel_utils.h b/tools/aie-chess-simulation/mixed_bfp16_bf16/aie_kernel_utils.h
new file mode 100644
index 00000000000..7d1a4d2162b
--- /dev/null
+++ b/tools/aie-chess-simulation/mixed_bfp16_bf16/aie_kernel_utils.h
@@ -0,0 +1,79 @@
+/*
+    Copyright (C) 2014 - 2022 Xilinx, Inc. All rights reserved.
+    Copyright (C) 2022 - 2025 Advanced Micro Devices, Inc. All rights reserved.
+    SPDX-License-Identifier: MIT
+*/
+
+#ifndef _AIE_KERNEL_UTILS_
+#define _AIE_KERNEL_UTILS_
+// Loop and scheduling hint macros; one definition set per compiler, selected below.
+#if defined(__chess__)
+#define AIE_LOOP_UNROLL(x) [[chess::unroll_loop(x)]]
+#define AIE_LOOP_UNROLL_FULL [[chess::unroll_loop()]]
+#define AIE_LOOP_NO_UNROLL [[chess::no_unroll]]
+#define AIE_LOOP_MIN_ITERATION_COUNT(x) [[chess::min_loop_count(x)]]
+#define AIE_LOOP_MAX_ITERATION_COUNT(x) [[chess::max_loop_count(x)]]
+#define AIE_LOOP_RANGE(a, ...)                                                 \
+  [[chess::min_loop_count(a)]] __VA_OPT__(                                     \
+      [[chess::max_loop_count(__VA_ARGS__)]])
+#define AIE_PREPARE_FOR_PIPELINING [[chess::prepare_for_pipelining]]
+#define AIE_NO_PREPARE_FOR_PIPELINING [[chess::no_prepare_for_pipelining]]
+#define AIE_MODULO_SCHEDULING_BUDGET_RATIO(x)                                  \
+  [[chess::modulo_scheduling_budget_ratio(x)]]
+#define AIE_KEEP_SW_LOOP [[chess::keep_sw_loop]]
+#define AIE_PEEL_PIPELINED_LOOP(x) [[chess::peel_pipelined_loop(x)]]
+#define AIE_KEEP_FREE_FOR_PIPELINING(x) [[chess::keep_free_for_pipelining(x)]]
+#define AIE_ALLOCATE(x) [[chess::allocate(x)]]
+#define AIE_NO_HW_LOOP [[chess::no_hw_loop]]
+#define AIE_TRY_INITIATION_INTERVAL(x)
+#define AIE_PREPARE_FOR_POSTPIPELINING
+#define AIE_LOOP_FLATTEN chess_flatten_loop
+// With __AIECC__ defined: hints that have a clang loop pragma expand to it via _Pragma; the others expand to nothing.
+#elif defined(__AIECC__)
+#ifndef __STRINGIFY
+#define __STRINGIFY(a) #a
+#endif
+#define AIE_LOOP_UNROLL(x) _Pragma(__STRINGIFY(clang loop unroll_count(x)))
+#define AIE_LOOP_UNROLL_FULL _Pragma("clang loop unroll(full)")
+#define AIE_LOOP_NO_UNROLL _Pragma("clang loop unroll(disable)")
+#define AIE_LOOP_MIN_ITERATION_COUNT(x)                                        \
+  _Pragma(__STRINGIFY(clang loop min_iteration_count(x)))
+#define AIE_LOOP_MAX_ITERATION_COUNT(x)                                        \
+  _Pragma(__STRINGIFY(clang loop max_iteration_count(x)))
+#define AIE_LOOP_RANGE(a, ...)                                                 \
+  AIE_LOOP_MIN_ITERATION_COUNT(a)                                              \
+  __VA_OPT__(AIE_LOOP_MAX_ITERATION_COUNT(__VA_ARGS__))
+#define AIE_PREPARE_FOR_PIPELINING
+#define AIE_NO_PREPARE_FOR_PIPELINING
+#define AIE_MODULO_SCHEDULING_BUDGET_RATIO(x)
+#define AIE_KEEP_SW_LOOP
+#define AIE_PEEL_PIPELINED_LOOP(x)
+#define AIE_KEEP_FREE_FOR_PIPELINING(x)
+#define AIE_ALLOCATE(x)
+#define AIE_NO_HW_LOOP
+#define AIE_TRY_INITIATION_INTERVAL(x)                                         \
+  _Pragma(__STRINGIFY(clang loop pipeline_initiation_interval(x)))
+#define AIE_PREPARE_FOR_POSTPIPELINING _Pragma("clang loop pipeline(disable)")
+#define AIE_LOOP_FLATTEN 
+// Any other compiler: every hint expands to nothing, so annotated code still compiles.
+#else
+#define AIE_LOOP_UNROLL(x)
+#define AIE_LOOP_UNROLL_FULL
+#define AIE_LOOP_NO_UNROLL
+#define AIE_LOOP_MIN_ITERATION_COUNT(x)
+#define AIE_LOOP_MAX_ITERATION_COUNT(x)
+#define AIE_LOOP_RANGE(a, ...)
+#define AIE_PREPARE_FOR_PIPELINING
+#define AIE_NO_PREPARE_FOR_PIPELINING
+#define AIE_MODULO_SCHEDULING_BUDGET_RATIO(x)
+#define AIE_KEEP_SW_LOOP
+#define AIE_PEEL_PIPELINED_LOOP(x)
+#define AIE_KEEP_FREE_FOR_PIPELINING(x)
+#define AIE_ALLOCATE(x)
+#define AIE_NO_HW_LOOP
+#define AIE_TRY_INITIATION_INTERVAL(x)
+#define AIE_PREPARE_FOR_POSTPIPELINING
+#define AIE_LOOP_FLATTEN 
+#endif
+
+#endif
\ No newline at end of file
diff --git a/tools/aie-chess-simulation/mixed_bfp16_bf16/helper.h b/tools/aie-chess-simulation/mixed_bfp16_bf16/helper.h
new file mode 100644
index 00000000000..ba137dfad79
--- /dev/null
+++ b/tools/aie-chess-simulation/mixed_bfp16_bf16/helper.h
@@ -0,0 +1,175 @@
+//===- helper.h -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2025, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+#include "io_helpers.h"
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <vector>
+#include <cstdio>
+#include <cmath>
+#include <cstdlib>
+#include <utility>
+#include <string_view>
+#include "aie_api/aie.hpp"
+
+// Quantization of an array of floats to bfp16.
+// block - number of values sharing one exponent (block size)
+// size  - length of the input array; block <= 0 or size <= 0 returns {}
+// array - the array (only read)
+// rounding - accepted for compatibility but ignored: mantissas are truncated
+// The returned array holds, for every block in turn:
+// 1. One byte with the shared exponent (max biased exponent of the block).
+// 2. One two's complement mantissa byte per value of the block.
+// A trailing partial block is emitted with as many mantissa bytes as it has
+// values, so the result is size + ceil(size / block) bytes long.
+inline std::vector<uint8_t> floatToBfp16(int block, int size, float *array, int rounding = 0) {
+  (void)rounding;
+  if (block <= 0 || size <= 0)
+    return {};
+
+  const int mbits = 7;
+  // One shared-exponent byte per block plus one mantissa byte per value.
+  const size_t numBlocks = (static_cast<size_t>(size) + block - 1) / block;
+  std::vector<uint8_t> res(static_cast<size_t>(size) + numBlocks);
+
+  // Raw IEEE-754 bits of array[idx]; memcpy avoids aliasing a float as unsigned.
+  auto bitsOf = [array](int idx) {
+    uint32_t bits;
+    memcpy(&bits, array + idx, sizeof(bits));
+    return bits;
+  };
+
+  size_t currentIndex = 0;
+  for (int start = 0; start < size;) {
+    // decide on the block (starting and ending point)
+    const int end = size - start < block ? size : start + block;
+
+    // Find max exp
+    unsigned int maxExp = 0;
+    for (int i = start; i < end; i++) {
+      const unsigned int exp = (bitsOf(i) >> 23) & 0x000000FF;
+      maxExp = maxExp < exp ? exp : maxExp;
+    }
+
+    // The shared exponent leads its block, whatever the block's length.
+    res[currentIndex++] = static_cast<uint8_t>(maxExp);
+
+    // Round each number
+    for (int i = start; i < end; i++) {
+      const uint32_t bits = bitsOf(i);
+      const unsigned int sign = bits & 0x80000000;        // Sign
+      const unsigned int exp = (bits >> 23) & 0x000000FF; // 8 bit exponent
+      unsigned int mantissa = bits & 0x007FFFFF;          // 23-bit mantissa
+      if (exp)
+        mantissa |= 0x00800000; // add the implicit for normal value
+
+      if (exp >= 255) {
+        // Infinity or NaN: no mantissa is written (the slot keeps its zero
+        // fill), but the slot is consumed so later values stay aligned.
+        currentIndex++;
+        continue;
+      }
+
+      // Truncation (AIE2p mantissa rounding) of the two's complement value
+      mantissa = sign ? ~mantissa + 1 : mantissa;
+      // Erase 23 - mbits + 1 bits (+1 makes the implicit bit explicit)
+      uint8_t valueInt8 = mantissa >> (23 - mbits + 1);
+
+      // Note that shifting by 32 bits or more is undefined behavior in C++
+      if (maxExp - exp >= 32) {
+        valueInt8 = sign ? 0xff : 0x00;
+      } else {
+        // Arithmetic right shift; again truncation
+        valueInt8 = static_cast<int8_t>(valueInt8) >> (maxExp - exp);
+      }
+      res[currentIndex++] = valueInt8;
+    }
+    start = end;
+  }
+  return res;
+}
+
+
+// Writes a rows x cols row-major float matrix to `filename` as text: a "(rows, cols)" line, then one space-separated line per row. inline because this header defines it.
+inline void print_matrix_float(const char* filename, float* data, int rows, int cols) {
+  FILE* fp = open_file(filename, "w+");
+  fprintf(fp, "(%d, %d)\n", rows, cols);
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      fprintf(fp, "%f", data[i * cols + j]);
+      if (j < cols - 1) fprintf(fp, " ");
+    }
+    fprintf(fp, "\n");
+  }
+  fclose(fp);
+}
+
+// Writes a rows x cols row-major bfloat16 matrix to `filename` as text, each value converted to float: a "(rows, cols)" line, then one line per row. inline because this header defines it.
+inline void print_matrix_bfloat16(const char* filename, bfloat16* data, int rows, int cols) {
+  FILE* fp = open_file(filename, "w+");
+  fprintf(fp, "(%d, %d)\n", rows, cols);
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      fprintf(fp, "%f", static_cast<float>(data[i * cols + j]));
+      if (j < cols - 1) fprintf(fp, " ");
+    }
+    fprintf(fp, "\n");
+  }
+  fclose(fp);
+}
+
+// Golden result: naive float matrix product C[M x N] = A[M x K] * B[K x N], all
+// row-major. Prints DEBUG lines for row 0; inline because this header defines it.
+inline void calc_golden_result(const float* A, const float* B, float* C, int M, int K, int N) {
+  for (int i = 0; i < M; ++i) {
+    for (int j = 0; j < N; ++j) {
+      float sum = 0.0f;
+      for (int k = 0; k < K; ++k) {
+        const float a_val = A[i * K + k];
+        const float b_val = B[k * N + j];
+        if (i == 0 && j == 0 && k < 8) {
+          printf("DEBUG: A[0][%d]=%f, B[%d][0]=%f\n", k, a_val, k, b_val);
+        }
+        sum += a_val * b_val;
+      }
+      if (i == 0 && j < 8) {
+        printf("DEBUG: gold[%d] sum = %f\n", j, sum);
+      }
+      C[i * N + j] = sum;
+    }
+  }
+}
+
+// Layout transpose: copies one 8x8 row-major block into column-major order,
+// so that output[col * 8 + row] = input[row * 8 + col].
+// Input: 8x8 float array, row-major (64 values are read)
+// Output: 8x8 array in column-major layout (64 values are written)
+// rows/cols are accepted for interface compatibility but not used.
+// inline because the function is defined in a header.
+inline void layout_transpose_8x8block(float* input, float* output, int rows, int cols) {
+  (void)rows;
+  (void)cols;
+  constexpr int kBlockDim = 8;
+
+  // Walk the destination linearly; each column of the source becomes a row.
+  for (int col = 0; col < kBlockDim; col++) {
+    for (int row = 0; row < kBlockDim; row++) {
+      output[col * kBlockDim + row] = input[row * kBlockDim + col];
+    }
+  }
+}
+
+
diff --git a/tools/aie-chess-simulation/mixed_bfp16_bf16/io_helpers.h b/tools/aie-chess-simulation/mixed_bfp16_bf16/io_helpers.h
new file mode 100644
index 00000000000..7e0a4a02e38
--- /dev/null
+++ b/tools/aie-chess-simulation/mixed_bfp16_bf16/io_helpers.h
@@ -0,0 +1,367 @@
+/*  (c) Copyright 2014 - 2019 Xilinx, Inc. All rights reserved.
+
+    This file contains confidential and proprietary information
+    of Xilinx, Inc. and is protected under U.S. and
+    international copyright and other intellectual property
+    laws.
+
+    DISCLAIMER
+    This disclaimer is not a license and does not grant any
+    rights to the materials distributed herewith. Except as
+    otherwise provided in a valid license issued to you by
+    Xilinx, and to the maximum extent permitted by applicable
+    law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND
+    WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES
+    AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING
+    BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-
+    INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and
+    (2) Xilinx shall not be liable (whether in contract or tort,
+    including negligence, or under any other theory of
+    liability) for any loss or damage of any kind or nature
+    related to, arising under or in connection with these
+    materials, including for any direct, or any indirect,
+    special, incidental, or consequential loss or damage
+    (including loss of data, profits, goodwill, or any type of
+    loss or damage suffered as a result of any action brought
+    by a third party) even if such damage or loss was
+    reasonably foreseeable or Xilinx had been advised of the
+    possibility of the same.
+
+    CRITICAL APPLICATIONS
+    Xilinx products are not designed or intended to be fail-
+    safe, or for use in any application requiring fail-safe
+    performance, such as life-support or safety devices or
+    systems, Class III medical devices, nuclear facilities,
+    applications related to the deployment of airbags, or any
+    other applications that could lead to death, personal
+    injury, or severe property or environmental damage
+    (individually and collectively, "Critical
+    Applications"). Customer assumes the sole risk and
+    liability of any use of Xilinx products in Critical
+    Applications, subject only to applicable laws and
+    regulations governing limitations on product liability.
+
+    THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS
+    PART OF THIS FILE AT ALL TIMES.                       */
+
+#pragma once
+
+#ifndef __AIE_API_TESTS_IO_HELPERS_HPP__
+#define __AIE_API_TESTS_IO_HELPERS_HPP__
+
+#include <cstdlib>
+#include <cstdio>
+#include <cassert>
+
+#include "aie_api/aie.hpp"
+
+[[maybe_unused]] static FILE *open_file(const char* filename, const char *mode)
+{
+    // Returns the stream from fopen(filename, mode); reports and exits with status 1 on failure.
+    FILE *const handle = fopen(filename, mode);
+
+    if (!handle) {
+        fprintf(stderr, "ERROR: Cannot open file '%s'.\n", filename);
+        exit(1);
+    }
+    return handle;
+}
+
+// Writes `num` int8 values from `output` to `filename`, one "%d" line per value.
+[[maybe_unused]] static void write_file(const int8 *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+
+    for (unsigned i = 0; i < num; i++) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp, "%d\n", output[i]);
+    fclose(fp);
+}
+
+// Writes `num` uint8 values from `output` to `filename`, one "%u" line per value.
+[[maybe_unused]] static void write_file(const uint8 *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+
+    for (unsigned i = 0; i < num; i++) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp, "%u\n", (unsigned)output[i]);
+    fclose(fp);
+}
+
+// Writes `num` int16 values from `output` to `filename`, one "%d" line per value.
+[[maybe_unused]] static void write_file(const int16 *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+
+    for (unsigned i = 0; i < num; i++) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp, "%d\n", output[i]);
+    fclose(fp);
+}
+
+// Writes `num` uint16 values from `output` to `filename`, one "%u" line per value.
+[[maybe_unused]] static void write_file(const uint16 *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+
+    for (unsigned i = 0; i < num; i++) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp, "%u\n", (unsigned)output[i]); // cast as in the uint8 overload so "%u" gets an unsigned
+    fclose(fp);
+}
+
+// Writes `num` int32 values from `output` to `filename`, one "%d" line per value.
+[[maybe_unused]] static void write_file(const int32 *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+
+    for (unsigned i = 0; i < num; i++) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp, "%d\n", output[i]);
+    fclose(fp);
+}
+
+// Writes `num` uint32 values from `output` to `filename`, one "%u" line per value.
+[[maybe_unused]] static void write_file(const uint32 *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+
+    for (unsigned i = 0; i < num; i++) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp, "%u\n", output[i]);
+    fclose(fp);
+}
+
+// Writes `num` floats from `output` to `filename`, one "%f" line per value.
+[[maybe_unused]] static void write_file(float *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+
+    for (unsigned i = 0; i < num; i++) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp, "%f\n", output[i]);
+    fclose(fp);
+}
+
+#if __AIE_ARCH__ >= 20
+// Writes `num` bfloat16 values to `filename`, each converted to float and printed as a "%f" line.
+[[maybe_unused]] static void write_file(bfloat16 *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+
+    for (unsigned i = 0; i < num; i++) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp, "%f\n", (float)(output[i]));
+    fclose(fp);
+}
+#endif
+
+// Writes `num` floats to `filename`: as num/2 "re im" pair lines when `cmplx` is set, otherwise one "%f" line each.
+[[maybe_unused]] static void write_file(const float *output, unsigned num, bool cmplx, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+    if (cmplx) {
+        for (unsigned i = 0; i < num/2; i++) // unsigned like `num`
+            fprintf(fp, "%9.6g %9.6g\n", output[2*i], output[2*i+1]);
+    }
+    else {
+        for (unsigned i = 0; i < num; i++)
+            fprintf(fp, "%f\n", output[i]);
+    }
+    fclose(fp);
+}
+
+#if __AIE_ARCH__ == 10 || __AIE_API_COMPLEX_FP32_EMULATION__
+// Writes `num` cfloat values to `filename`, viewing each as two consecutive floats printed on one line.
+[[maybe_unused]] static void write_file(const cfloat *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename,"w+");
+    const float *tmp = (const float*)output; 
+
+    for (unsigned i = 0; i < num; i++) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp, "%9.6g %9.6g\n", tmp[2*i], tmp[2*i+1]);
+    fclose(fp);
+}
+#endif
+
+// Reads `num` whitespace-separated integers from `filename` into `dest`.
+// On a missing or malformed value it reports the index and exits with status 1.
+[[maybe_unused]] static void read_file(int8 *dest, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "r");
+    for (unsigned i = 0; i < num; ++i) {
+        int re = 0;
+        if (fscanf(fp, "%d", &re) != 1) { // checked even when assert is compiled out
+            fprintf(stderr, "failed: %u\n", i);
+            exit(1);
+        }
+        *dest++ = re;
+    }
+    fclose(fp);
+}
+
+// Reads `num` whitespace-separated unsigned integers from `filename` into `dest`.
+// On a missing or malformed value it reports the index and exits with status 1.
+[[maybe_unused]] static void read_file(uint8 *dest, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "r");
+    for (unsigned i = 0; i < num; ++i) {
+        unsigned re = 0;
+        if (fscanf(fp, "%u", &re) != 1) { // checked even when assert is compiled out
+            fprintf(stderr, "failed: %u\n", i);
+            exit(1);
+        }
+        *dest++ = re;
+    }
+    fclose(fp);
+}
+
+// Reads `num` integers from `filename` into `dest`; on a missing or malformed value reports the index and exits with status 1.
+[[maybe_unused]] static void read_file(int16 *dest, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "r");
+    for (unsigned i = 0; i < num; ++i) {
+        int re = 0;
+        if (fscanf(fp, "%d", &re) != 1) { // checked even when assert is compiled out
+            fprintf(stderr, "failed: %u\n", i);
+            exit(1);
+        }
+        *dest++ = re;
+    }
+    fclose(fp);
+}
+
+// Reads `num` unsigned integers from `filename` into `dest`; on a missing or malformed value reports the index and exits with status 1.
+[[maybe_unused]] static void read_file(uint16 *dest, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "r");
+    for (unsigned i = 0; i < num; ++i) {
+        unsigned re = 0;
+        if (fscanf(fp, "%u", &re) != 1) { // checked even when assert is compiled out
+            fprintf(stderr, "failed: %u\n", i);
+            exit(1);
+        }
+        *dest++ = re;
+    }
+    fclose(fp);
+}
+
+// Reads `num` integers from `filename` into `dest`; on a missing or malformed value reports the index and exits with status 1.
+[[maybe_unused]] static void read_file(int32 *dest, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "r");
+    for (unsigned i = 0; i < num; ++i) {
+        int re = 0;
+        if (fscanf(fp, "%d", &re) != 1) { // checked even when assert is compiled out
+            fprintf(stderr, "failed: %u\n", i);
+            exit(1);
+        }
+        *dest++ = re;
+    }
+    fclose(fp);
+}
+
+// Reads `num` unsigned integers from `filename` into `dest`; on a missing or malformed value reports the index and exits with status 1.
+[[maybe_unused]] static void read_file(uint32 *dest, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "r");
+    for (unsigned i = 0; i < num; ++i) {
+        unsigned re = 0;
+        if (fscanf(fp, "%u", &re) != 1) { // checked even when assert is compiled out
+            fprintf(stderr, "failed: %u\n", i);
+            exit(1);
+        }
+        *dest++ = re;
+    }
+    fclose(fp);
+}
+
+typedef int (*stream_32_in_t)();
+typedef void (*stream_32_out_t)(int);
+
+// Reads `num` 32-bit words from `stream_in`; stores the low 16 bits of each and, when `cplx` is set, the high 16 bits right after (2*num values).
+[[maybe_unused]] static void read_stream(int16 *dest, unsigned num, bool cplx, stream_32_in_t stream_in)
+{
+    int32 tmp;
+    if (cplx) {
+        for (unsigned i=0; i<num; i++) { // unsigned like `num`
+            tmp=(*stream_in)();
+            *dest++=(short)(tmp&0xffff);
+            *dest++=(short)((tmp>>16)&0xffff);
+        }
+    }
+    else {
+        for (unsigned i=0; i<num; i++) {
+            tmp=(*stream_in)();
+            *dest++=(short)(tmp&0xffff);
+        }
+    }
+}
+
+// Writes `num` cint16 values to `filename`, one tab-separated "real imag" line per value.
+[[maybe_unused]] static void write_file(const cint16 *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "w+");
+
+    for (unsigned i = 0; i < num; ++i) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp,"%d\t%d\n",(short)output[i].real,(short)output[i].imag);
+    fclose(fp);
+}
+
+// Writes `num` cint32 values to `filename`, one tab-separated "real imag" line per value.
+[[maybe_unused]] static void write_file(const cint32 *output, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "w+");
+
+    for (unsigned i = 0; i < num; ++i) // unsigned like `num`: no signed/unsigned compare
+        fprintf(fp,"%d\t%d\n",output[i].real,output[i].imag);
+    fclose(fp);
+}
+
+[[maybe_unused]] static void read_file(cint16 *dest, unsigned num, const char* filename)
+{   // Delegates to the int16 overload, reading two values per complex element.
+    read_file(reinterpret_cast<int16 *>(dest), num * 2, filename);
+}
+
+[[maybe_unused]] static void stream_output_data(cint16 *output, int size, stream_32_out_t stream_out)
+{   // Passes each of the `size` cint16 elements, reinterpreted as one int, to `stream_out`.
+    for (int idx = 0; idx < size; ++idx)
+        (*stream_out)(*(int*)(output + idx));
+}
+
+[[maybe_unused]] static void read_file(cint32 *dest, unsigned num, const char* filename)
+{   // Delegates to the int32 overload, reading two values per complex element.
+    read_file(reinterpret_cast<int32 *>(dest), num * 2, filename);
+}
+
+// Reads `num` floats from `filename` into `dest`; on a missing or malformed value reports the index and exits with status 1.
+[[maybe_unused]] static void read_file(float *dest, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "r");
+    for (unsigned i = 0; i < num; ++i) {
+        float re = 0.0f;
+        if (fscanf(fp, "%f", &re) != 1) { // checked even when assert is compiled out
+            fprintf(stderr, "failed: %u\n", i);
+            exit(1);
+        }
+        *dest++ = re;
+    }
+    fclose(fp);
+}
+
+#if __AIE_ARCH__ >= 20
+// Reads `num` floats from `filename`, storing each as bfloat16; on a missing or malformed value reports the index and exits with status 1.
+[[maybe_unused]] static void read_file(bfloat16 *dest, unsigned num, const char* filename)
+{
+    FILE *fp = open_file(filename, "r");
+    for (unsigned i = 0; i < num; ++i) {
+        float re = 0.0f;
+        if (fscanf(fp, "%f", &re) != 1) { // checked even when assert is compiled out
+            fprintf(stderr, "failed: %u\n", i);
+            exit(1);
+        }
+        *dest++ = (bfloat16)re;
+    }
+    fclose(fp);
+}
+#endif
+
+[[maybe_unused]] static void read_file(cfloat *dest, unsigned num, const char* filename)
+{   // Delegates to the float overload, two values per element. NOTE(review): no __AIE_ARCH__ guard here, unlike write_file(const cfloat*) - confirm.
+    read_file(reinterpret_cast<float *>(dest), num * 2, filename);
+}
+
+#endif // __AIE_API_TESTS_IO_HELPERS_HPP__
diff --git a/tools/aie-chess-simulation/mixed_bfp16_bf16/sim.tcl b/tools/aie-chess-simulation/mixed_bfp16_bf16/sim.tcl
new file mode 100644
index 00000000000..adad53661d5
--- /dev/null
+++ b/tools/aie-chess-simulation/mixed_bfp16_bf16/sim.tcl
@@ -0,0 +1,13 @@
+proc my_load_program {file} { # Load program $file into the ISS with -nmlpath taken from env(me_DIR).
+    set me_DIR $::env(me_DIR)
+    iss program load $file -nmlpath $me_DIR -do_not_set_entry_pc 1 -pm_check first -load_offsets {}
+}
+# Script body: create the ISS, load the program named by the first script argument, run it (iss step -1, presumably to completion - TODO confirm), then print and exit with the program's exit code.
+iss::create %PROCESSORNAME% iss
+my_load_program [lindex $::iss::tcl_script_args 0]
+iss step -1
+set retcode [iss program query exit_code]
+puts -nonewline "@@ EXIT STATUS "
+puts $retcode
+exit $retcode
+
diff --git a/tools/aie-chess-simulation/mixed_bfp16_bf16/test.cc b/tools/aie-chess-simulation/mixed_bfp16_bf16/test.cc
new file mode 100755
index 00000000000..b0361fd351d
--- /dev/null
+++ b/tools/aie-chess-simulation/mixed_bfp16_bf16/test.cc
@@ -0,0 +1,84 @@
+#include "helper.h"
+#include "aie_kernel_utils.h"
+
+
+void single_mac_8x8x8(bfloat16 *__restrict inA, // 64 bfloat16 values, read with load_v<64>
+                      bfp16ebs8 *__restrict inB, // source of one 64-element bfp16ebs8 block vector
+                      bfloat16 *__restrict outC) { // receives the 64 bfloat16 results
+  aie::vector<bfloat16, 64> A_data_bf16 = aie::load_v<64>(inA);
+  aie::accum<accfloat, 64> A_data_float;
+  A_data_float = A_data_bf16;
+  aie::block_vector<bfp16ebs8, 64> A_data_bfp = A_data_float.to_vector<bfp16ebs8>();
+  // A goes bfloat16 -> accfloat accumulator -> bfp16ebs8 above; B is popped from the stream as bfp16ebs8 directly.
+  aie::block_vector_input_buffer_stream<bfp16ebs8, 64> pB_stream(inB);
+  aie::block_vector<bfp16ebs8, 64> B_data = pB_stream.pop();
+  aie::accum<accfloat, 64> acc_data = aie::zeros<accfloat, 64>();
+  // chess_report dumps the intermediate values; mac_8x8_8x8T presumably computes acc + A * B^T - TODO confirm.
+  chess_report(A_data_bfp);
+  chess_report(B_data);
+  acc_data = mac_8x8_8x8T(A_data_bfp, B_data, acc_data);
+  chess_report(acc_data);
+  aie::vector<bfloat16, 64> C_data = acc_data.template to_vector<bfloat16>();
+  chess_report(C_data);
+  aie::store_v(outC, C_data);
+}
+
+
+constexpr int M = 8;  constexpr int K = 8;  constexpr int N = 8; // matrix dimensions used by main
+constexpr int m = 8;  constexpr int k = 8;  constexpr int n = 8; // not referenced elsewhere in this file
+constexpr int r = 8;   constexpr int s = 8;   constexpr int t = 8; // not referenced elsewhere in this file
+
+
+
+int main()
+{
+
+  // Builds 8x8 test matrices, writes them and the float golden product to
+  // output/, runs single_mac_8x8x8 once and writes its result to output/C.txt.
+  // NOTE(review): C.txt and Gold.txt are only written here, never compared.
+  printf("test start ...\n");
+
+  // constexpr sizes give the aligned arrays below constant bounds (no VLAs).
+  constexpr int A_SIZE = M * K;
+  constexpr int B_SIZE = N * K;
+  constexpr int C_SIZE = M * N;
+
+  // std::vector owns the host buffers, so there is no unchecked malloc
+  // result and nothing to free on the way out.
+  std::vector<float> A_float(A_SIZE);
+  std::vector<float> B_float(B_SIZE);
+  for (int i = 0; i < A_SIZE; i++) {
+    A_float[i] = i % 8;
+  }
+  for (int i = 0; i < B_SIZE; i++) {
+    B_float[i] = i % 8;
+  }
+
+  // Test layout transpose function
+  printf("Testing layout transpose...\n");
+  std::vector<float> B_transposed(B_SIZE);
+  layout_transpose_8x8block(B_float.data(), B_transposed.data(), N, K);
+
+  std::vector<float> Gold_float(C_SIZE);
+  calc_golden_result(A_float.data(), B_float.data(), Gold_float.data(), M, K, N);
+
+  print_matrix_float("output/A.txt", A_float.data(), M, K);
+  print_matrix_float("output/B.txt", B_float.data(), N, K);
+  print_matrix_float("output/B_transposed.txt", B_transposed.data(), N, K);
+  print_matrix_float("output/Gold.txt", Gold_float.data(), M, N);
+
+  alignas(aie::vector_decl_align) bfloat16 A_bfloat16[A_SIZE];
+  for (int i = 0; i < A_SIZE; i++) {
+    A_bfloat16[i] = (bfloat16)A_float[i];
+  }
+
+  // The kernel's B operand is quantized from the transposed copy, 8 values per block.
+  std::vector<uint8_t> B_bfp16ebs8 = floatToBfp16(8, B_SIZE, B_transposed.data(), 0);
+  alignas(aie::vector_decl_align) bfloat16 C_bfloat16[C_SIZE];
+  single_mac_8x8x8(A_bfloat16, (bfp16ebs8*)B_bfp16ebs8.data(), C_bfloat16);
+
+  print_matrix_bfloat16("output/C.txt", C_bfloat16, M, N);
+
+  printf("test done!\n");
+  return 0;
+}
diff --git a/tools/aie-chess-simulation/mixed_bfp16_bf16/test.prx b/tools/aie-chess-simulation/mixed_bfp16_bf16/test.prx
new file mode 100644
index 00000000000..2911f739a15
--- /dev/null
+++ b/tools/aie-chess-simulation/mixed_bfp16_bf16/test.prx
@@ -0,0 +1,8 @@
+<project name="Project" processor="me">
+    <file type="c" name="test.cc" path="."/>
+    <option id="backend.mist2.xargs" value="-ggraph +A" inherit="1"/>
+    <option id="cpp.include" value="." inherit="1"/>
+    <option id="cpp.define" value="__AIENGINE__" inherit="1"/>
+    <option id="project.name" value="test"/>
+    <option id="project.type" value="exe"/>
+</project>
diff --git a/tools/aie-chess-simulation/simple_test/Makefile b/tools/aie-chess-simulation/simple_test/Makefile
new file mode 100644
index 00000000000..a84e72a85d1
--- /dev/null
+++ b/tools/aie-chess-simulation/simple_test/Makefile
@@ -0,0 +1,15 @@
+all: build sim
+.PHONY : all build sim clean
+# NOTE(review): make reads `$.` in the build recipe as a reference to a variable named `.`; if that is unset, `-I $../../../..` becomes `-I ./../../..` - confirm the intended include root.
+build: 
+	xchesscc --aiearch aie2p -p me -C Release_LLVM -D__AIENGINE__ - -I ${AIETOOLS_ROOT}/include -I ${AIETOOLS_ROOT}/include/aie_api -P ${AIETOOLS_ROOT}/data/aie2p/lib -d -f -g +s +w work +o work -I. -I $../../../.. test.cc
+# sim runs work/a.out (the file passed to sim.tcl) in xca_udm_dbg.
+sim: 
+	xca_udm_dbg --aiearch aie2p -qf -T -P ${AIETOOLS_ROOT}/data/aie2p/lib -t "sim.tcl work/a.out"
+# clean removes work/ and files matching *txt, *mem, *.output.
+clean:
+	rm -rf work *txt *mem *.output
+
+
+
+
diff --git a/tools/aie-chess-simulation/simple_test/sim.tcl b/tools/aie-chess-simulation/simple_test/sim.tcl
new file mode 100644
index 00000000000..adad53661d5
--- /dev/null
+++ b/tools/aie-chess-simulation/simple_test/sim.tcl
@@ -0,0 +1,13 @@
+proc my_load_program {file} { # Load program $file into the ISS with -nmlpath taken from env(me_DIR).
+    set me_DIR $::env(me_DIR)
+    iss program load $file -nmlpath $me_DIR -do_not_set_entry_pc 1 -pm_check first -load_offsets {}
+}
+# Script body: create the ISS, load the program named by the first script argument, run it (iss step -1, presumably to completion - TODO confirm), then print and exit with the program's exit code.
+iss::create %PROCESSORNAME% iss
+my_load_program [lindex $::iss::tcl_script_args 0]
+iss step -1
+set retcode [iss program query exit_code]
+puts -nonewline "@@ EXIT STATUS "
+puts $retcode
+exit $retcode
+
diff --git a/tools/aie-chess-simulation/simple_test/test.cc b/tools/aie-chess-simulation/simple_test/test.cc
new file mode 100755
index 00000000000..871017df86b
--- /dev/null
+++ b/tools/aie-chess-simulation/simple_test/test.cc
@@ -0,0 +1,21 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+
+inline int kernel(int a)
+{
+    return 2 * a; // doubles the input
+}
+
+
+
+int main()
+{
+    printf("test start ...\n");
+    const int a = 1;
+    const int res = kernel(a);
+    // The kernel result now decides the exit code, so a wrong result fails the run.
+    printf("test done!\n");
+    return res == 2 * a ? 0 : 1;
+}
diff --git a/tools/aie-chess-simulation/simple_test/test.prx b/tools/aie-chess-simulation/simple_test/test.prx
new file mode 100644
index 00000000000..2911f739a15
--- /dev/null
+++ b/tools/aie-chess-simulation/simple_test/test.prx
@@ -0,0 +1,8 @@
+<project name="Project" processor="me">
+    <file type="c" name="test.cc" path="."/>
+    <option id="backend.mist2.xargs" value="-ggraph +A" inherit="1"/>
+    <option id="cpp.include" value="." inherit="1"/>
+    <option id="cpp.define" value="__AIENGINE__" inherit="1"/>
+    <option id="project.name" value="test"/>
+    <option id="project.type" value="exe"/>
+</project>