代码¶

tiny_matrix.hpp¶

/**
 * @file tiny_matrix.hpp
 * @author SHUAIWEN CUI (SHUAIWEN001@e.ntu.edu.sg)
 * @brief This file is the header file for the submodule matrix (advanced matrix operations) of the tiny_math middleware.
 *
 * Terminology conventions:
 * - pad / padding: number of padded elements at row end (not part of logical matrix values).
 * - step: total elements per physical row (logical + padding).
 * - stride: spacing for strided sampling in one row (general concept in low-level kernels).
 *   In this matrix class, row indexing uses `step`; column sampling spacing is fixed to 1.
 *   The field name `stride` exists only as a deprecated backward-compatible alias of `step`.
 * @version 1.0
 * @date 2025-04-17
 * @note This file is built on top of the mat.h file from the ESP-DSP library.
 *
 */

#pragma once

/* DEPENDENCIES */
// TinyMath
#include "tiny_math_config.h"
#include "tiny_vec.h"
#include "tiny_mat.h"

// Standard Libraries
#include <iostream>
#include <stdint.h>

#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
// ESP32 DSP C++ Matrix library
#include "mat.h"
#endif

/* STATEMENTS */
namespace tiny
{
    class Mat
    {
    public:
        // ============================================================================
        // Matrix Metadata
        // ============================================================================
        int row;         //< number of rows
        int col;         //< number of columns
        int pad;         //< placeholder elements in storage; not used in math ops
        union
        {
            int step;    //< total elements per row (logical + padding)
            int stride;  //< deprecated backward-compatible alias for `step` (not sampling stride)
        };
        int element;     //< number of elements = rows * cols
        int memory;      //< size of the data buffer = rows * step
        float *data;     //< pointer to the data buffer
        float *temp;     //< pointer to the temporary data buffer
        bool ext_buff;   //< flag indicates that matrix use external buffer
        bool sub_matrix; //< flag indicates that matrix is a subset of another matrix

        // ============================================================================
        // Rectangular ROI Structure
        // ============================================================================
        /**
         * @name Region of Interest (ROI) Structure
         * @brief This is the structure for ROI
         */
        struct ROI
        {
            int pos_x;  ///< starting column index
            int pos_y;  ///< starting row index
            int width;  ///< width of ROI (columns)
            int height; ///< height of ROI (rows)

            ROI(int pos_x = 0, int pos_y = 0, int width = 0, int height = 0);
            void resize_roi(int pos_x, int pos_y, int width, int height);
            int area_roi(void) const;
        };

        // ============================================================================
        // Printing Functions
        // ============================================================================
        void print_info() const;
        void print_matrix(bool show_padding) const;

        // ============================================================================
        // Constructors & Destructor
        // ============================================================================
        /**
         * @brief Allocate memory for the matrix according to the memory required.
         * @note For ESP32, it will automatically determine if using RAM or PSRAM based on the size of the matrix.
         * @note This function sets ext_buff to false and allocates memory based on row * step.
         *       If allocation fails or parameters are invalid, data will be set to nullptr.
         */
        void alloc_mem();

        /**
         * @brief Default constructor: create a 1x1 matrix with only a zero element.
         * @note If memory allocation fails, the object will be in an invalid state (data = nullptr).
         *       Caller should check the data pointer before using the matrix.
         */
        Mat();

        /**
         * @brief Constructor - create a matrix with the specified number of rows and columns.
         * @param rows Number of rows
         * @param cols Number of columns
         */
        Mat(int rows, int cols);

        /**
         * @brief Constructor - create a matrix with the specified number of rows, columns and step.
         * @param rows Number of rows
         * @param cols Number of columns
         * @param step Total number of elements in a row (including padding)
         */
        Mat(int rows, int cols, int step);

        /**
         * @brief Constructor - create a matrix with the specified number of rows, columns and external data.
         * @param data Pointer to external data buffer
         * @param rows Number of rows
         * @param cols Number of columns
         */
        Mat(float *data, int rows, int cols);

        /**
         * @brief Constructor - create a matrix with the specified number of rows, columns and external data.
         * @param data Pointer to external data buffer
         * @param rows Number of rows
         * @param cols Number of columns
         * @param step Total number of elements in a row (including padding)
         */
        Mat(float *data, int rows, int cols, int step);

        /**
         * @brief Copy constructor - create a matrix with the same properties as the source matrix.
         * @param src Source matrix
         */
        Mat(const Mat &src);

        /**
         * @brief Destructor - free the memory allocated for the matrix.
         */
        ~Mat();

        // ============================================================================
        // Element Access
        // ============================================================================
        inline float &operator()(int row, int col) { return data[row * step + col]; }
        inline const float &operator()(int row, int col) const { return data[row * step + col]; }

        // ============================================================================
        // Data Manipulation
        // ============================================================================
        tiny_error_t copy_paste(const Mat &src, int row_pos, int col_pos);
        tiny_error_t copy_head(const Mat &src);
        Mat view_roi(int start_row, int start_col, int roi_rows, int roi_cols) const;
        Mat view_roi(const Mat::ROI &roi) const;
        Mat copy_roi(int start_row, int start_col, int height, int width);
        Mat copy_roi(const Mat::ROI &roi);
        Mat block(int start_row, int start_col, int block_rows, int block_cols);
        void swap_rows(int row1, int row2);
        void swap_cols(int col1, int col2);
        void clear(void);

        // ============================================================================
        // Arithmetic Operators
        // ============================================================================
        Mat &operator=(const Mat &src);    // Copy assignment
        Mat &operator+=(const Mat &A);     // Add matrix
        Mat &operator+=(float C);          // Add constant
        Mat &operator-=(const Mat &A);     // Subtract matrix
        Mat &operator-=(float C);          // Subtract constant 
        Mat &operator*=(const Mat &A);     // Multiply matrix
        Mat &operator*=(float C);          // Multiply constant
        Mat &operator/=(const Mat &B);     // Divide matrix
        Mat &operator/=(float C);          // Divide constant
        Mat operator^(int C);              // Exponentiation

        // ============================================================================
        // Linear Algebra - Basic Operations
        // ============================================================================
        Mat transpose();                   // Transpose matrix
        float determinant();               // Compute determinant (auto-selects method based on size)
        float determinant_laplace();        // Compute determinant using Laplace expansion (O(n!), for small matrices)
        float determinant_lu();            // Compute determinant using LU decomposition (O(n³), efficient for large matrices)
        float determinant_gaussian();      // Compute determinant using Gaussian elimination (O(n³), efficient for large matrices)
        Mat adjoint();                     // Compute adjoint matrix
        Mat inverse_adjoint();            // Compute inverse using adjoint method
        void normalize();                  // Normalize matrix
        float norm() const;                // Compute matrix norm
        float dotprod(const Mat &A, const Mat &B);  // Dot product

        // ============================================================================
        // Linear Algebra - Matrix Utilities
        // ============================================================================
        static Mat eye(int size);          // Create identity matrix
        static Mat ones(int rows, int cols);  // Create matrix filled with ones
        static Mat ones(int size);         // Create square matrix filled with ones
        static Mat augment(const Mat &A, const Mat &B);  // Horizontal concatenation [A | B]
        static Mat vstack(const Mat &A, const Mat &B);   // Vertical concatenation [A; B]

        /**
         * @brief Gram-Schmidt orthogonalization process
         * @note Orthogonalizes a set of vectors using the Gram-Schmidt process
         * @param vectors Input matrix where each column is a vector to be orthogonalized
         * @param orthogonal_vectors Output matrix for orthogonalized vectors (each column is orthogonal)
         * @param coefficients Output matrix for projection coefficients (R matrix in QR decomposition)
         * @param tolerance Minimum norm threshold for linear independence check
         * @return true if successful, false if input is invalid
         */
        static bool gram_schmidt_orthogonalize(const Mat &vectors, Mat &orthogonal_vectors, 
                                               Mat &coefficients, float tolerance = 1e-6f);

        // ============================================================================
        // Linear Algebra - Matrix Operations
        // ============================================================================
        Mat minor(int target_row, int target_col) const;       // Submatrix after removing one row + one column (works on any m x n matrix)
        Mat cofactor(int target_row, int target_col) const;    // Cofactor submatrix; requires the matrix to be square
        Mat gaussian_eliminate() const;    // Gaussian elimination
        Mat row_reduce_from_gaussian() const;   // REF -> RREF via back-substitution
        Mat inverse_gje() const;           // Inverse using Gauss-Jordan elimination on [A | I]

        // ============================================================================
        // Linear Algebra - Linear System Solving
        // ============================================================================
        Mat solve(const Mat &A, const Mat &b) const;  // Solve Ax = b using Gaussian elimination
        Mat band_solve(Mat A, Mat b, int k) const;    // Solve banded system
        Mat roots(Mat A, Mat y) const;                // Alternative solve method

        // ============================================================================
        // Matrix Decomposition
        // ============================================================================
        // Forward declarations (structures defined after class)
        struct LUDecomposition;
        struct CholeskyDecomposition;
        struct QRDecomposition;
        struct SVDDecomposition;

        // Matrix property checks
        /**
         * @brief Check if the matrix is symmetric within a given tolerance.
         * @param tolerance Maximum allowed difference between A(i,j) and A(j,i) (must be >= 0)
         * @return true if matrix is symmetric, false otherwise
         */
        bool is_symmetric(float tolerance = 1e-6f) const;

        /**
         * @brief Check if matrix is positive definite using Sylvester's criterion.
         * @param tolerance Tolerance for numerical checks (must be >= 0)
         * @param max_minors_to_check Maximum number of leading principal minors to check.
         *                            - If -1: check all minors (complete Sylvester's criterion)
         *                            - If > 0: check first max_minors_to_check minors
         * @return true if matrix is positive definite, false otherwise
         */
        bool is_positive_definite(float tolerance = 1e-6f, int max_minors_to_check = -1) const;

        // Decomposition methods
        LUDecomposition lu_decompose(bool use_pivoting = true) const;
        CholeskyDecomposition cholesky_decompose() const;
        QRDecomposition qr_decompose() const;
        SVDDecomposition svd_decompose(int max_iter = 100, float tolerance = 1e-6f) const;

        // Solve using decomposition (more efficient for multiple RHS)
        static Mat solve_lu(const LUDecomposition &lu, const Mat &b);
        static Mat solve_cholesky(const CholeskyDecomposition &chol, const Mat &b);
        static Mat solve_qr(const QRDecomposition &qr, const Mat &b);  // Least squares solution

        // Pseudo-inverse using SVD (for rank-deficient or non-square matrices)
        static Mat pseudo_inverse(const SVDDecomposition &svd, float tolerance = 1e-6f);

        // ============================================================================
        // Eigenvalue & Eigenvector Decomposition
        // ============================================================================
        // Forward declarations (structures defined after class)
        struct EigenPair;
        struct EigenDecomposition;

        // Single eigenvalue methods (fast, for real-time applications)
        /**
         * @brief Compute the dominant (largest magnitude) eigenvalue and eigenvector using power iteration.
         * @param max_iter Maximum number of iterations (must be > 0)
         * @param tolerance Convergence tolerance (must be >= 0). Convergence when |λ_k - λ_{k-1}| < tolerance * |λ_k|
         * @return EigenPair containing the dominant eigenvalue, eigenvector, and status
         */
        EigenPair power_iteration(int max_iter = 1000, float tolerance = 1e-6f) const;

        /**
         * @brief Compute the smallest (minimum magnitude) eigenvalue and eigenvector using inverse power iteration.
         * @param max_iter Maximum number of iterations (must be > 0)
         * @param tolerance Convergence tolerance (must be >= 0). Convergence when |λ_k - λ_{k-1}| < tolerance * max(|λ_k|, 1)
         * @return EigenPair containing the smallest eigenvalue, eigenvector, and status
         * @note The matrix must be invertible (non-singular) for this method to work.
         */
        EigenPair inverse_power_iteration(int max_iter = 1000, float tolerance = 1e-6f) const;

        // Complete eigendecomposition methods
        /**
         * @brief Compute complete eigenvalue decomposition using Jacobi method for symmetric matrices.
         * @param tolerance Convergence tolerance (must be >= 0). Convergence when max off-diagonal < tolerance
         * @param max_iter Maximum number of iterations (must be > 0)
         * @return EigenDecomposition containing all eigenvalues, eigenvectors, and status
         * @note Best for symmetric matrices. Matrix should be symmetric for best results.
         */
        EigenDecomposition eigendecompose_jacobi(float tolerance = 1e-6f, int max_iter = 100) const;

        /**
         * @brief Compute complete eigenvalue decomposition using QR algorithm for general matrices.
         * @param max_iter Maximum number of QR iterations (must be > 0)
         * @param tolerance Convergence tolerance (must be >= 0). Convergence when subdiagonal < tolerance
         * @return EigenDecomposition containing eigenvalues, eigenvectors, and status
         * @note Supports non-symmetric matrices, but may have complex eigenvalues (only real part returned).
         */
        EigenDecomposition eigendecompose_qr(int max_iter = 100, float tolerance = 1e-6f) const;

        /**
         * @brief Automatic eigenvalue decomposition with method selection.
         * @param tolerance Convergence tolerance (must be >= 0)
         * @param max_iter Maximum number of iterations (must be > 0, default = 100)
         * @return EigenDecomposition containing eigenvalues, eigenvectors, and status
         * @note Automatically selects Jacobi method for symmetric matrices, QR algorithm for general matrices.
         */
        EigenDecomposition eigendecompose(float tolerance = 1e-6f, int max_iter = 100) const;

    protected:

    private:

    };

    // ============================================================================
    // Matrix Decomposition Structures
    // ============================================================================
    /**
     * @brief Structure to hold LU decomposition results
     * @note A = L * U, where L is lower triangular and U is upper triangular
     */
    struct Mat::LUDecomposition
    {
        Mat L;                 ///< Lower triangular matrix (with unit diagonal)
        Mat U;                 ///< Upper triangular matrix
        Mat P;                 ///< Permutation matrix (if pivoting used)
        bool pivoted;          ///< Whether pivoting was used
        tiny_error_t status;   ///< Computation status

        LUDecomposition();
    };

    /**
     * @brief Structure to hold Cholesky decomposition results
     * @note A = L * L^T, where L is lower triangular (for symmetric positive definite matrices)
     */
    struct Mat::CholeskyDecomposition
    {
        Mat L;                 ///< Lower triangular matrix
        tiny_error_t status;   ///< Computation status

        CholeskyDecomposition();
    };

    /**
     * @brief Structure to hold QR decomposition results
     * @note A = Q * R, where Q is orthogonal and R is upper triangular
     */
    struct Mat::QRDecomposition
    {
        Mat Q;                 ///< Orthogonal matrix (Q^T * Q = I)
        Mat R;                 ///< Upper triangular matrix
        tiny_error_t status;   ///< Computation status

        QRDecomposition();
    };

    /**
     * @brief Structure to hold SVD decomposition results
     * @note A = U * S * V^T, where U and V are orthogonal, S is diagonal (singular values)
     */
    struct Mat::SVDDecomposition
    {
        Mat U;                 ///< Left singular vectors (orthogonal matrix)
        Mat S;                 ///< Singular values (diagonal matrix or vector)
        Mat V;                 ///< Right singular vectors (orthogonal matrix, V^T)
        int rank;              ///< Numerical rank of the matrix
        int iterations;        ///< Number of iterations performed
        tiny_error_t status;   ///< Computation status

        SVDDecomposition();
    };

    // ============================================================================
    // Eigenvalue & Eigenvector Decomposition Structures
    // ============================================================================
    /**
     * @brief Structure to hold a single eigenvalue-eigenvector pair
     * @note Used primarily for power iteration method
     */
    struct Mat::EigenPair
    {
        float eigenvalue;      ///< Eigenvalue (real part)
        Mat eigenvector;       ///< Corresponding eigenvector (column vector)
        int iterations;        ///< Number of iterations performed
        tiny_error_t status;   ///< Computation status

        EigenPair();
    };

    /**
     * @brief Structure to hold complete eigenvalue decomposition results
     * @note Contains all eigenvalues and eigenvectors
     */
    struct Mat::EigenDecomposition
    {
        Mat eigenvalues;       ///< Eigenvalues (diagonal matrix or vector)
        Mat eigenvectors;      ///< Eigenvector matrix (each column is an eigenvector)
        int iterations;        ///< Number of iterations performed
        tiny_error_t status;   ///< Computation status

        EigenDecomposition();
    };

    // ============================================================================
    // Stream Operators
    // ============================================================================
    std::ostream &operator<<(std::ostream &os, const Mat &m);
    std::ostream &operator<<(std::ostream &os, const Mat::ROI &roi);
    std::istream &operator>>(std::istream &is, Mat &m);

    // ============================================================================
    // Global Arithmetic Operators
    // ============================================================================
    Mat operator+(const Mat &A, const Mat &B);
    Mat operator+(const Mat &A, float C);
    Mat operator-(const Mat &A, const Mat &B);
    Mat operator-(const Mat &A, float C);
    Mat operator*(const Mat &A, const Mat &B);
    Mat operator*(const Mat &A, float C);
    Mat operator*(float C, const Mat &A);
    Mat operator/(const Mat &A, float C);
    Mat operator/(const Mat &A, const Mat &B);
    bool operator==(const Mat &A, const Mat &B);

}

tiny_matrix.cpp¶

/**
 * @file tiny_matrix.cpp
 * @author SHUAIWEN CUI (SHUAIWEN001@e.ntu.edu.sg)
 * @brief This file is the source file for the submodule matrix (advanced matrix operations) of the tiny_math middleware.
 * @version 1.0
 * @date 2025-04-17
 * @copyright Copyright (c) 2025
 *
 */

/* DEPENDENCIES */
// TinyMath
#include "tiny_matrix.hpp"

// Standard Libraries
#include <cstring>
#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <cmath>
#include <cinttypes>
#include <iomanip>
#include <vector>
#include <limits>

/* LIBRARY CONTENTS */
namespace tiny
{
    // ============================================================================
    // Rectangular ROI Structure
    // ============================================================================
    /**
     * @brief Construct a new Mat::ROI object.
     *
     * @param pos_x
     * @param pos_y
     * @param width
     * @param height
     */
    Mat::ROI::ROI(int pos_x, int pos_y, int width, int height)
    {
        this->pos_x = pos_x;
        this->pos_y = pos_y;
        this->width = width;
        this->height = height;
    }

    /**
     * @brief resize the ROI structure
     * 
     * @param pos_x starting column
     * @param pos_y starting row
     * @param width number of columns
     * @param height number of rows
     */
    void Mat::ROI::resize_roi(int pos_x, int pos_y, int width, int height)
    {
        this->pos_x = pos_x;
        this->pos_y = pos_y;
        this->width = width;
        this->height = height;
    }

    /**
     * @brief calculate the area of the ROI structure - how many elements covered
     * 
     * @return int 
     */
    int Mat::ROI::area_roi(void) const
    {
        if (this->width < 0 || this->height < 0)
        {
            return 0;
        }
        return this->width * this->height;
    }

    // ============================================================================
    // Printing Functions
    // ============================================================================
    /**
     * @name Mat::print_info()
     * @brief Print the header of the matrix.
     */
    void Mat::print_info() const
    {
        std::cout << "Matrix Info >>>\n";

        // Basic matrix metadata
        std::cout << "rows            " << this->row << "\n";
        std::cout << "cols            " << this->col << "\n";
        std::cout << "elements        " << this->element;

        // Check if elements match rows * cols
        if (this->element != this->row * this->col)
        {
            std::cout << "   [Warning] Mismatch! Expected: " << (this->row * this->col);
        }
        std::cout << "\n";

        std::cout << "paddings        " << this->pad << "\n";
        std::cout << "step            " << this->step << "\n";
        std::cout << "memory          " << this->memory << "\n";

        // Pointer information
        std::cout << "data pointer    " << static_cast<const void *>(this->data) << "\n";
        std::cout << "temp pointer    " << static_cast<const void *>(this->temp) << "\n";

        // Flags information
        std::cout << "ext_buff        " << this->ext_buff;
        if (this->ext_buff)
        {
            std::cout << "   (External buffer or View)";
        }
        std::cout << "\n";

        std::cout << "sub_matrix      " << this->sub_matrix;
        if (this->sub_matrix)
        {
            std::cout << "   (This is a Sub-Matrix View)";
        }
        std::cout << "\n";

        // State warnings
        if (this->sub_matrix && !this->ext_buff)
        {
            std::cout << "[Warning] Sub-matrix is marked but ext_buff is false! Potential logic error.\n";
        }

        if (this->data == nullptr)
        {
            std::cout << "[Info] No data buffer assigned to this matrix.\n";
        }

        std::cout << "<<< Matrix Info\n";
    }

    /**
     * @name Mat::print_matrix()
     * @brief Print the matrix elements.
     *
     * @param show_padding If true, print the padding elements as well.
     */
    void Mat::print_matrix(bool show_padding) const
    {
        if (this->data == nullptr)
        {
            std::cout << "[Error] Cannot print matrix: data pointer is null.\n";
            return;
        }

        if (this->row < 0 || this->col < 0 || this->step < 0)
        {
            std::cout << "[Error] Invalid matrix dimensions\n";
            return;
        }

        const int print_cols = (this->step < this->col) ? this->step : this->col;
        if (this->step < this->col)
        {
            std::cout << "[Warning] step < cols; printing only the first " << print_cols
                      << " column(s) per row to avoid out-of-bounds access.\n";
        }

        std::cout << "Matrix Elements >>>\n";
        for (int i = 0; i < this->row; ++i)
        {
            // print the non-padding elements (bounded by step)
            for (int j = 0; j < print_cols; ++j)
            {
                std::cout << std::setw(12) << this->data[i * this->step + j] << " ";
            }

            // if padding is enabled, print the padding elements
            if (show_padding)
            {
                // print a separator first
                std::cout << "      |";

                // print the padding elements
                for (int j = this->col; j < this->step; ++j)
                {
                    if (j == this->col)
                    {
                        std::cout << std::setw(7) << this->data[i * this->step + j] << " ";
                    }
                    else
                    {
                        // print the padding elements
                        std::cout << std::setw(12) << this->data[i * this->step + j] << " ";
                    }
                }
            }

            // print a new line after each row
            std::cout << "\n";
        }

        std::cout << "<<< Matrix Elements\n";
        std::cout << std::endl;
    }

    // ============================================================================
    // Constructors & Destructor
    // ============================================================================
    // memory allocation
    /**
     * @name Mat::alloc_mem()
     * @brief Allocate memory for the matrix according to the memory required.
     * @note For ESP32, it will automatically determine if using RAM or PSRAM based on the size of the matrix.
     * @note This function sets ext_buff to false and allocates memory based on row * step.
     *       If allocation fails or parameters are invalid, data will be set to nullptr.
     */
    void Mat::alloc_mem()
    {
        // Parameter validation: check if row and step are non-negative
        if (this->row < 0 || this->step < 0)
        {
            std::cerr << "[Error] Invalid matrix dimensions in alloc_mem(): row=" << this->row 
                      << ", step=" << this->step << "\n";
            this->data = nullptr;
            this->ext_buff = false;
            this->memory = 0;
            return;
        }

        // Check for integer overflow: row * step might overflow
        if (this->row > 0 && this->step > INT_MAX / this->row)
        {
            std::cerr << "[Error] Matrix size too large, integer overflow: row=" << this->row 
                      << ", step=" << this->step << "\n";
            this->data = nullptr;
            this->ext_buff = false;
            this->memory = 0;
            return;
        }

        this->ext_buff = false;
        this->memory = this->row * this->step;

        // Handle empty matrix case (memory = 0)
        if (this->memory == 0)
        {
            this->data = nullptr;
            return;
        }

        // Use nothrow new to return nullptr on failure instead of throwing exception
        // This allows callers to check for nullptr, which is consistent with existing code
        this->data = new(std::nothrow) float[this->memory];

        // If allocation failed, data will be nullptr (caller should check)
        if (this->data == nullptr)
        {
            this->memory = 0;
        }
    }

    /**
     * @name Mat::Mat()
     * @brief Constructor - default constructor: create a 1x1 matrix with only a zero element.
     * @note If memory allocation fails, the object will be in an invalid state (data = nullptr).
     *       Caller should check the data pointer before using the matrix.
     */
    Mat::Mat()
        : row(1), col(1), pad(0), step(1), element(1), memory(1),
          data(nullptr), temp(nullptr),
          ext_buff(false), sub_matrix(false)
    {
        // memory will be recalculated by alloc_mem() based on row * step
        alloc_mem();
        if (this->data == nullptr && this->memory > 0)
        {
            std::cerr << "[>>> Error ! <<<] Memory allocation failed in alloc_mem()\n";
            // Memory allocation failed, object is in invalid state (data = nullptr)
            // Caller should check data pointer before using the matrix
            return;
        }
        // Initialize all elements to zero
        std::memset(this->data, 0, this->memory * sizeof(float));
    }

    /**
     * @name Mat::Mat(int rows, int cols)
     * @brief Constructor - create a matrix with the specified number of rows and columns.
     * @param rows Number of rows (must be non-negative)
     * @param cols Number of columns (must be non-negative)
     * @note If rows or cols is negative, the object will be in an invalid state.
     * @note If memory allocation fails, the object will be in an invalid state (data = nullptr).
     *       Caller should check the data pointer before using the matrix.
     */
    Mat::Mat(int rows, int cols)
        : row(rows), col(cols), pad(0), step(cols),
          element(rows * cols), memory(rows * cols),
          data(nullptr), temp(nullptr),
          ext_buff(false), sub_matrix(false)
    {
        // Parameter validation: check if rows and cols are non-negative
        if (rows < 0 || cols < 0)
        {
            std::cerr << "[Error] Invalid matrix dimensions: rows=" << rows 
                      << ", cols=" << cols << " (must be non-negative)\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        // Check for integer overflow: rows * cols might overflow
        if (rows > 0 && cols > INT_MAX / rows)
        {
            std::cerr << "[Error] Matrix size too large, integer overflow: rows=" << rows 
                      << ", cols=" << cols << "\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        // Empty matrix is a valid state; keep data == nullptr without allocation.
        if (rows == 0 || cols == 0)
        {
            this->element = 0;
            this->memory = 0;
            this->data = nullptr;
            return;
        }

        // memory will be recalculated by alloc_mem() based on row * step
        alloc_mem();
        if (this->data == nullptr && this->memory > 0)
        {
            std::cerr << "[>>> Error ! <<<] Memory allocation failed in alloc_mem()\n";
            // Memory allocation failed, object is in invalid state (data = nullptr)
            // Caller should check data pointer before using the matrix
            return;
        }
        // Initialize all elements to zero
        std::memset(this->data, 0, this->memory * sizeof(float));
    }
    /**
     * @name Mat::Mat(int rows, int cols, int step)
     * @brief Constructor - create a matrix with the specified number of rows, columns and step.
     * @param rows Number of rows (must be non-negative)
     * @param cols Number of columns (must be non-negative)
     * @param step Total number of elements in a row (must be >= cols)
     * @note If rows, cols is negative, or step < cols, the object will be in an invalid state.
     * @note If memory allocation fails, the object will be in an invalid state (data = nullptr).
     *       Caller should check the data pointer before using the matrix.
     */
    Mat::Mat(int rows, int cols, int step)
        : row(rows), col(cols), pad(step - cols), step(step),
          element(rows * cols), memory(rows * step),
          data(nullptr), temp(nullptr),
          ext_buff(false), sub_matrix(false)
    {
        // Parameter validation: check if rows, cols are non-negative
        if (rows < 0 || cols < 0)
        {
            std::cerr << "[Error] Invalid matrix dimensions: rows=" << rows 
                      << ", cols=" << cols << " (must be non-negative)\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        // Validate step: must be >= cols (padding cannot be negative)
        if (step < cols)
        {
            std::cerr << "[Error] Invalid step: step=" << step 
                      << " must be >= cols=" << cols << "\n";
            this->data = nullptr;
            this->memory = 0;
            this->pad = 0;  // Reset pad to avoid negative value
            return;
        }

        // Check for integer overflow: rows * cols and rows * step might overflow
        if (rows > 0 && cols > INT_MAX / rows)
        {
            std::cerr << "[Error] Matrix size too large, integer overflow: rows=" << rows 
                      << ", cols=" << cols << "\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        if (rows > 0 && step > INT_MAX / rows)
        {
            std::cerr << "[Error] Matrix size too large, integer overflow: rows=" << rows 
                      << ", step=" << step << "\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        // Empty matrix is a valid state; keep data == nullptr without allocation.
        if (rows == 0 || cols == 0)
        {
            this->element = 0;
            this->memory = 0;
            this->data = nullptr;
            return;
        }

        // memory will be recalculated by alloc_mem() based on row * step
        alloc_mem();
        if (this->data == nullptr && this->memory > 0)
        {
            std::cerr << "[>>> Error ! <<<] Memory allocation failed in alloc_mem()\n";
            // Memory allocation failed, object is in invalid state (data = nullptr)
            // Caller should check data pointer before using the matrix
            return;
        }
        // Initialize all elements to zero
        std::memset(this->data, 0, this->memory * sizeof(float));
    }

    /**
     * @name Mat::Mat(float *data, int rows, int cols)
     * @brief Constructor - create a matrix with the specified number of rows, columns and external data.
     * @param data Pointer to external data buffer (can be nullptr for empty matrix)
     * @param rows Number of rows (must be non-negative)
     * @param cols Number of columns (must be non-negative)
     * @note This constructor does not allocate memory. The matrix uses the external buffer.
     * @note If rows or cols is negative, the object will be in an invalid state.
     * @note The caller is responsible for ensuring the buffer is large enough and valid.
     */
    Mat::Mat(float *data, int rows, int cols)
        : row(rows), col(cols), pad(0), step(cols),
          element(rows * cols), memory(rows * cols),
          data(data), temp(nullptr),
          ext_buff(true), sub_matrix(false)
    {
        // Parameter validation: check if rows and cols are non-negative
        if (rows < 0 || cols < 0)
        {
            std::cerr << "[Error] Invalid matrix dimensions: rows=" << rows 
                      << ", cols=" << cols << " (must be non-negative)\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        // Check for integer overflow: rows * cols might overflow
        if (rows > 0 && cols > INT_MAX / rows)
        {
            std::cerr << "[Error] Matrix size too large, integer overflow: rows=" << rows 
                      << ", cols=" << cols << "\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        // Note: data can be nullptr for empty matrix, but caller should ensure buffer validity
    }

    /**
     * @name Mat::Mat(float *data, int rows, int cols, int step)
     * @brief Constructor - create a matrix with the specified number of rows, columns and external data.
     * @param data Pointer to external data buffer (can be nullptr for empty matrix)
     * @param rows Number of rows (must be non-negative)
     * @param cols Number of columns (must be non-negative)
     * @param step Total number of elements in a row (must be >= cols)
     * @note This constructor does not allocate memory. The matrix uses the external buffer.
     * @note If rows, cols is negative, or step < cols, the object will be in an invalid state.
     * @note The caller is responsible for ensuring the buffer is large enough and valid.
     */
    Mat::Mat(float *data, int rows, int cols, int step)
        : row(rows), col(cols), pad(step - cols), step(step),
          element(rows * cols), memory(rows * step),
          data(data), temp(nullptr),
          ext_buff(true), sub_matrix(false)
    {
        // Parameter validation: check if rows, cols are non-negative
        if (rows < 0 || cols < 0)
        {
            std::cerr << "[Error] Invalid matrix dimensions: rows=" << rows 
                      << ", cols=" << cols << " (must be non-negative)\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        // Validate step: must be >= cols (padding cannot be negative)
        if (step < cols)
        {
            std::cerr << "[Error] Invalid step: step=" << step 
                      << " must be >= cols=" << cols << "\n";
            this->data = nullptr;
            this->memory = 0;
            this->pad = 0;  // Reset pad to avoid negative value
            return;
        }

        // Check for integer overflow: rows * cols and rows * step might overflow
        if (rows > 0 && cols > INT_MAX / rows)
        {
            std::cerr << "[Error] Matrix size too large, integer overflow: rows=" << rows 
                      << ", cols=" << cols << "\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        if (rows > 0 && step > INT_MAX / rows)
        {
            std::cerr << "[Error] Matrix size too large, integer overflow: rows=" << rows 
                      << ", step=" << step << "\n";
            this->data = nullptr;
            this->memory = 0;
            return;
        }

        // Note: data can be nullptr for empty matrix, but caller should ensure buffer validity
    }

    /**
     * @name Mat::Mat(const Mat &src)
     * @brief Copy constructor - create a matrix with the same properties as the source matrix.
     * @param src Source matrix
     * @note If source is a submatrix view (sub_matrix && ext_buff), performs shallow copy (shares data pointer).
     *       Otherwise, performs deep copy (allocates new memory and copies data).
     * @note If memory allocation fails, the object will be in an invalid state (data = nullptr).
     *       Caller should check the data pointer before using the matrix.
     * @warning Shallow copy: If source is destroyed, the copied matrix's data pointer will be invalid.
     */
    Mat::Mat(const Mat &src)
        : row(src.row), col(src.col), pad(src.pad), step(src.step),
          element(src.element), memory(src.memory),
          data(nullptr), temp(nullptr),
          ext_buff(false), sub_matrix(false)
    {
        if (src.sub_matrix && src.ext_buff)
        {
            // if the source is a view (submatrix), do shallow copy
            // WARNING: This creates a shared reference. If source is destroyed, this pointer becomes invalid.
            this->data = src.data;
            this->ext_buff = true;
            this->sub_matrix = true;
        }
        else
        {
            // otherwise do deep copy
            this->ext_buff = false;
            this->sub_matrix = false;

            if (src.data != nullptr)
            {
                alloc_mem();
                if (this->data == nullptr)
                {
                    std::cerr << "[Error] Memory allocation failed in alloc_mem()\n";
                    // Memory allocation failed, object is in invalid state (data = nullptr)
                    // Caller should check data pointer before using the matrix
                    return;
                }

                // Copy data row by row to handle different strides correctly
                // This ensures correct copying even if source and destination have different strides
                for (int i = 0; i < this->row; ++i)
                {
                    std::memcpy(
                        &this->data[i * this->step],
                        &src.data[i * src.step],
                        this->col * sizeof(float)
                    );
                }
            }
            // If src.data == nullptr, this->data remains nullptr (empty matrix)
        }
    }

    /**
     * @name ~Mat()
     * @brief Destructor - free the memory allocated for the matrix.
     * @note Only deletes memory if it was allocated by this object (ext_buff == false).
     *       External buffers are not deleted.
     * @note temp buffer is always deleted if it exists (assumed to be allocated by this object).
     */
    Mat::~Mat()
    {
        // Only delete data if it was allocated by this object (not external buffer)
        if (!this->ext_buff && this->data != nullptr)
        {
            delete[] this->data;
            this->data = nullptr;  // Set to nullptr after deletion (good practice)
        }

        // Delete temporary buffer if it exists
        // Note: temp is assumed to be allocated by this object, not external
        if (this->temp != nullptr)
        {
            delete[] this->temp;
            this->temp = nullptr;  // Set to nullptr after deletion (good practice)
        }
    }

    // ============================================================================
    // Element Access
    // ============================================================================
    // Already defined by inline functions in the header file

    // ============================================================================
    // Data Manipulation
    // ============================================================================

    /**
     * @name Mat::copy_paste(const Mat &src, int row_pos, int col_pos)
     * @brief Copy the elements of the source matrix into the destination matrix. 
     *        The dimension of the current matrix must be larger than the source matrix.
     * @brief This one does not share memory with the source matrix.
     * @param src Source matrix (must be valid and non-empty)
     * @param row_pos Start row position of the destination matrix (must be non-negative)
     * @param col_pos Start column position of the destination matrix (must be non-negative)
     * @return TINY_OK on success, TINY_ERR_INVALID_ARG on error
     */
    tiny_error_t Mat::copy_paste(const Mat &src, int row_pos, int col_pos)
    {
        // Check for null pointers
        if (this->data == nullptr)
        {
            std::cerr << "[Error] copy_paste: destination matrix data pointer is null\n";
            return TINY_ERR_INVALID_ARG;
        }
        if (src.data == nullptr)
        {
            std::cerr << "[Error] copy_paste: source matrix data pointer is null\n";
            return TINY_ERR_INVALID_ARG;
        }

        // Validate source matrix dimensions
        if (src.row <= 0 || src.col <= 0)
        {
            std::cerr << "[Error] copy_paste: invalid source matrix dimensions: rows=" 
                      << src.row << ", cols=" << src.col << "\n";
            return TINY_ERR_INVALID_ARG;
        }

        // Validate position parameters (must be non-negative)
        if (row_pos < 0 || col_pos < 0)
        {
            std::cerr << "[Error] copy_paste: invalid position: row_pos=" << row_pos 
                      << ", col_pos=" << col_pos << " (must be non-negative)\n";
            return TINY_ERR_INVALID_ARG;
        }

        // Validate destination matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] copy_paste: invalid destination matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return TINY_ERR_INVALID_ARG;
        }

        // Check if source matrix fits in destination at the specified position
        if ((row_pos + src.row) > this->row)
        {
            std::cerr << "[Error] copy_paste: source matrix exceeds destination row boundary: "
                      << "row_pos=" << row_pos << ", src.rows=" << src.row 
                      << ", dest.rows=" << this->row << "\n";
            return TINY_ERR_INVALID_ARG;
        }
        if ((col_pos + src.col) > this->col)
        {
            std::cerr << "[Error] copy_paste: source matrix exceeds destination column boundary: "
                      << "col_pos=" << col_pos << ", src.cols=" << src.col 
                      << ", dest.cols=" << this->col << "\n";
            return TINY_ERR_INVALID_ARG;
        }

        // Copy data row by row (handles different strides correctly)
        for (int r = 0; r < src.row; r++)
        {
            memcpy(&this->data[(r + row_pos) * this->step + col_pos], 
                   &src.data[r * src.step], 
                   src.col * sizeof(float));
        }

        return TINY_OK;
    }

    /**
     * @name Mat::copy_head(const Mat &src)
     * @brief Copy the header (metadata) of the source matrix into the destination matrix. 
     *        The data pointer is shared (shallow copy).
     * @param src Source matrix (must be valid)
     * @return TINY_OK on success, TINY_ERR_INVALID_ARG on error
     * @warning This function performs a SHALLOW COPY. The destination matrix shares the 
     *          data pointer with the source matrix. If the source matrix is destroyed, 
     *          the destination matrix's data pointer will become invalid.
     * @note The destination matrix marks ext_buff=true to indicate it does NOT own the data
     *       buffer, preventing double-free issues. Only the original owner (ext_buff=false)
     *       is responsible for memory deallocation.
     * @note The temp pointer is NOT shared (set to nullptr) to prevent double-free issues.
     *       Each object manages its own temp buffer independently.
     */
    tiny_error_t Mat::copy_head(const Mat &src)
    {
        // Check for null pointer
        if (src.data == nullptr)
        {
            std::cerr << "[Error] copy_head: source matrix data pointer is null\n";
            return TINY_ERR_INVALID_ARG;
        }

        // Delete current data if it was allocated by this object
        if (!this->ext_buff && this->data != nullptr)
        {
            delete[] this->data;
            this->data = nullptr;
        }

        // Delete current temp if it exists (assuming it was allocated by this object)
        if (this->temp != nullptr)
        {
            delete[] this->temp;
            this->temp = nullptr;
        }

        // Copy all metadata from source matrix
        this->row = src.row;
        this->col = src.col;
        this->element = src.element;
        this->pad = src.pad;
        this->step = src.step;
        this->memory = src.memory;

        // Share data pointer regardless of source ownership
        // This allows multiple matrices to reference the same data buffer
        this->data = src.data;
        this->sub_matrix = src.sub_matrix;

        // Key difference: ALWAYS set ext_buff=true for destination
        // This ensures only the original owner (src.ext_buff=false) will deallocate memory
        // The destination matrix becomes a view that does NOT own the data
        // This design eliminates double-free issues:
        // - Original owner: ext_buff=false, deletes memory on destruction
        // - All views: ext_buff=true, does NOT delete memory on destruction
        this->ext_buff = true;

        // Do NOT share temp pointer - temp is a temporary buffer that should not be shared
        // Setting temp to nullptr prevents double-free issues when either object is destroyed
        // Each object should manage its own temp buffer if needed
        this->temp = nullptr;

        return TINY_OK;
    }

    /**
     * @name Mat::view_roi(int start_row, int start_col, int roi_rows, int roi_cols)
     * @brief Make a shallow copy of ROI matrix. Create a view of the ROI matrix. 
     *        Low level function. Unlike ESP-DSP, it is not allowed to setup step here, 
     *        step is automatically calculated inside the function.
     * @param start_row Start row position of source matrix (must be non-negative)
     * @param start_col Start column position of source matrix (must be non-negative)
     * @param roi_rows Size of row elements of source matrix to copy (must be positive)
     * @param roi_cols Size of column elements of source matrix to copy (must be positive)
     * @return result matrix size roi_rows x roi_cols, or empty matrix on error
     * @warning The returned matrix is a VIEW (shallow copy) that shares data with the source matrix.
     *          If the source matrix is destroyed, the view's data pointer will become invalid.
     * @note The step of the result matrix is inherited from the source matrix.
     */
    Mat Mat::view_roi(int start_row, int start_col, int roi_rows, int roi_cols) const
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] view_roi: source matrix data pointer is null\n";
            return Mat();  // Return empty matrix as error indicator
        }

        // Validate source matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] view_roi: invalid source matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return Mat();
        }

        // Validate position parameters (must be non-negative)
        if (start_row < 0 || start_col < 0)
        {
            std::cerr << "[Error] view_roi: invalid position: start_row=" << start_row 
                      << ", start_col=" << start_col << " (must be non-negative)\n";
            return Mat();
        }

        // Validate ROI size parameters (must be positive)
        if (roi_rows <= 0 || roi_cols <= 0)
        {
            std::cerr << "[Error] view_roi: invalid ROI size: roi_rows=" << roi_rows 
                      << ", roi_cols=" << roi_cols << " (must be positive)\n";
            return Mat();
        }

        // Check if ROI fits within source matrix boundaries
        if ((start_row + roi_rows) > this->row)
        {
            std::cerr << "[Error] view_roi: ROI exceeds row boundary: start_row=" << start_row 
                      << ", roi_rows=" << roi_rows << ", source.rows=" << this->row << "\n";
            return Mat();
        }
        if ((start_col + roi_cols) > this->col)
        {
            std::cerr << "[Error] view_roi: ROI exceeds column boundary: start_col=" << start_col 
                      << ", roi_cols=" << roi_cols << ", source.cols=" << this->col << "\n";
            return Mat();
        }

        // Validate step: must be >= roi_cols (padding cannot be negative)
        if (this->step < roi_cols)
        {
            std::cerr << "[Error] view_roi: step < roi_cols: step=" << this->step 
                      << ", roi_cols=" << roi_cols << "\n";
            return Mat();
        }

        // Check for integer overflow
        if (roi_rows > 0 && this->step > INT_MAX / roi_rows)
        {
            std::cerr << "[Error] view_roi: integer overflow: roi_rows=" << roi_rows 
                      << ", step=" << this->step << "\n";
            return Mat();
        }
        if (roi_rows > 0 && roi_cols > INT_MAX / roi_rows)
        {
            std::cerr << "[Error] view_roi: integer overflow: roi_rows=" << roi_rows 
                      << ", roi_cols=" << roi_cols << "\n";
            return Mat();
        }

        // Create ROI view (shallow copy)
        Mat result;
        result.row = roi_rows;
        result.col = roi_cols;
        result.step = this->step;
        result.pad = this->step - roi_cols;  // Now guaranteed to be non-negative
        result.element = roi_rows * roi_cols;
        result.memory = roi_rows * this->step;
        result.data = this->data + (start_row * this->step + start_col);
        result.temp = nullptr;
        result.ext_buff = true;
        result.sub_matrix = true;

        return result;
    }

    /**
     * @name Mat::view_roi(const Mat::ROI &roi)
     * @brief Make a shallow copy of ROI matrix. Create a view of the ROI matrix using ROI structure.
     * @param roi Rectangular area of interest (roi.pos_x, roi.pos_y must be non-negative, 
     *            roi.width, roi.height must be positive)
     * @return result matrix size roi.height x roi.width, or empty matrix on error
     * @warning The returned matrix is a VIEW (shallow copy) that shares data with the source matrix.
     *          If the source matrix is destroyed, the view's data pointer will become invalid.
     * @note This is a convenience wrapper that calls view_roi(roi.pos_y, roi.pos_x, roi.height, roi.width).
     */
    Mat Mat::view_roi(const Mat::ROI &roi) const
    {
        return view_roi(roi.pos_y, roi.pos_x, roi.height, roi.width);
    }

    /**
     * @name Mat::copy_roi(int start_row, int start_col, int height, int width)
     * @brief Make a deep copy of matrix. Compared to view_roi(), this one is a deep copy, 
     *        not sharing memory with the source matrix.
     * @param start_row Start row position of source matrix to copy (must be non-negative)
     * @param start_col Start column position of source matrix to copy (must be non-negative)
     * @param height Size of row elements of source matrix to copy (must be positive)
     * @param width Size of column elements of source matrix to copy (must be positive)
     * @return result matrix size height x width, or empty matrix on error
     * @note The returned matrix is a DEEP COPY with its own memory. It is independent 
     *       of the source matrix and can be safely used after the source is destroyed.
     */
    Mat Mat::copy_roi(int start_row, int start_col, int height, int width)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] copy_roi: source matrix data pointer is null\n";
            return Mat();  // Return empty matrix as error indicator
        }

        // Validate source matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] copy_roi: invalid source matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return Mat();
        }

        // Validate position parameters (must be non-negative)
        if (start_row < 0 || start_col < 0)
        {
            std::cerr << "[Error] copy_roi: invalid position: start_row=" << start_row 
                      << ", start_col=" << start_col << " (must be non-negative)\n";
            return Mat();
        }

        // Validate size parameters (must be positive)
        if (height <= 0 || width <= 0)
        {
            std::cerr << "[Error] copy_roi: invalid size: height=" << height 
                      << ", width=" << width << " (must be positive)\n";
            return Mat();
        }

        // Check if ROI fits within source matrix boundaries
        if ((start_row + height) > this->row)
        {
            std::cerr << "[Error] copy_roi: ROI exceeds row boundary: start_row=" << start_row 
                      << ", height=" << height << ", source.rows=" << this->row << "\n";
            return Mat();
        }
        if ((start_col + width) > this->col)
        {
            std::cerr << "[Error] copy_roi: ROI exceeds column boundary: start_col=" << start_col 
                      << ", width=" << width << ", source.cols=" << this->col << "\n";
            return Mat();
        }

        // Create result matrix (deep copy)
        Mat result(height, width);

        // Check if result matrix was created successfully
        if (result.data == nullptr)
        {
            std::cerr << "[Error] copy_roi: failed to allocate memory for result matrix\n";
            return Mat();
        }

        // Deep copy the data from the source matrix row by row
        // This handles different step values correctly
        for (int r = 0; r < result.row; r++)
        {
            memcpy(&result.data[r * result.step], 
                   &this->data[(r + start_row) * this->step + start_col], 
                   result.col * sizeof(float));
        }

        return result;
    }

    /**
     * @name Mat::copy_roi(const Mat::ROI &roi)
     * @brief Make a deep copy of matrix using ROI structure. Compared to view_roi(), 
     *        this one is a deep copy, not sharing memory with the source matrix.
     * @param roi Rectangular area of interest (roi.pos_x, roi.pos_y must be non-negative, 
     *            roi.width, roi.height must be positive)
     * @return result matrix size roi.height x roi.width, or empty matrix on error
     * @note The returned matrix is a DEEP COPY with its own memory. It is independent 
     *       of the source matrix and can be safely used after the source is destroyed.
     * @note This is a convenience wrapper that calls copy_roi(roi.pos_y, roi.pos_x, roi.height, roi.width).
     */
    Mat Mat::copy_roi(const Mat::ROI &roi)
    {
        return copy_roi(roi.pos_y, roi.pos_x, roi.height, roi.width);
    }

    /**
     * @name Mat::block(int start_row, int start_col, int block_rows, int block_cols)
     * @brief Get a block (submatrix) of the matrix. This is a deep copy operation.
     * @param start_row Start row position of the block (must be non-negative)
     * @param start_col Start column position of the block (must be non-negative)
     * @param block_rows Number of rows in the block (must be positive)
     * @param block_cols Number of columns in the block (must be positive)
     * @return result matrix size block_rows x block_cols, or empty matrix on error
     * @note The returned matrix is a DEEP COPY with its own memory. It is independent 
     *       of the source matrix and can be safely used after the source is destroyed.
     * @note This function is similar to copy_roi(), but uses element-by-element access.
     *       For better performance with large blocks, consider using copy_roi() instead.
     */
    Mat Mat::block(int start_row, int start_col, int block_rows, int block_cols)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] block: source matrix data pointer is null\n";
            return Mat();  // Return empty matrix as error indicator
        }

        // Validate source matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] block: invalid source matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return Mat();
        }

        // Boundary check: validate position parameters (must be non-negative)
        if (start_row < 0 || start_col < 0)
        {
            std::cerr << "[Error] block: invalid position: start_row=" << start_row 
                      << ", start_col=" << start_col << " (must be non-negative)\n";
            return Mat();
        }

        // Boundary check: validate block size parameters (must be positive)
        if (block_rows <= 0 || block_cols <= 0)
        {
            std::cerr << "[Error] block: invalid block size: block_rows=" << block_rows 
                      << ", block_cols=" << block_cols << " (must be positive)\n";
            return Mat();
        }

        // Check if block fits within source matrix boundaries
        if ((start_row + block_rows) > this->row)
        {
            std::cerr << "[Error] block: block exceeds row boundary: start_row=" << start_row 
                      << ", block_rows=" << block_rows << ", source.rows=" << this->row << "\n";
            return Mat();
        }
        if ((start_col + block_cols) > this->col)
        {
            std::cerr << "[Error] block: block exceeds column boundary: start_col=" << start_col 
                      << ", block_cols=" << block_cols << ", source.cols=" << this->col << "\n";
            return Mat();
        }

        // Create result matrix
        Mat result(block_rows, block_cols);

        // Check if result matrix was created successfully
        if (result.data == nullptr)
        {
            std::cerr << "[Error] block: failed to allocate memory for result matrix\n";
            return Mat();
        }

        // Copy block data element by element
        // Note: This uses operator() which handles step correctly, but is slower than memcpy
        // For better performance, consider using copy_roi() which uses memcpy
        for (int i = 0; i < block_rows; ++i)
        {
            for (int j = 0; j < block_cols; ++j)
            {
                result(i, j) = (*this)(start_row + i, start_col + j);
            }
        }

        return result;
    }

    /**
     * @name Mat::swap_rows(int row1, int row2)
     * @brief Swap two rows of the matrix.
     * @param row1 The index of the first row to swap (must be in range [0, row-1])
     * @param row2 The index of the second row to swap (must be in range [0, row-1])
     * @note If row1 == row2, the function returns immediately without doing anything.
     * @note This function is commonly used in matrix operations like Gaussian elimination with row pivoting.
     */
    void Mat::swap_rows(int row1, int row2)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] swap_rows: matrix data pointer is null\n";
            return;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] swap_rows: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return;
        }

        // Validate row indices
        if (row1 < 0 || row1 >= this->row)
        {
            std::cerr << "[Error] swap_rows: row1 index out of range: row1=" << row1 
                      << ", matrix.rows=" << this->row << "\n";
            return;
        }
        if (row2 < 0 || row2 >= this->row)
        {
            std::cerr << "[Error] swap_rows: row2 index out of range: row2=" << row2 
                      << ", matrix.rows=" << this->row << "\n";
            return;
        }

        // Optimization: if same row, no need to swap
        if (row1 == row2)
        {
            return;
        }

        // Allocate temporary buffer for row swap
        // Note: Using new/delete here is acceptable for this operation,
        // but could be optimized by using the matrix's temp buffer if available
        float *temp_row = new(std::nothrow) float[this->col];
        if (temp_row == nullptr)
        {
            std::cerr << "[Error] swap_rows: failed to allocate temporary buffer\n";
            return;
        }

        // Swap rows using memcpy (handles step correctly)
        memcpy(temp_row, &this->data[row1 * this->step], this->col * sizeof(float));
        memcpy(&this->data[row1 * this->step], &this->data[row2 * this->step], this->col * sizeof(float));
        memcpy(&this->data[row2 * this->step], temp_row, this->col * sizeof(float));

        delete[] temp_row;
    }

    /**
     * @name Mat::swap_cols(int col1, int col2)
     * @brief Swap two columns of the matrix.
     * @param col1 The index of the first column to swap (must be in range [0, col-1])
     * @param col2 The index of the second column to swap (must be in range [0, col-1])
     * @note If col1 == col2, the function returns immediately without doing anything.
     * @note Useful for column pivoting in algorithms like Gaussian elimination with column pivoting.
     * @note This function swaps columns element by element, which correctly handles step.
     */
    void Mat::swap_cols(int col1, int col2)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] swap_cols: matrix data pointer is null\n";
            return;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] swap_cols: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return;
        }

        // Validate column indices
        if (col1 < 0 || col1 >= this->col)
        {
            std::cerr << "[Error] swap_cols: col1 index out of range: col1=" << col1 
                      << ", matrix.cols=" << this->col << "\n";
            return;
        }
        if (col2 < 0 || col2 >= this->col)
        {
            std::cerr << "[Error] swap_cols: col2 index out of range: col2=" << col2 
                      << ", matrix.cols=" << this->col << "\n";
            return;
        }

        // Optimization: if same column, no need to swap
        if (col1 == col2)
        {
            return;
        }

        // Swap columns element by element (considering step)
        // Note: This approach correctly handles different step values
        for (int i = 0; i < this->row; ++i)
        {
            float temp = (*this)(i, col1);
            (*this)(i, col1) = (*this)(i, col2);
            (*this)(i, col2) = temp;
        }
    }

    /**
     * @name Mat::clear()
     * @brief Clear the matrix by setting all elements to zero.
     * @note Only clears the actual matrix elements (col elements per row), not the padding area.
     * @note If the matrix has padding (step > col), the padding elements are not cleared.
     */
    void Mat::clear(void)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] clear: matrix data pointer is null\n";
            return;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            // Empty matrix, nothing to clear
            return;
        }

        // Clear matrix row by row (handles step correctly)
        // Only clear the actual matrix elements (col elements), not the padding
        for (int row = 0; row < this->row; row++)
        {
            memset(this->data + (row * this->step), 0, this->col * sizeof(float));
        }
    }

    // ============================================================================
    // Arithmetic Operators
    // ============================================================================
    /**
     * @name Mat::operator=(const Mat &src)
     * @brief Copy assignment operator - copy the elements of the source matrix into the destination matrix.
     * @param src Source matrix to copy from
     * @return Reference to this matrix
     * @note Compared to the copy constructor, this operator is used for existing matrices.
     *       If dimensions differ, memory will be reallocated.
     * @note Assignment to sub-matrix views is not allowed.
     * @warning If memory allocation fails, the matrix may be in an invalid state.
     */
    Mat &Mat::operator=(const Mat &src)
    {
        // 1. Self-assignment check
        if (this == &src)
        {
            return *this;
        }

        // 2. Forbid assignment to sub-matrix views
        if (this->sub_matrix)
        {
            std::cerr << "[Error] Assignment to a sub-matrix is not allowed.\n";
            return *this;
        }

        // Check source matrix validity
        if (src.data == nullptr)
        {
            std::cerr << "[Error] operator=: source matrix data pointer is null\n";
            return *this;
        }

        // 3. If dimensions differ, reallocate memory
        if (this->row != src.row || this->col != src.col)
        {
            if (!this->ext_buff && this->data != nullptr)
            {
                delete[] this->data;
                this->data = nullptr;
            }

            // Update dimensions and memory info
            this->row = src.row;
            this->col = src.col;
            this->step = src.col; // Follow source's logical step (no padding)
            this->pad = 0;

            // Check for integer overflow
            if (this->row > 0 && this->col > INT_MAX / this->row)
            {
                std::cerr << "[Error] operator=: integer overflow in element calculation\n";
                this->data = nullptr;
                return *this;
            }
            this->element = this->row * this->col;

            if (this->row > 0 && this->step > INT_MAX / this->row)
            {
                std::cerr << "[Error] operator=: integer overflow in memory calculation\n";
                this->data = nullptr;
                return *this;
            }
            this->memory = this->row * this->step;

            this->ext_buff = false;
            this->sub_matrix = false;

            alloc_mem();

            // Check if memory allocation succeeded
            if (this->data == nullptr)
            {
                std::cerr << "[Error] operator=: memory allocation failed\n";
                return *this;
            }
        }

        // Check if this->data is valid before copying
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator=: destination matrix data pointer is null\n";
            return *this;
        }

        // 4. Data copy (row-wise, handles different strides correctly)
        for (int r = 0; r < this->row; ++r)
        {
            std::memcpy(this->data + r * this->step, 
                       src.data + r * src.step, 
                       this->col * sizeof(float));
        }

        return *this;
    }

    /**
     * @name Mat::operator+=(const Mat &A)
     * @brief Element-wise addition of another matrix to this matrix.
     * @param A The matrix to add (must have same dimensions as this matrix)
     * @return Mat& Reference to the current matrix
     * @note This function performs element-wise addition: this[i,j] += A[i,j]
     * @note The function automatically handles padding and uses optimized vectorized operations when possible.
     */
    Mat &Mat::operator+=(const Mat &A)
    {
        // Check for null pointers
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator+=: this matrix data pointer is null\n";
            return *this;
        }
        if (A.data == nullptr)
        {
            std::cerr << "[Error] operator+=: source matrix data pointer is null\n";
            return *this;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] operator+=: invalid this matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return *this;
        }
        if (A.row <= 0 || A.col <= 0)
        {
            std::cerr << "[Error] operator+=: invalid source matrix dimensions: rows=" 
                      << A.row << ", cols=" << A.col << "\n";
            return *this;
        }

        // 1. Dimension check - matrices must have same dimensions
        if ((this->row != A.row) || (this->col != A.col))
        {
            std::cerr << "[Error] Matrix addition failed: Dimension mismatch ("
                      << this->row << "x" << this->col << " vs "
                      << A.row << "x" << A.col << ")\n";
            return *this;
        }

        // 2. Determine if padding handling is needed
        bool need_padding_handling = (this->pad > 0) || (A.pad > 0);

        if (need_padding_handling)
        {
            // Padding-aware addition
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_add_f32(this->data, A.data, this->data,
                         this->row, this->col,
                         this->pad, A.pad, this->pad,
                         1, 1, 1);
#else
            tiny_mat_add_f32(this->data, A.data, this->data,
                             this->row, this->col,
                             this->pad, A.pad, this->pad,
                             1, 1, 1);
#endif
        }
        else
        {
            // Vectorized addition for contiguous memory
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dsps_add_f32(this->data, A.data, this->data, this->memory, 1, 1, 1);
#else
            tiny_vec_add_f32(this->data, A.data, this->data, this->memory, 1, 1, 1);
#endif
        }

        return *this;
    }

    /**
     * @name Mat::operator+=(float C)
     * @brief Element-wise addition of a constant to this matrix.
     * @param C The constant to add to each element
     * @return Mat& Reference to the current matrix
     * @note This function performs element-wise addition: this[i,j] += C
     * @note The function automatically handles padding and uses optimized vectorized operations when possible.
     */
    Mat &Mat::operator+=(float C)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator+=(float): matrix data pointer is null\n";
            return *this;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] operator+=(float): invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return *this;
        }

        // Check whether padding is present
        bool need_padding_handling = (this->pad > 0);

        if (need_padding_handling)
        {
            // Padding-aware constant addition
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_addc_f32(this->data, this->data, C,
                          this->row, this->col,
                          this->pad, this->pad,
                          1, 1);
#else
            tiny_mat_addc_f32(this->data, this->data, C,
                              this->row, this->col,
                              this->pad, this->pad,
                              1, 1);
#endif
        }
        else
        {
            // Vectorized constant addition
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dsps_addc_f32(this->data, this->data, this->memory, C, 1, 1);
#else
            tiny_vec_addc_f32(this->data, this->data, this->memory, C, 1, 1);
#endif
        }

        return *this;
    }

    /**
     * @name Mat::operator-=(const Mat &A)
     * @brief Element-wise subtraction of another matrix from this matrix.
     * @param A The matrix to subtract (must have same dimensions as this matrix)
     * @return Mat& Reference to the current matrix
     * @note This function performs element-wise subtraction: this[i,j] -= A[i,j]
     * @note The function automatically handles padding and uses optimized vectorized operations when possible.
     */
    Mat &Mat::operator-=(const Mat &A)
    {
        // Check for null pointers
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator-=: this matrix data pointer is null\n";
            return *this;
        }
        if (A.data == nullptr)
        {
            std::cerr << "[Error] operator-=: source matrix data pointer is null\n";
            return *this;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] operator-=: invalid this matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return *this;
        }
        if (A.row <= 0 || A.col <= 0)
        {
            std::cerr << "[Error] operator-=: invalid source matrix dimensions: rows=" 
                      << A.row << ", cols=" << A.col << "\n";
            return *this;
        }

        // 1. Dimension check - matrices must have same dimensions
        if ((this->row != A.row) || (this->col != A.col))
        {
            std::cerr << "[Error] Matrix subtraction failed: Dimension mismatch ("
                      << this->row << "x" << this->col << " vs "
                      << A.row << "x" << A.col << ")\n";
            return *this;
        }

        // 2. Determine if padding handling is needed
        bool need_padding_handling = (this->pad > 0) || (A.pad > 0);

        if (need_padding_handling)
        {
            // Padding-aware subtraction
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_sub_f32(this->data, A.data, this->data,
                         this->row, this->col,
                         this->pad, A.pad, this->pad,
                         1, 1, 1);
#else
            tiny_mat_sub_f32(this->data, A.data, this->data,
                             this->row, this->col,
                             this->pad, A.pad, this->pad,
                             1, 1, 1);
#endif
        }
        else
        {
            // Vectorized subtraction for contiguous memory
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dsps_sub_f32(this->data, A.data, this->data, this->memory, 1, 1, 1);
#else
            tiny_vec_sub_f32(this->data, A.data, this->data, this->memory, 1, 1, 1);
#endif
        }

        return *this;
    }

    /**
     * @name Mat::operator-=(float C)
     * @brief Element-wise subtraction of a constant from this matrix.
     * @param C The constant to subtract from each element
     * @return Mat& Reference to the current matrix
     * @note This function performs element-wise subtraction: this[i,j] -= C
     * @note The function automatically handles padding and uses optimized vectorized operations when possible.
     * @note On ESP32, this uses addc with -C since subc is not available in DSP library.
     */
    Mat &Mat::operator-=(float C)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator-=(float): matrix data pointer is null\n";
            return *this;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] operator-=(float): invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return *this;
        }

        bool need_padding_handling = (this->pad > 0);

        if (need_padding_handling)
        {
            // Padding-aware constant subtraction
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            // Note: ESP32 DSP does not provide dspm_subc_f32, using dspm_addc_f32 with -C
            dspm_addc_f32(this->data, this->data, -C,
                          this->row, this->col,
                          this->pad, this->pad,
                          1, 1);
#else
            tiny_mat_subc_f32(this->data, this->data, C,
                              this->row, this->col,
                              this->pad, this->pad,
                              1, 1);
#endif
        }
        else
        {
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            // Note: ESP32 DSP does not provide dsps_subc_f32, using dsps_addc_f32 with -C
            dsps_addc_f32(this->data, this->data, this->memory, -C, 1, 1);
#else
            tiny_vec_subc_f32(this->data, this->data, this->memory, C, 1, 1);
#endif
        }

        return *this;
    }

    /**
     * @name Mat::operator*=(const Mat &m)
     * @brief Matrix multiplication: this = this * m
     * @param m The matrix to multiply with (must have compatible dimensions: this.col == m.row)
     * @return Mat& Reference to the current matrix
     * @note Matrix multiplication requires: this.col == m.row
     *       Result dimensions: this.row x m.col
     * @note This function creates a temporary copy to avoid overwriting data during computation.
     */
    Mat &Mat::operator*=(const Mat &m)
    {
        // Check for null pointers
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator*=: this matrix data pointer is null\n";
            return *this;
        }
        if (m.data == nullptr)
        {
            std::cerr << "[Error] operator*=: source matrix data pointer is null\n";
            return *this;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] operator*=: invalid this matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return *this;
        }
        if (m.row <= 0 || m.col <= 0)
        {
            std::cerr << "[Error] operator*=: invalid source matrix dimensions: rows=" 
                      << m.row << ", cols=" << m.col << "\n";
            return *this;
        }

        // 1. Dimension check - matrix multiplication requires: this.col == m.row
        if (this->col != m.row)
        {
            std::cerr << "[Error] Matrix multiplication failed: incompatible dimensions ("
                      << this->row << "x" << this->col << " * "
                      << m.row << "x" << m.col << ")\n";
            return *this;
        }

        // 1.5. In-place matrix multiplication may change output width (this->col -> m.col).
        // For views/external buffers, resizing is unsafe because ownership/capacity is unknown.
        if ((this->sub_matrix || this->ext_buff) && (this->col != m.col))
        {
            std::cerr << "[Error] operator*=: cannot reshape sub-matrix/external-buffer matrix ("
                      << this->row << "x" << this->col << " -> "
                      << this->row << "x" << m.col << ")\n";
            return *this;
        }

        // 2. Prepare temp matrix (in case overwriting the original data)
        // Create a copy of this matrix to avoid overwriting during computation
        Mat temp = this->copy_roi(0, 0, this->row, this->col);

        // Check if copy_roi succeeded
        if (temp.data == nullptr)
        {
            std::cerr << "[Error] operator*=: failed to create temporary matrix copy\n";
            return *this;
        }

        // 2.5. Ensure destination buffer/metadata can hold the multiplication result.
        // Result shape: this->row x m.col
        const int result_col = m.col;
        if (this->col != result_col)
        {
            // We only reach here when this matrix owns its buffer (checked above).
            if (!this->ext_buff && this->data != nullptr)
            {
                delete[] this->data;
                this->data = nullptr;
            }

            this->col = result_col;
            this->step = result_col;  // keep destination contiguous after reshape
            this->pad = 0;

            if (this->row > 0 && this->col > INT_MAX / this->row)
            {
                std::cerr << "[Error] operator*=: integer overflow in element calculation\n";
                this->data = nullptr;
                this->element = 0;
                this->memory = 0;
                return *this;
            }
            this->element = this->row * this->col;

            if (this->row > 0 && this->step > INT_MAX / this->row)
            {
                std::cerr << "[Error] operator*=: integer overflow in memory calculation\n";
                this->data = nullptr;
                this->memory = 0;
                return *this;
            }
            this->memory = this->row * this->step;

            alloc_mem();
            if (this->data == nullptr)
            {
                std::cerr << "[Error] operator*=: memory allocation failed for result matrix\n";
                return *this;
            }
        }

        // 3. Check whether padding is present in either matrix
        bool need_padding_handling = (this->pad > 0) || (m.pad > 0);

        if (need_padding_handling)
        {
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_mult_ex_f32(temp.data, m.data, this->data, temp.row, temp.col, m.col, temp.pad, m.pad, this->pad);
#else
            tiny_mat_mult_ex_f32(temp.data, m.data, this->data, temp.row, temp.col, m.col, temp.pad, m.pad, this->pad);
#endif
        }
        else
        {
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_mult_f32(temp.data, m.data, this->data, temp.row, temp.col, m.col);
#else
            tiny_mat_mult_f32(temp.data, m.data, this->data, temp.row, temp.col, m.col);
#endif
        }

        return *this;
    }

    /**
     * @name Mat::operator*=(float num)
     * @brief Element-wise multiplication by a constant.
     * @param num The constant multiplier
     * @return Mat& Reference to the current matrix
     * @note This function performs element-wise multiplication: this[i,j] *= num
     * @note The function automatically handles padding and uses optimized vectorized operations when possible.
     */
    Mat &Mat::operator*=(float num)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator*=(float): matrix data pointer is null\n";
            return *this;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] operator*=(float): invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return *this;
        }

        // Check whether padding is present
        bool need_padding_handling = (this->pad > 0);

        if (need_padding_handling)
        {
            // Padding-aware multiplication
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_mulc_f32(this->data, this->data, num,
                          this->row, this->col,
                          this->pad, this->pad,
                          1, 1);
#else
            tiny_mat_multc_f32(this->data, this->data, num, this->row, this->col, this->pad, this->pad, 1, 1);
#endif
        }
        else
        {
            // No padding, use vectorized multiplication
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dsps_mulc_f32(this->data, this->data, this->memory, num, 1, 1);
#else
            tiny_vec_mulc_f32(this->data, this->data, this->memory, num, 1, 1);
#endif
        }

        return *this;
    }

    /**
     * @name Mat::operator/=(const Mat &B)
     * @brief Element-wise division: this = this / B
     * @param B The matrix divisor (must have same dimensions as this matrix, and no zero elements)
     * @return Mat& Reference to the current matrix
     * @note This function performs element-wise division: this[i,j] /= B[i,j]
     * @warning Division by zero will cause an error. All elements of B must be non-zero.
     */
    Mat &Mat::operator/=(const Mat &B)
    {
        // Check for null pointers
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator/=: this matrix data pointer is null\n";
            return *this;
        }
        if (B.data == nullptr)
        {
            std::cerr << "[Error] operator/=: divisor matrix data pointer is null\n";
            return *this;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] operator/=: invalid this matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return *this;
        }
        if (B.row <= 0 || B.col <= 0)
        {
            std::cerr << "[Error] operator/=: invalid divisor matrix dimensions: rows=" 
                      << B.row << ", cols=" << B.col << "\n";
            return *this;
        }

        // 1. Dimension check - matrices must have same dimensions
        if ((this->row != B.row) || (this->col != B.col))
        {
            std::cerr << "[Error] Matrix division failed: Dimension mismatch ("
                      << this->row << "x" << this->col << " vs "
                      << B.row << "x" << B.col << ")\n";
            return *this;
        }

        // 2. Zero division check - scan for near-zero elements
        bool zero_found = false;
        const float epsilon = 1e-9f;
        for (int i = 0; i < B.row; ++i)
        {
            for (int j = 0; j < B.col; ++j)
            {
                if (fabs(B(i, j)) < epsilon)
                {
                    zero_found = true;
                    std::cerr << "[Error] Matrix division failed: Division by zero detected at position ("
                              << i << ", " << j << ")\n";
                    break;
                }
            }
            if (zero_found)
                break;
        }

        if (zero_found)
        {
            return *this;
        }

        // 3. Element-wise division
        for (int i = 0; i < this->row; ++i)
        {
            for (int j = 0; j < this->col; ++j)
            {
                (*this)(i, j) /= B(i, j);
            }
        }

        return *this;
    }

    /**
     * @name Mat::operator/=(float num)
     * @brief Element-wise division of this matrix by a constant.
     * @param num The constant divisor (must be non-zero)
     * @return Mat& Reference to the current matrix
     * @note This function performs element-wise division: this[i,j] /= num
     * @note The function uses multiplication by 1/num for efficiency (division is slower than multiplication).
     * @note The function automatically handles padding and uses optimized vectorized operations when possible.
     * @warning Division by zero will cause an error.
     */
    Mat &Mat::operator/=(float num)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator/=(float): matrix data pointer is null\n";
            return *this;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] operator/=(float): invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return *this;
        }

        // 1. Check division by zero
        const float epsilon = 1e-9f;
        if (fabs(num) < epsilon)
        {
            std::cerr << "[Error] Matrix division by zero is undefined (divisor=" << num << ")\n";
            return *this;
        }

        // 2. Determine if padding handling is needed
        bool need_padding_handling = (this->pad > 0);

        // Use multiplication by inverse for better performance (division is slower)
        float inv_num = 1.0f / num;

        if (need_padding_handling)
        {
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_mulc_f32(this->data, this->data, inv_num,
                          this->row, this->col,
                          this->pad, this->pad,
                          1, 1);
#else
            tiny_mat_multc_f32(this->data, this->data, inv_num,
                              this->row, this->col,
                              this->pad, this->pad,
                              1, 1);
#endif
        }
        else
        {
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dsps_mulc_f32(this->data, this->data, this->memory, inv_num, 1, 1);
#else
            tiny_vec_mulc_f32(this->data, this->data, this->memory, inv_num, 1, 1);
#endif
        }

        return *this;
    }

    /**
     * @name Mat::operator^(int num)
     * @brief Element-wise integer exponentiation. Returns a new matrix where each element is raised to the given power.
     * @param num The exponent (integer, can be positive, negative, or zero)
     * @return Mat New matrix after exponentiation
     * @note This function performs element-wise exponentiation: result[i,j] = this[i,j]^num
     * @note For num=0, all elements become 1.0. For num=1, returns a copy of the matrix.
     * @note For num=-1, returns element-wise reciprocal: result[i,j] = 1.0 / this[i,j]
     * @note For num < 0, performs: result[i,j] = 1.0 / (this[i,j]^(-num))
     * @warning For negative exponents, elements that are zero or too close to zero will result in
     *          infinity or NaN. The function will print warnings for such cases.
     */
    Mat Mat::operator^(int num)
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] operator^: matrix data pointer is null\n";
            return Mat();  // Return empty matrix as error indicator
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] operator^: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return Mat();
        }

        // Handle special cases
        if (num == 0)
        {
            // Any number to the power of 0 is 1
            Mat result(this->row, this->col, this->step);
            if (result.data == nullptr)
            {
                std::cerr << "[Error] operator^: failed to allocate memory for result matrix\n";
                return Mat();
            }
            for (int i = 0; i < this->row; ++i)
            {
                for (int j = 0; j < this->col; ++j)
                {
                    result(i, j) = 1.0f;
                }
            }
            return result;
        }

        if (num == 1)
        {
            // Return a copy of current matrix
            return Mat(*this);
        }

        // Handle negative exponents
        if (num < 0)
        {
            // For negative exponent: A^(-n) = 1 / A^n (element-wise)
            // Compute positive power first, then take reciprocal
            int abs_num = -num;  // Positive exponent value

            Mat result(this->row, this->col, this->step);
            if (result.data == nullptr)
            {
                std::cerr << "[Error] operator^: failed to allocate memory for result matrix\n";
                return Mat();
            }

            // Element-wise exponentiation with negative exponent
            // result[i,j] = 1.0 / (this[i,j]^abs_num)
            for (int i = 0; i < this->row; ++i)
            {
                for (int j = 0; j < this->col; ++j)
                {
                    float base = (*this)(i, j);

                    // Check for zero or near-zero base (for negative exponents)
                    if (fabsf(base) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
                    {
                        std::cerr << "[Warning] operator^: element at (" << i << ", " << j 
                                  << ") is zero or too small (" << base 
                                  << "), cannot compute negative power. Result will be Inf or NaN.\n";
                        result(i, j) = (base == 0.0f) ? 
                                      (std::numeric_limits<float>::infinity()) : 
                                      (std::numeric_limits<float>::quiet_NaN());
                        continue;
                    }

                    // Compute base^abs_num (positive power)
                    float value = 1.0f;
                    for (int k = 0; k < abs_num; ++k)
                    {
                        value *= base;
                    }

                    // Take reciprocal: 1 / value
                    if (fabsf(value) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
                    {
                        std::cerr << "[Warning] operator^: computed power value at (" << i << ", " << j 
                                  << ") is too small (" << value 
                                  << "), reciprocal may be invalid.\n";
                        result(i, j) = std::numeric_limits<float>::infinity();
                    }
                    else
                    {
                        result(i, j) = 1.0f / value;
                    }
                }
            }

            return result;
        }

        // General case: positive exponent > 1
        Mat result(this->row, this->col, this->step);
        if (result.data == nullptr)
        {
            std::cerr << "[Error] operator^: failed to allocate memory for result matrix\n";
            return Mat();
        }

        // Element-wise exponentiation using iterative multiplication
        // Note: For large exponents, this could be optimized using fast exponentiation,
        // but for typical use cases (small exponents), this is acceptable
        for (int i = 0; i < this->row; ++i)
        {
            for (int j = 0; j < this->col; ++j)
            {
                float base = (*this)(i, j);
                float value = 1.0f;
                for (int k = 0; k < num; ++k)
                {
                    value *= base;
                }
                result(i, j) = value;
            }
        }

        return result;
    }

    // ============================================================================
    // Linear Algebra - Basic Operations
    // ============================================================================
    /**
     * @name Mat::transpose()
     * @brief Transpose the matrix. Returns a new matrix with rows and columns swapped.
     * @return Transposed matrix (col x row), or empty matrix on error
     * @note If this matrix is m x n, the result will be n x m.
     * @note The transpose operation: result[j][i] = this[i][j]
     */
    Mat Mat::transpose()
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] transpose: matrix data pointer is null\n";
            return Mat();  // Return empty matrix as error indicator
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] transpose: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return Mat();
        }

        // Create result matrix with swapped dimensions
        Mat result(this->col, this->row);

        // Check if result matrix was created successfully
        if (result.data == nullptr)
        {
            std::cerr << "[Error] transpose: failed to allocate memory for result matrix\n";
            return Mat();
        }

        // Transpose: swap rows and columns
        for (int i = 0; i < this->row; ++i)
        {
            for (int j = 0; j < this->col; ++j)
            {
                result(j, i) = this->data[i * this->step + j];
            }
        }

        return result;
    }

    /**
     * @name Mat::determinant()
     * @brief Compute the determinant of the matrix (auto-selects method based on size).
     * @return float The determinant value, or 0.0f on error
     * @note For small matrices (n <= 4), uses Laplace expansion (more accurate).
     *       For larger matrices, uses LU decomposition (O(n³)) for better efficiency.
     * @note Determinant can be 0.0f for singular matrices, which is a valid result.
     *       Use error checking to distinguish between error and zero determinant.
     */
    float Mat::determinant()
    {
        // Special case: 0×0 matrix (empty matrix)
        // By mathematical convention, det(empty matrix) = 1 (similar to empty product)
        if (this->row == 0 && this->col == 0)
        {
            return 1.0f;
        }

        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] determinant: matrix data pointer is null\n";
            return 0.0f;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] determinant: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return 0.0f;
        }

        // Check if matrix is square
        if (this->row != this->col)
        {
            std::cerr << "[Error] Determinant requires a square matrix (got " 
                      << this->row << "x" << this->col << ")\n";
            return 0.0f;
        }

        int n = this->row;

        // For small matrices, use Laplace expansion (more accurate for small sizes)
        if (n <= 4)
        {
            return this->determinant_laplace();
        }

        // For larger matrices, use LU decomposition (much faster, O(n³) vs O(n!))
        return this->determinant_lu();
    }

    /**
     * @name Mat::determinant_laplace()
     * @brief Compute the determinant using Laplace expansion (cofactor expansion).
     * @return float The determinant value, or 0.0f on error
     * @note Time complexity: O(n!) - suitable only for small matrices (n <= 4).
     *       Uses recursive method with first row expansion.
     * @note For n=1 and n=2, uses direct formulas for efficiency.
     * @note Determinant can be 0.0f for singular matrices, which is a valid result.
     */
    float Mat::determinant_laplace()
    {
        // Special case: 0×0 matrix (empty matrix)
        // By mathematical convention, det(empty matrix) = 1 (similar to empty product)
        if (this->row == 0 && this->col == 0)
        {
            return 1.0f;
        }

        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] determinant_laplace: matrix data pointer is null\n";
            return 0.0f;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] determinant_laplace: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return 0.0f;
        }

        // Check if matrix is square
        if (this->row != this->col)
        {
            std::cerr << "[Error] Determinant requires a square matrix (got " 
                      << this->row << "x" << this->col << ")\n";
            return 0.0f;
        }

        int n = this->row;

        // Base case: 1x1 matrix
        if (n == 1)
        {
            return this->data[0];
        }

        // Base case: 2x2 matrix (direct formula)
        if (n == 2)
        {
            return this->data[0] * this->data[this->step + 1] - 
                   this->data[1] * this->data[this->step];
        }

        // Recursive case: use Laplace expansion along first row
        float det = 0.0f;
        for (int j = 0; j < n; ++j)
        {
            Mat minor_mat = this->minor(0, j);

            // Check if minor matrix was created successfully
            if (minor_mat.data == nullptr)
            {
                std::cerr << "[Error] determinant_laplace: failed to create minor matrix at (0, " << j << ")\n";
                return 0.0f;
            }

            // Compute cofactor: (-1)^(i+j) * det(minor)
            float cofactor_val = ((j % 2 == 0) ? 1.0f : -1.0f) * minor_mat.determinant_laplace();
            det += this->data[j] * cofactor_val;
        }

        return det;
    }

    /**
     * @name Mat::determinant_lu()
     * @brief Compute the determinant using LU decomposition.
     * @return float The determinant value, or 0.0f on error
     * @note Time complexity: O(n³) - efficient for large matrices.
     *       Formula: det(A) = det(P) * det(L) * det(U) = det(P) * 1 * (product of U diagonal)
     *       where det(P) = (-1)^(number of row swaps)
     * @note Uses pivoting for numerical stability.
     * @note Determinant can be 0.0f for singular matrices, which is a valid result.
     */
    float Mat::determinant_lu()
    {
        // Special case: 0×0 matrix (empty matrix)
        // By mathematical convention, det(empty matrix) = 1 (similar to empty product)
        if (this->row == 0 && this->col == 0)
        {
            return 1.0f;
        }

        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] determinant_lu: matrix data pointer is null\n";
            return 0.0f;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] determinant_lu: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return 0.0f;
        }

        // Check if matrix is square
        if (this->row != this->col)
        {
            std::cerr << "[Error] Determinant requires a square matrix (got " 
                      << this->row << "x" << this->col << ")\n";
            return 0.0f;
        }

        // Perform LU decomposition with pivoting for numerical stability
        LUDecomposition lu = this->lu_decompose(true);

        if (lu.status != TINY_OK)
        {
            // Matrix is singular or near-singular, or decomposition failed
            std::cerr << "[Warning] determinant_lu: LU decomposition failed (status=" 
                      << lu.status << "), matrix may be singular\n";
            return 0.0f;
        }

        // Check if decomposition matrices are valid
        if (lu.U.data == nullptr)
        {
            std::cerr << "[Error] determinant_lu: LU decomposition U matrix is null\n";
            return 0.0f;
        }

        int n = this->row;

        // Compute det(P): permutation matrix determinant = (-1)^(permutation signature)
        float det_P = 1.0f;
        if (lu.pivoted && lu.P.data != nullptr)
        {
            // Compute permutation signature by finding cycles in P
            // P is a permutation matrix, so each row/column has exactly one 1
            // We can compute the sign by decomposing into transpositions
            std::vector<bool> visited(n, false);
            int cycle_count = 0;

            for (int i = 0; i < n; ++i)
            {
                if (visited[i]) continue;

                // Find the cycle starting at i
                int current = i;
                int cycle_length = 0;
                bool found_mapping = false;
                while (!visited[current])
                {
                    visited[current] = true;
                    cycle_length++;
                    found_mapping = false;

                    // Find where P maps current row
                    for (int j = 0; j < n; ++j)
                    {
                        if (fabsf(lu.P(current, j) - 1.0f) < 1e-6f)
                        {
                            current = j;
                            found_mapping = true;
                            break;
                        }
                    }

                    // Safety check: if no mapping found, break to avoid infinite loop
                    // This should not happen for a valid permutation matrix
                    if (!found_mapping)
                    {
                        std::cerr << "[Warning] determinant_lu: Could not find mapping for row " 
                                  << current << " in permutation matrix P. Matrix may be invalid.\n";
                        break;
                    }
                }

                // A cycle of length k contributes (k-1) transpositions
                if (cycle_length > 1)
                {
                    cycle_count += (cycle_length - 1);
                }
            }

            // det(P) = (-1)^(number of transpositions)
            det_P = (cycle_count % 2 == 0) ? 1.0f : -1.0f;
        }

        // Compute det(L): lower triangular with unit diagonal = 1
        float det_L = 1.0f;  // L has unit diagonal, so det(L) = 1

        // Compute det(U): product of diagonal elements
        float det_U = 1.0f;
        for (int i = 0; i < n; ++i)
        {
            det_U *= lu.U(i, i);
        }

        // det(A) = det(P) * det(L) * det(U)
        return det_P * det_L * det_U;
    }

    /**
     * @name Mat::determinant_gaussian()
     * @brief Compute the determinant using Gaussian elimination.
     * @return float The determinant value, or 0.0f on error
     * @note Time complexity: O(n³) - efficient for large matrices.
     *       Converts matrix to upper triangular form, then multiplies diagonal elements.
     *       Tracks row swaps to account for sign changes.
     * @note Uses partial pivoting for numerical stability.
     * @note Determinant can be 0.0f for singular matrices, which is a valid result.
     */
    float Mat::determinant_gaussian()
    {
        // Special case: 0×0 matrix (empty matrix)
        // By mathematical convention, det(empty matrix) = 1 (similar to empty product)
        if (this->row == 0 && this->col == 0)
        {
            return 1.0f;
        }

        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] determinant_gaussian: matrix data pointer is null\n";
            return 0.0f;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] determinant_gaussian: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return 0.0f;
        }

        // Check if matrix is square
        if (this->row != this->col)
        {
            std::cerr << "[Error] Determinant requires a square matrix (got " 
                      << this->row << "x" << this->col << ")\n";
            return 0.0f;
        }

        int n = this->row;
        Mat A = Mat(*this);  // Working copy

        // Check if copy was successful
        if (A.data == nullptr)
        {
            std::cerr << "[Error] determinant_gaussian: failed to create working copy\n";
            return 0.0f;
        }

        int swap_count = 0;  // Track number of row swaps

        // Gaussian elimination to upper triangular form
        for (int k = 0; k < n - 1; ++k)
        {
            // Partial pivoting: find row with largest element in column k
            int max_row = k;
            float max_val = fabsf(A(k, k));
            for (int i = k + 1; i < n; ++i)
            {
                if (fabsf(A(i, k)) > max_val)
                {
                    max_val = fabsf(A(i, k));
                    max_row = i;
                }
            }

            // Swap rows if necessary
            if (max_row != k)
            {
                A.swap_rows(k, max_row);
                swap_count++;
            }

            // Check for singular matrix (near-zero pivot)
            if (fabsf(A(k, k)) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
            {
                // Matrix is singular or near-singular
                return 0.0f;
            }

            // Eliminate below diagonal
            for (int i = k + 1; i < n; ++i)
            {
                float factor = A(i, k) / A(k, k);
                for (int j = k; j < n; ++j)
                {
                    A(i, j) -= factor * A(k, j);
                }
            }
        }

        // Compute determinant: product of diagonal elements
        float det = 1.0f;
        for (int i = 0; i < n; ++i)
        {
            det *= A(i, i);
        }

        // Account for row swaps: each swap multiplies determinant by -1
        if (swap_count % 2 == 1)
        {
            det = -det;
        }

        return det;
    }

    /**
     * @name Mat::adjoint()
     * @brief Compute the adjoint (adjugate) matrix.
     * @note The adjoint matrix is the transpose of the cofactor matrix.
     *       adj(A)_ij = (-1)^(i+j) * det(minor_ji)
     *       Note: The result is stored at (j,i) to achieve transpose.
     * @return Mat The adjoint matrix, or empty Mat() on error
     * @note Time complexity: O(n² * O(det)) - expensive for large matrices.
     *       For n×n matrix, requires computing n² determinants of (n-1)×(n-1) matrices.
     */
    Mat Mat::adjoint()
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] adjoint: matrix data pointer is null\n";
            return Mat();
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] adjoint: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return Mat();
        }

        // Check if matrix is square
        if (this->row != this->col)
        {
            std::cerr << "[Error] Adjoint requires a square matrix (got " 
                      << this->row << "x" << this->col << ")\n";
            return Mat();
        }

        int n = this->row;
        Mat result(n, n);

        // Check if result matrix was created successfully
        if (result.data == nullptr)
        {
            std::cerr << "[Error] adjoint: failed to create result matrix\n";
            return Mat();
        }

        for (int i = 0; i < n; ++i)
        {
            for (int j = 0; j < n; ++j)
            {
                Mat cofactor_mat = this->cofactor(i, j);

                // For 1×1 matrix, cofactor is a 0×0 empty matrix (valid result)
                // The determinant of a 0×0 matrix is 1.0 by mathematical convention
                // So cofactor_mat.data == nullptr is acceptable for this case

                // Compute cofactor value: (-1)^(i+j) * det(minor)
                float sign = ((i + j) % 2 == 0) ? 1.0f : -1.0f;
                float cofactor_val = sign * cofactor_mat.determinant();

                // Store at (j,i) to achieve transpose: adj(A) = C^T
                result(j, i) = cofactor_val;
            }
        }

        return result;
    }

    /**
     * @name Mat::normalize()
     * @brief Normalize the matrix by dividing each element by the matrix norm (Frobenius norm).
     * @note Normalization: M_normalized = M / ||M||_F
     *       where ||M||_F = sqrt(sum of squares of all elements)
     * @note If the matrix norm is zero or too small, normalization is skipped
     *       and a warning is printed. The matrix remains unchanged.
     * @note This function modifies the matrix in-place.
     */
    void Mat::normalize()
    {
        // Validate matrix dimensions
        if (this->row < 0 || this->col < 0)
        {
            std::cerr << "[Error] normalize: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return;
        }

        // Check for empty matrix
        if (this->row == 0 || this->col == 0)
        {
            // Empty matrix, nothing to normalize
            return;
        }

        // For non-empty matrices, data pointer must be valid
        if (this->data == nullptr)
        {
            std::cerr << "[Error] normalize: matrix data pointer is null\n";
            return;
        }

        float n = this->norm();

        // Check for invalid norm (NaN, Inf, or too small)
        if (!(n > TINY_MATH_MIN_POSITIVE_INPUT_F32))
        {
            if (n == 0.0f)
            {
                std::cerr << "[Warning] normalize: matrix norm is zero (matrix is all zeros), "
                          << "normalization skipped\n";
            }
            else if (std::isnan(n) || std::isinf(n))
            {
                std::cerr << "[Error] normalize: matrix norm is invalid (NaN or Inf), "
                          << "normalization skipped\n";
            }
            else
            {
                std::cerr << "[Warning] normalize: matrix norm is too small (" << n 
                          << "), normalization skipped\n";
            }
            return;
        }

        // Normalize each element
        for (int i = 0; i < this->row; ++i)
        {
            for (int j = 0; j < this->col; ++j)
            {
                (*this)(i, j) /= n;
            }
        }
    }

    /**
     * @name Mat::norm()
     * @brief Compute the Frobenius norm (Euclidean norm) of the matrix.
     * @note Frobenius norm: ||M||_F = sqrt(Σ M_ij²)
     *       This is the square root of the sum of squares of all matrix elements.
     * @note For empty matrices, returns 0.0f.
     * @note For zero matrices, returns 0.0f.
     * 
     * @return float The Frobenius norm of the matrix, or 0.0f on error
     */
    float Mat::norm() const
    {
        // Validate matrix dimensions
        if (this->row < 0 || this->col < 0)
        {
            std::cerr << "[Error] norm: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return 0.0f;
        }

        // Handle empty matrix
        if (this->row == 0 || this->col == 0)
        {
            return 0.0f;
        }

        // For non-empty matrices, data pointer must be valid
        if (this->data == nullptr)
        {
            std::cerr << "[Error] norm: matrix data pointer is null\n";
            return 0.0f;
        }

        float sum_sq = 0.0f;
        for (int i = 0; i < this->row; ++i)
        {
            for (int j = 0; j < this->col; ++j)
            {
                float val = (*this)(i, j);
                sum_sq += val * val;
            }
        }

        // Compute square root
        // Note: sum_sq should always be non-negative (sum of squares)
        float result = sqrtf(sum_sq);

        // Safety check: if result is invalid, return 0.0f
        if (std::isnan(result) || std::isinf(result))
        {
            std::cerr << "[Warning] norm: computed norm is invalid (NaN or Inf), returning 0.0f\n";
            return 0.0f;
        }

        return result;
    }

    /**
     * @name Mat::inverse_adjoint()
     * @brief Compute the inverse matrix using the adjoint method.
     * @note Formula: A^(-1) = (1 / det(A)) * adj(A)
     *       where adj(A) is the adjoint (transpose of cofactor matrix).
     * @note WARNING: This method is SLOW for large matrices!
     *       - Time complexity: O(n² × n!) - exponential growth with matrix size
     *       - For n×n matrix, requires computing n² determinants of (n-1)×(n-1) matrices
     *       - Each determinant calculation uses Laplace expansion (O(n!) complexity)
     *       - Example: 4×4 matrix needs 16 determinants of 3×3 matrices
     *       - Example: 5×5 matrix needs 25 determinants of 4×4 matrices (very slow!)
     * @note Performance comparison:
     *       - 2×2 matrix: Fast (direct formula)
     *       - 3×3 matrix: Acceptable
     *       - 4×4 matrix: Slow but usable
     *       - 5×5+ matrix: VERY SLOW - use other methods instead
     * @note For larger matrices (n >= 4), strongly recommend using:
     *       - inverse_gje() (Gauss-Jordan elimination, O(n³))
     *       - LU decomposition methods (O(n³), more stable)
     * @note This method is mainly useful for:
     *       - Small matrices (n <= 3) where simplicity is preferred
     *       - Educational purposes to understand the adjoint formula
     *       - Cases where you need the adjoint matrix anyway
     * 
     * @return Mat The inverse matrix, or empty Mat() on error
     */
    Mat Mat::inverse_adjoint()
    {
        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] inverse_adjoint: matrix data pointer is null\n";
            return Mat();
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] inverse_adjoint: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            return Mat();
        }

        // Check if matrix is square
        if (this->row != this->col)
        {
            std::cerr << "[Error] inverse_adjoint: requires square matrix (got " 
                      << this->row << "x" << this->col << ")\n";
            return Mat();
        }

        // Compute determinant
        float det = this->determinant();

        // Check if determinant is valid
        if (std::isnan(det) || std::isinf(det))
        {
            std::cerr << "[Error] inverse_adjoint: determinant is invalid (NaN or Inf)\n";
            return Mat();
        }

        // Check if matrix is singular (determinant is zero or too small)
        if (fabsf(det) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
        {
            std::cerr << "[Error] inverse_adjoint: matrix is singular (det=" << det 
                      << "), cannot compute inverse\n";
            return Mat();
        }

        // Compute adjoint matrix
        Mat adj = this->adjoint();

        // Check if adjoint computation was successful
        if (adj.data == nullptr)
        {
            std::cerr << "[Error] inverse_adjoint: failed to compute adjoint matrix\n";
            return Mat();
        }

        // Check if adjoint matrix has correct dimensions
        if (adj.row != this->row || adj.col != this->col)
        {
            std::cerr << "[Error] inverse_adjoint: adjoint matrix has incorrect dimensions: " 
                      << adj.row << "x" << adj.col << " (expected " << this->row << "x" << this->col << ")\n";
            return Mat();
        }

        // Compute inverse: A^(-1) = (1 / det(A)) * adj(A)
        float inv_det = 1.0f / det;
        Mat result = adj * inv_det;

        // Check if matrix multiplication was successful
        if (result.data == nullptr)
        {
            std::cerr << "[Error] inverse_adjoint: failed to compute inverse matrix (multiplication failed)\n";
            return Mat();
        }

        return result;
    }

    /**
     * @name Mat::dotprod(const Mat &A, const Mat &B)
     * @brief Compute the dot product (Frobenius inner product) of two matrices.
     * @note Mathematical definition: A · B = Σ A_ij * B_ij (sum over all elements)
     *       This is the element-wise multiplication followed by summation.
     * @note Also known as:
     *       - Frobenius inner product
     *       - Hadamard product sum
     *       - Element-wise dot product
     * @note For vectors, this is equivalent to the standard dot product.
     * @note For empty matrices, returns 0.0f.
     *
     * @param A First matrix
     * @param B Second matrix
     * @return float Dot product value, or 0.0f on error
     */
    float Mat::dotprod(const Mat &A, const Mat &B)
    {
        // Check for null pointers
        if (A.data == nullptr)
        {
            std::cerr << "[Error] dotprod: matrix A data pointer is null\n";
            return 0.0f;
        }

        if (B.data == nullptr)
        {
            std::cerr << "[Error] dotprod: matrix B data pointer is null\n";
            return 0.0f;
        }

        // Validate matrix dimensions
        if (A.row <= 0 || A.col <= 0)
        {
            std::cerr << "[Error] dotprod: invalid matrix A dimensions: rows=" 
                      << A.row << ", cols=" << A.col << "\n";
            return 0.0f;
        }

        if (B.row <= 0 || B.col <= 0)
        {
            std::cerr << "[Error] dotprod: invalid matrix B dimensions: rows=" 
                      << B.row << ", cols=" << B.col << "\n";
            return 0.0f;
        }

        // Check if matrices have the same size
        if (A.row != B.row || A.col != B.col)
        {
            std::cerr << "[Error] dotprod: matrices must have the same size (A: " 
                      << A.row << "x" << A.col << ", B: " << B.row << "x" << B.col << ")\n";
            return 0.0f;
        }

        // Handle empty matrices
        if (A.row == 0 || A.col == 0)
        {
            return 0.0f;
        }

        // Compute dot product: sum of element-wise products
        float result = 0.0f;
        for (int i = 0; i < A.row; ++i)
        {
            for (int j = 0; j < A.col; ++j)
            {
                result += A(i, j) * B(i, j);
            }
        }

        return result;
    }

    // ============================================================================
    // Linear Algebra - Matrix Utilities
    // ============================================================================
    /**
     * @name Mat::eye(int size)
     * @brief Create an identity matrix (unit matrix) of specified size.
     * @note Identity matrix I has:
     *       - I_ij = 1 if i == j (diagonal elements)
     *       - I_ij = 0 if i != j (off-diagonal elements)
     * @note Properties:
     *       - I * A = A (left identity)
     *       - A * I = A (right identity)
     *       - I^(-1) = I (self-inverse)
     * @note For size = 0, returns empty matrix.
     *
     * @param size Size of the square identity matrix (must be >= 0)
     * @return Mat Identity matrix, or empty Mat() on error
     */
    Mat Mat::eye(int size)
    {
        // Validate parameter
        if (size < 0)
        {
            std::cerr << "[Error] eye: size must be non-negative (got " << size << ")\n";
            return Mat();
        }

        // Handle size = 0 (empty matrix)
        if (size == 0)
        {
            return Mat(0, 0);
        }

        Mat identity(size, size);

        // Check if matrix was created successfully
        if (identity.data == nullptr)
        {
            std::cerr << "[Error] eye: failed to create identity matrix of size " << size << "\n";
            return Mat();
        }

        // Initialize identity matrix: diagonal = 1, others = 0
        // Note: Mat constructor already initializes to 0, so we only need to set diagonal
        for (int i = 0; i < size; ++i)
        {
            identity(i, i) = 1.0f;  // Set diagonal elements to 1
            // Off-diagonal elements are already 0 from constructor
        }

        return identity;
    }

    /**
     * @name Mat::ones(int rows, int cols)
     * @brief Create a matrix filled with ones (all elements = 1.0f).
     * @note Creates a matrix where every element is 1.0f.
     * @note For rows = 0 or cols = 0, returns empty matrix.
     *
     * @param rows Number of rows (must be >= 0)
     * @param cols Number of columns (must be >= 0)
     * @return Mat Matrix filled with ones, or empty Mat() on error
     */
    Mat Mat::ones(int rows, int cols)
    {
        // Validate parameters
        if (rows < 0 || cols < 0)
        {
            std::cerr << "[Error] ones: dimensions must be non-negative (got rows=" 
                      << rows << ", cols=" << cols << ")\n";
            return Mat();
        }

        // Handle empty matrix
        if (rows == 0 || cols == 0)
        {
            return Mat(rows, cols);
        }

        Mat result(rows, cols);

        // Check if matrix was created successfully
        if (result.data == nullptr)
        {
            std::cerr << "[Error] ones: failed to create matrix of size " 
                      << rows << "x" << cols << "\n";
            return Mat();
        }

        // Fill all elements with 1.0f
        for (int i = 0; i < rows; ++i)
        {
            for (int j = 0; j < cols; ++j)
            {
                result(i, j) = 1.0f;
            }
        }

        return result;
    }

    /**
     * @name Mat::ones(int size)
     * @brief Create a square matrix filled with ones (all elements = 1.0f).
     * @note Convenience function that creates a size×size matrix filled with ones.
     *       Equivalent to ones(size, size).
     * @note For size = 0, returns empty matrix.
     *
     * @param size Size of the square matrix (rows = cols, must be >= 0)
     * @return Mat Square matrix [size x size] with all elements = 1, or empty Mat() on error
     */
    Mat Mat::ones(int size)
    {
        // All validation is handled by ones(size, size)
        return Mat::ones(size, size);
    }

    /**
     * @name Mat::augment(const Mat &A, const Mat &B)
     * @brief Augment two matrices horizontally [A | B] (concatenate columns).
     * @note Creates a new matrix by placing B to the right of A.
     *       Result: [A | B] with dimensions (rows, A.cols + B.cols)
     *       where rows must be the same for both matrices.
     * @note Common use case: Creating augmented matrix [A | b] for solving Ax = b
     *       using Gaussian elimination.
     * @note For empty matrices, returns empty matrix if dimensions are valid.
     *
     * @param A Left matrix
     * @param B Right matrix
     * @return Mat Augmented matrix [A B], or empty Mat() on error
     */
    Mat Mat::augment(const Mat &A, const Mat &B)
    {
        // Validate matrix dimensions
        if (A.row < 0 || A.col < 0)
        {
            std::cerr << "[Error] augment: invalid matrix A dimensions: rows=" 
                      << A.row << ", cols=" << A.col << "\n";
            return Mat();
        }

        if (B.row < 0 || B.col < 0)
        {
            std::cerr << "[Error] augment: invalid matrix B dimensions: rows=" 
                      << B.row << ", cols=" << B.col << "\n";
            return Mat();
        }

        // Check if row counts match
        if (A.row != B.row)
        {
            std::cerr << "[Error] augment: row counts must match (A: " 
                      << A.row << ", B: " << B.row << ")\n";
            return Mat();
        }

        // Check for integer overflow in column sum
        if (A.col > INT_MAX - B.col)
        {
            std::cerr << "[Error] augment: combined column count too large, integer overflow "
                      << "(A.col=" << A.col << ", B.col=" << B.col << ")\n";
            return Mat();
        }

        // For non-empty matrices, data pointers must be valid.
        // Empty matrices (rows==0 or cols==0) are allowed to have nullptr data.
        if (A.row > 0 && A.col > 0 && A.data == nullptr)
        {
            std::cerr << "[Error] augment: matrix A data pointer is null\n";
            return Mat();
        }
        if (B.row > 0 && B.col > 0 && B.data == nullptr)
        {
            std::cerr << "[Error] augment: matrix B data pointer is null\n";
            return Mat();
        }

        // Create new matrix with combined columns
        Mat AB(A.row, A.col + B.col);

        // Check if matrix was created successfully
        if (AB.data == nullptr)
        {
            std::cerr << "[Error] augment: failed to create augmented matrix of size " 
                      << A.row << "x" << (A.col + B.col) << "\n";
            return Mat();
        }

        // Handle empty matrices
        if (A.row == 0)
        {
            // Empty result matrix already created, return it
            return AB;
        }

        // Copy data from A and B
        for (int i = 0; i < A.row; ++i)
        {
            // Copy A (left part)
            for (int j = 0; j < A.col; ++j)
            {
                AB(i, j) = A(i, j);
            }
            // Copy B (right part)
            for (int j = 0; j < B.col; ++j)
            {
                AB(i, A.col + j) = B(i, j);
            }
        }

        return AB;
    }

    /**
     * @name Mat::vstack(const Mat &A, const Mat &B)
     * @brief Vertically stack two matrices [A; B] (concatenate rows).
     * @note Creates a new matrix by placing B below A.
     *       Result: [A; B] with dimensions (A.rows + B.rows, cols)
     *       where cols must be the same for both matrices.
     * @note Common use case: Combining data from multiple sources vertically,
     *       or building block matrices in linear algebra operations.
     * @note For empty matrices, returns empty matrix if dimensions are valid.
     *
     * @param A Top matrix
     * @param B Bottom matrix
     * @return Mat Vertically stacked matrix [A; B], or empty Mat() on error
     */
    Mat Mat::vstack(const Mat &A, const Mat &B)
    {
        // Validate matrix dimensions
        if (A.row < 0 || A.col < 0)
        {
            std::cerr << "[Error] vstack: invalid matrix A dimensions: rows=" 
                      << A.row << ", cols=" << A.col << "\n";
            return Mat();
        }

        if (B.row < 0 || B.col < 0)
        {
            std::cerr << "[Error] vstack: invalid matrix B dimensions: rows=" 
                      << B.row << ", cols=" << B.col << "\n";
            return Mat();
        }

        // Check if column counts match
        if (A.col != B.col)
        {
            std::cerr << "[Error] vstack: column counts must match (A: " 
                      << A.col << ", B: " << B.col << ")\n";
            return Mat();
        }

        // Check for integer overflow in row sum
        if (A.row > INT_MAX - B.row)
        {
            std::cerr << "[Error] vstack: combined row count too large, integer overflow "
                      << "(A.row=" << A.row << ", B.row=" << B.row << ")\n";
            return Mat();
        }

        // For non-empty matrices, data pointers must be valid.
        // Empty matrices (rows==0 or cols==0) are allowed to have nullptr data.
        if (A.row > 0 && A.col > 0 && A.data == nullptr)
        {
            std::cerr << "[Error] vstack: matrix A data pointer is null\n";
            return Mat();
        }
        if (B.row > 0 && B.col > 0 && B.data == nullptr)
        {
            std::cerr << "[Error] vstack: matrix B data pointer is null\n";
            return Mat();
        }

        // Create new matrix with combined rows
        Mat AB(A.row + B.row, A.col);

        // Check if matrix was created successfully
        if (AB.data == nullptr)
        {
            std::cerr << "[Error] vstack: failed to create stacked matrix of size " 
                      << (A.row + B.row) << "x" << A.col << "\n";
            return Mat();
        }

        // Handle empty matrices
        if (A.col == 0)
        {
            // Empty result matrix already created, return it
            return AB;
        }

        // Copy data from A and B
        // Copy A (top rows)
        for (int i = 0; i < A.row; ++i)
        {
            for (int j = 0; j < A.col; ++j)
            {
                AB(i, j) = A(i, j);
            }
        }
        // Copy B (bottom rows)
        for (int i = 0; i < B.row; ++i)
        {
            for (int j = 0; j < B.col; ++j)
            {
                AB(A.row + i, j) = B(i, j);
            }
        }

        return AB;
    }

    /**
     * @name Mat::gram_schmidt_orthogonalize()
     * @brief QR-style orthogonalization of a column-vector set via Modified
     *        Gram-Schmidt (MGS) with conditional twice-reorthogonalization.
     *
     * Given an input matrix A whose columns are the vectors a_0..a_{n-1}, this
     * function produces:
     *   - Q (orthogonal_vectors, m x n) : columns q_0..q_{n-1} are orthonormal
     *                                      (Q^T * Q = I to working precision).
     *   - R (coefficients,       n x n) : upper triangular such that A = Q * R,
     *                                      where R(k,j) is the projection of
     *                                      a_j onto q_k.
     *
     * Algorithmic notes
     * -----------------
     *  - We use Modified Gram-Schmidt rather than Classical GS: each projection
     *    is subtracted immediately, which is much more numerically stable.
     *  - A second MGS sweep (Kahan/Parlett "twice is enough") is performed
     *    *only* when pass-1 cancelled away more than ~30 % of the original
     *    norm. This recovers full orthogonality on ill-conditioned inputs while
     *    saving roughly half the FLOPs on well-conditioned inputs.
     *  - When a column is linearly dependent on the previous q's, we set
     *    R(j,j) = 0 and synthesize a substitute q_j by orthogonalizing the
     *    first standard basis vector e_b that survives the projection. This
     *    keeps Q a complete orthonormal set without disturbing A = Q*R
     *    (because the dependent column is fully described by R(0..j-1, j)).
     *
     * @param vectors             Input matrix; columns are the vectors a_j.
     * @param orthogonal_vectors  [out] Q (m x n), orthonormal columns.
     * @param coefficients        [out] R (n x n), upper triangular.
     * @param tolerance           Rank threshold; columns whose post-orthogonal
     *                            norm falls below max(tolerance, 1e-5f) are
     *                            treated as linearly dependent. Must be >= 0.
     *                            The 1e-5f floor is enforced to keep
     *                            single-precision noise from being mistaken
     *                            for a real direction.
     * @return true on success; false on invalid input.
     */
    bool Mat::gram_schmidt_orthogonalize(const Mat &vectors, Mat &orthogonal_vectors,
                                         Mat &coefficients, float tolerance)
    {
        // ----------------------------------------------------------------
        // 1) Input validation
        // ----------------------------------------------------------------
        if (vectors.data == nullptr)
        {
            std::cerr << "[Error] gram_schmidt_orthogonalize: input matrix is null.\n";
            return false;
        }

        const int m = vectors.row;  // dimension of each column vector
        const int n = vectors.col;  // number of column vectors

        if (m <= 0 || n <= 0)
        {
            std::cerr << "[Error] gram_schmidt_orthogonalize: invalid dimensions (m="
                      << m << ", n=" << n << ")\n";
            return false;
        }
        if (tolerance < 0.0f)
        {
            std::cerr << "[Error] gram_schmidt_orthogonalize: tolerance must be non-negative (got "
                      << tolerance << ")\n";
            return false;
        }

        // ----------------------------------------------------------------
        // 2) Allocate outputs
        //    Q : m x n  -- orthonormal columns
        //    R : n x n  -- upper-triangular coefficients (A = Q * R)
        // ----------------------------------------------------------------
        orthogonal_vectors = Mat(m, n);
        coefficients       = Mat(n, n);
        if (orthogonal_vectors.data == nullptr || coefficients.data == nullptr)
        {
            std::cerr << "[Error] gram_schmidt_orthogonalize: output allocation failed.\n";
            return false;
        }
        coefficients.clear();  // ensure strictly-lower & off-band entries are 0

        // ----------------------------------------------------------------
        // 3) Effective thresholds (kept in single source of truth)
        // ----------------------------------------------------------------
        // Single-precision rank floor: below this we cannot trust the value
        // of ||q_j|| against rounding noise, so we always clamp here.
        constexpr float kRankFloor    = 1e-5f;
        // Kahan's "twice is enough" trigger: 1/sqrt(2). If pass-1 left the
        // working column shorter than this fraction of its original length,
        // significant cancellation occurred and a second MGS pass is needed.
        constexpr float kReorthRatio  = 0.7071068f;

        const float rank_tol = (tolerance > kRankFloor) ? tolerance : kRankFloor;

        // ----------------------------------------------------------------
        // 4) Local column-wise primitives on Q
        //    The matrix is row-major with stride 'step', so columns are not
        //    contiguous; we rely on the inline operator() in the header.
        //    Lambdas below are zero-overhead after inlining.
        // ----------------------------------------------------------------
        Mat &Q = orthogonal_vectors;
        Mat &R = coefficients;

        // <Q(:,k), Q(:,j)>
        auto col_dot = [&](int k, int j) -> float {
            float s = 0.0f;
            for (int i = 0; i < m; ++i) s += Q(i, k) * Q(i, j);
            return s;
        };
        // ||Q(:,j)||_2
        auto col_norm = [&](int j) -> float {
            float s = 0.0f;
            for (int i = 0; i < m; ++i) { const float v = Q(i, j); s += v * v; }
            return sqrtf(s);
        };
        // Q(:,j) <- Q(:,j) - c * Q(:,k)
        auto col_axpy = [&](int j, int k, float c) {
            for (int i = 0; i < m; ++i) Q(i, j) -= c * Q(i, k);
        };
        // One MGS sweep: orthogonalize Q(:,j) against Q(:,0..j-1).
        //   write_R == true  -> dot products are recorded in R(k,j)
        //   accumulate       -> add to R(k,j) (used by pass-2) instead of overwrite
        auto mgs_sweep = [&](int j, bool write_R, bool accumulate) {
            for (int k = 0; k < j; ++k)
            {
                const float dot = col_dot(k, j);
                if (write_R)
                {
                    if (accumulate) R(k, j) += dot;
                    else            R(k, j)  = dot;
                }
                col_axpy(j, k, dot);
            }
        };

        // ----------------------------------------------------------------
        // 5) Main orthogonalization loop
        // ----------------------------------------------------------------
        for (int j = 0; j < n; ++j)
        {
            // 5.1 Initialize working column: Q(:,j) <- A(:,j)
            for (int i = 0; i < m; ++i) Q(i, j) = vectors(i, j);

            // 5.2 First MGS pass (always); records R(k,j) for k < j
            const float norm_before = col_norm(j);   // = ||A(:,j)||
            mgs_sweep(j, /*write_R=*/true, /*accumulate=*/false);
            float norm_after = col_norm(j);

            // 5.3 Conditional second MGS pass (Kahan)
            //     Triggered only when significant cancellation occurred.
            if (norm_after < kReorthRatio * norm_before)
            {
                mgs_sweep(j, /*write_R=*/true, /*accumulate=*/true);
                norm_after = col_norm(j);
            }

            // 5.4 Regular case: column is independent -> normalize and store R(j,j)
            if (norm_after >= rank_tol)
            {
                R(j, j) = norm_after;
                const float inv = 1.0f / norm_after;
                for (int i = 0; i < m; ++i) Q(i, j) *= inv;
                continue;
            }

            // 5.5 Linearly-dependent case: synthesize a substitute q_j
            //
            // Mathematically a_j ∈ span{q_0..q_{j-1}}, so R(j,j) = 0 and the
            // already-written R(0..j-1, j) reconstruct a_j via Q*R. We still
            // need a unit q_j orthogonal to all previous columns so that Q
            // remains a complete orthonormal set (consumers such as SVD or
            // null-space queries rely on this).
            //
            // Strategy: try standard basis vectors e_0, e_1, ..., e_{m-1};
            // keep the first one whose projected residual is large enough.
            // Re-orthogonalization is applied unconditionally here because an
            // e_b that lies almost inside span{q_0..q_{j-1}} loses most of
            // its norm in pass-1 and is numerically fragile.
            R(j, j) = 0.0f;
            bool found = false;

            for (int b = 0; b < m && !found; ++b)
            {
                for (int i = 0; i < m; ++i) Q(i, j) = (i == b) ? 1.0f : 0.0f;

                mgs_sweep(j, /*write_R=*/false, /*accumulate=*/false);  // pass 1
                mgs_sweep(j, /*write_R=*/false, /*accumulate=*/false);  // pass 2

                const float new_norm = col_norm(j);
                if (new_norm > kRankFloor)
                {
                    const float inv = 1.0f / new_norm;
                    for (int i = 0; i < m; ++i) Q(i, j) *= inv;
                    found = true;
                }
            }

            // 5.6 Defensive fallback: only reachable when n > m, i.e. the
            //     caller asked for more orthonormal columns than the ambient
            //     dimension can supply. Leave Q(:,j) as a zero vector.
            if (!found)
            {
                for (int i = 0; i < m; ++i) Q(i, j) = 0.0f;
            }
        }

        return true;
    }

    // ============================================================================
    // Linear Algebra - Matrix Operations
    // ============================================================================
    /**
     * @name Mat::minor(int target_row, int target_col) const
     * @brief Submatrix obtained by deleting one row and one column.
     *
     * For an m x n input, the result is (m-1) x (n-1). This is purely a
     * geometric "delete a row, delete a column" operation and is therefore
     * defined on ANY rectangular matrix; it does NOT require squareness.
     * (Squareness is only needed when the caller subsequently takes a
     * determinant, e.g. for the minor value or cofactor value -- which is
     * why cofactor() is the function that enforces it.)
     *
     * Edge case: if either dimension would collapse to 0 after the deletion
     * (i.e. m == 1 or n == 1), the result is the canonical empty Mat(0, 0).
     *
     * Naming reference:
     *   - minor   matrix  M_ij = this function's return value.
     *   - minor   value   m_ij = det(M_ij)             [requires square]
     *   - cofactor value  C_ij = (-1)^(i+j) * det(M_ij) [requires square]
     *
     * @param target_row Row index to remove (0-based, in [0, this->row)).
     * @param target_col Column index to remove (0-based, in [0, this->col)).
     * @return Mat The (m-1)x(n-1) submatrix; Mat(0, 0) on invalid input or
     *             when an output dimension would be zero.
     */
    Mat Mat::minor(int target_row, int target_col) const
    {
        // ----------------------------------------------------------------
        // 1) Input validation
        // ----------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] minor: matrix data pointer is null\n";
            return Mat(0, 0);
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] minor: invalid matrix dimensions (rows="
                      << this->row << ", cols=" << this->col << ")\n";
            return Mat(0, 0);
        }

        const int rows = this->row;
        const int cols = this->col;

        if (target_row < 0 || target_row >= rows)
        {
            std::cerr << "[Error] minor: target_row=" << target_row
                      << " is out of range [0, " << (rows - 1) << "]\n";
            return Mat(0, 0);
        }
        if (target_col < 0 || target_col >= cols)
        {
            std::cerr << "[Error] minor: target_col=" << target_col
                      << " is out of range [0, " << (cols - 1) << "]\n";
            return Mat(0, 0);
        }

        // Degenerate cases: deleting the only row or the only column leaves
        // a matrix with a zero dimension. Collapse all such results to the
        // canonical Mat(0, 0) for predictable downstream handling.
        if (rows == 1 || cols == 1)
        {
            return Mat(0, 0);
        }

        // ----------------------------------------------------------------
        // 2) Allocate result : (rows-1) x (cols-1)
        // ----------------------------------------------------------------
        Mat result(rows - 1, cols - 1);
        if (result.data == nullptr)
        {
            std::cerr << "[Error] minor: failed to create result matrix\n";
            return Mat(0, 0);
        }

        // ----------------------------------------------------------------
        // 3) Bulk copy with row-skip and column-skip
        //
        //    Each surviving row is copied as at most TWO contiguous chunks:
        //
        //      src row i :  [ 0 .. tc-1 | tc | tc+1 .. cols-1 ]
        //                    └─ left ─┘  skip  └──  right  ──┘
        //      dst row   :  [ 0 .. tc-1     | tc .. cols-2    ]
        //
        //    This eliminates the inner per-element branch (j == target_col)
        //    and lets std::memcpy use the platform's optimized block copy
        //    (vectorized / aligned). Asymptotic cost is still O(rows*cols)
        //    but the per-iteration constant drops several-fold for large
        //    matrices. (Especially valuable inside O(n!) Laplace recursion.)
        // ----------------------------------------------------------------
        const int src_step    = this->step;
        const int dst_step    = result.step;
        const int left_count  = target_col;                  // cols [0, tc)
        const int right_count = (cols - 1) - target_col;     // cols (tc, cols)

        int res_i = 0;
        for (int i = 0; i < rows; ++i)
        {
            if (i == target_row)
            {
                continue;
            }

            const float *src_row = this->data  + i     * src_step;
            float       *dst_row = result.data + res_i * dst_step;

            // Left chunk: columns [0, target_col)
            if (left_count > 0)
            {
                std::memcpy(dst_row,
                            src_row,
                            sizeof(float) * static_cast<size_t>(left_count));
            }
            // Right chunk: columns (target_col, cols), shifted left by 1 in dst
            if (right_count > 0)
            {
                std::memcpy(dst_row + left_count,
                            src_row + target_col + 1,
                            sizeof(float) * static_cast<size_t>(right_count));
            }

            ++res_i;
        }

        return result;
    }

    /**
     * @name Mat::cofactor(int target_row, int target_col) const
     * @brief Cofactor submatrix (same shape/contents as the minor matrix).
     *
     * The (i,j) cofactor matrix C_ij equals the minor matrix M_ij; the only
     * difference is semantic: cofactor() is the entry point used when the
     * caller is going to apply the sign (-1)^(i+j) and take a determinant
     * to obtain the cofactor VALUE. Because the cofactor value requires
     * det(M_ij), the input must be SQUARE -- this function therefore
     * enforces squareness, while minor() allows any m x n shape.
     *
     * @note Cofactor VALUE computation:
     *       float val = ((i + j) % 2 == 0 ? 1.0f : -1.0f) *
     *                   A.cofactor(i, j).determinant();
     *
     * @param target_row Row index to remove (0-based, in [0, n)).
     * @param target_col Column index to remove (0-based, in [0, n)).
     * @return Mat The (n-1) x (n-1) cofactor submatrix; Mat(0, 0) on error
     *             (non-square, out-of-range index, or 1x1 input).
     */
    Mat Mat::cofactor(int target_row, int target_col) const
    {
        // Cofactor is only well-defined on square matrices, since downstream
        // it is consumed via det(M_ij). Validate squareness here, then
        // delegate the actual submatrix extraction to minor().
        if (this->row != this->col)
        {
            std::cerr << "[Error] cofactor: requires a square matrix (got "
                      << this->row << "x" << this->col << ")\n";
            return Mat(0, 0);
        }
        return this->minor(target_row, target_col);
    }

    /**
     * @name Mat::gaussian_eliminate
     * @brief Gaussian elimination with partial pivoting -> Row Echelon Form.
     *
     * Reduces the matrix to upper-triangular Row Echelon Form (REF) via
     * elementary row operations (row swaps + row additions).
     *
     * Numerical strategy
     * ------------------
     *  - **Partial pivoting**: at each step, the row with the LARGEST
     *    |element| in the current column (within rows [r, rows)) is chosen
     *    as the pivot row and swapped into row r. This bounds the
     *    multiplier |factor| <= 1 and keeps roundoff under control. This
     *    is the standard textbook recipe used by LAPACK's *getrf, and is
     *    materially more stable than picking the first non-zero entry.
     *  - **Exact zero in the eliminated column**: after subtracting the
     *    pivot row, the entry directly under the pivot is FORCED to 0.0f
     *    (it is mathematically zero; this avoids visible roundoff in REF
     *    structure without polluting the rest of the row).
     *  - Other entries are left as they actually compute -- any small
     *    "denormal noise" floor decision is left to the caller.
     *
     * Algorithmic complexity: O(rows * cols * min(rows, cols))
     *
     * @return Mat REF copy of the input. Mat(0, 0) on invalid input;
     *             a copy of the (empty) input when row == 0 or col == 0.
     */
    Mat Mat::gaussian_eliminate() const
    {
        // ----------------------------------------------------------------
        // 1) Input validation
        // ----------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] gaussian_eliminate: matrix data pointer is null\n";
            return Mat(0, 0);
        }
        if (this->row < 0 || this->col < 0)
        {
            std::cerr << "[Error] gaussian_eliminate: invalid matrix dimensions (rows="
                      << this->row << ", cols=" << this->col << ")\n";
            return Mat(0, 0);
        }
        // Valid empty matrix (one or both dims == 0): nothing to do.
        if (this->row == 0 || this->col == 0)
        {
            return Mat(*this);
        }

        // ----------------------------------------------------------------
        // 2) Working copy
        // ----------------------------------------------------------------
        Mat result(*this);
        if (result.data == nullptr)
        {
            std::cerr << "[Error] gaussian_eliminate: failed to create working copy\n";
            return Mat(0, 0);
        }

        const int rows = result.row;
        const int cols = result.col;
        const int rstep = result.step;

        // Pivot acceptance threshold. TINY_MATH_MIN_POSITIVE_INPUT_F32 is
        // the project-wide "non-zero" floor. With partial pivoting we
        // declare the column "pivot-less" only if even its LARGEST entry
        // sits below this threshold, so this check is much less aggressive
        // than the old "first non-zero" pattern.
        const float pivot_tol = TINY_MATH_MIN_POSITIVE_INPUT_F32;

        // ----------------------------------------------------------------
        // 3) Main elimination loop
        //    Independently advance row index 'r' and column index 'lead'.
        //    When the current column has no usable pivot, advance 'lead'
        //    only and try again with the same 'r' (these rows then take a
        //    "free" position, contributing zeros to the rank/echelon).
        // ----------------------------------------------------------------
        int r    = 0;
        int lead = 0;

        while (r < rows && lead < cols)
        {
            // 3.1 Find pivot row: argmax_{i in [r, rows)} |A(i, lead)|
            int   pivot_row = r;
            float pivot_abs = fabsf(result.data[r * rstep + lead]);
            for (int i = r + 1; i < rows; ++i)
            {
                const float v = fabsf(result.data[i * rstep + lead]);
                if (v > pivot_abs)
                {
                    pivot_abs = v;
                    pivot_row = i;
                }
            }

            // 3.2 No usable pivot in this column -> skip to next column,
            //     keep r unchanged so this row tries again later.
            if (pivot_abs < pivot_tol)
            {
                ++lead;
                continue;
            }

            // 3.3 Bring pivot row to position r
            if (pivot_row != r)
            {
                result.swap_rows(pivot_row, r);
            }

            // 3.4 Eliminate all rows below the pivot
            //     Row pointers hoisted out of the inner loop to avoid the
            //     repeated (i*step + j) address calculation; this also
            //     lets the compiler vectorize the AXPY across columns.
            float       *row_r  = result.data + r * rstep;
            const float  pivot  = row_r[lead];

            for (int j = r + 1; j < rows; ++j)
            {
                float *row_j = result.data + j * rstep;

                // Skip rows whose entry in the pivot column is already ~0
                if (fabsf(row_j[lead]) < pivot_tol)
                {
                    row_j[lead] = 0.0f;   // tidy any tiny residue
                    continue;
                }

                const float factor = row_j[lead] / pivot;
                // The eliminated column is mathematically zero after the
                // subtraction below; assign it directly to avoid visible
                // roundoff and shave one FMA per row.
                row_j[lead] = 0.0f;

                for (int k = lead + 1; k < cols; ++k)
                {
                    row_j[k] -= factor * row_r[k];
                }
            }

            // 3.5 Advance both indices: this pivot is consumed.
            ++r;
            ++lead;
        }

        return result;
    }

    /**
     * @name Mat::row_reduce_from_gaussian() const
     * @brief Convert a Row Echelon Form (REF) matrix to Reduced Row Echelon
     *        Form (RREF) by back-substitution.
     *
     * Contract: input is assumed to already be in REF (zeros below each
     * pivot, pivots strictly stair-step to the right). Calling this on
     * non-REF inputs is undefined behaviour by contract, though for
     * "almost REF" inputs it still degrades gracefully because each row's
     * leading non-zero is taken as that row's pivot.
     *
     * Algorithm
     * ---------
     *   For each row from the bottom up:
     *     a) Locate the pivot = first |entry| >= pivot_tol in that row.
     *        (Since the input is REF, this IS the pivot column.)
     *     b) Scale the row so the pivot becomes exactly 1.0.
     *     c) Subtract appropriate multiples of the row from every row
     *        above to zero out their entry in the pivot column.
     *
     * Numerical strategy
     * ------------------
     *  - The pivot column entry is set to its mathematically exact value
     *    (1.0 after normalize, 0.0 after eliminate-above) DIRECTLY,
     *    avoiding visible roundoff while keeping the rest of each row
     *    untouched (no per-element noise floor).
     *  - Inner loops are written against raw row pointers to skip the
     *    operator() index math and to give the compiler a clean AXPY/
     *    scaled-assign shape suitable for auto-vectorization.
     *
     * @return Mat RREF copy of the input. Mat(0, 0) on invalid input;
     *             a copy of the (empty) input when row == 0 or col == 0.
     */
    Mat Mat::row_reduce_from_gaussian() const
    {
        // ----------------------------------------------------------------
        // 1) Input validation
        // ----------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] row_reduce_from_gaussian: matrix data pointer is null\n";
            return Mat(0, 0);
        }
        if (this->row < 0 || this->col < 0)
        {
            std::cerr << "[Error] row_reduce_from_gaussian: invalid matrix dimensions (rows="
                      << this->row << ", cols=" << this->col << ")\n";
            return Mat(0, 0);
        }
        // Valid empty matrix: nothing to reduce.
        if (this->row == 0 || this->col == 0)
        {
            return Mat(*this);
        }

        // ----------------------------------------------------------------
        // 2) Working copy
        // ----------------------------------------------------------------
        Mat R(*this);
        if (R.data == nullptr)
        {
            std::cerr << "[Error] row_reduce_from_gaussian: failed to create working copy\n";
            return Mat(0, 0);
        }

        const int rows  = R.row;
        const int cols  = R.col;
        const int rstep = R.step;

        const float pivot_tol = TINY_MATH_MIN_POSITIVE_INPUT_F32;

        // ----------------------------------------------------------------
        // 3) Back-substitution loop: bottom row -> top row
        // ----------------------------------------------------------------
        for (int p = rows - 1; p >= 0; --p)
        {
            float *row_p = R.data + p * rstep;

            // 3.1 Locate pivot in row p (first |entry| >= pivot_tol).
            int   pivot_col = -1;
            float pivot_val = 0.0f;
            for (int k = 0; k < cols; ++k)
            {
                if (fabsf(row_p[k]) >= pivot_tol)
                {
                    pivot_col = k;
                    pivot_val = row_p[k];
                    break;
                }
            }
            if (pivot_col < 0)
            {
                // All-zero row -> nothing to do.
                continue;
            }

            // 3.2 Normalize the pivot row so that row_p[pivot_col] == 1.0.
            //     The pivot column is set to exact 1.0 directly; only
            //     columns to its right need the actual division.
            const float inv_pivot = 1.0f / pivot_val;
            row_p[pivot_col] = 1.0f;
            for (int s = pivot_col + 1; s < cols; ++s)
            {
                row_p[s] *= inv_pivot;
            }

            // 3.3 Eliminate above the pivot:
            //     row_t[s] -= factor * row_p[s]   for each row t < p,
            //     where factor = row_t[pivot_col] (the entry to clear).
            for (int t = p - 1; t >= 0; --t)
            {
                float       *row_t = R.data + t * rstep;
                const float  factor = row_t[pivot_col];

                if (fabsf(factor) < pivot_tol)
                {
                    row_t[pivot_col] = 0.0f;   // tidy any tiny residue
                    continue;
                }

                // Eliminated column is mathematically zero; assign
                // directly and start the inner loop one column later.
                row_t[pivot_col] = 0.0f;
                for (int s = pivot_col + 1; s < cols; ++s)
                {
                    row_t[s] -= factor * row_p[s];
                }
            }
        }

        return R;
    }

    /**
     * @name Mat::inverse_gje()
     * @brief Compute the inverse of a square matrix via Gauss-Jordan elimination.
     *
     * @details
     *   Algorithm (classical Gauss-Jordan on the augmented matrix):
     *     1. Build the augmented matrix M = [A | I] of size n x 2n.
     *     2. Reduce M to row-echelon form (REF) via gaussian_eliminate(), which
     *        applies partial pivoting (largest |pivot| in column).
     *     3. Reduce REF to reduced row-echelon form (RREF) via
     *        row_reduce_from_gaussian(), which back-substitutes to clear entries
     *        above each pivot and normalizes pivots to 1.
     *     4. If A is invertible, M reduces to [I | A^{-1}]; the right block is
     *        the inverse. Otherwise the left block is not identity and we abort.
     *
     *   Singularity test: after RREF, gaussian_eliminate + row_reduce_from_gaussian
     *   force pivot rows / columns to exact 1.0 / 0.0 (no roundoff). For an
     *   invertible A, every left-block column is a pivot column, so the left
     *   block equals I exactly. For a singular A, at least one column is a
     *   non-pivot column whose entries are structurally non-zero, far above
     *   TINY_MATH_MIN_POSITIVE_INPUT_F32. A small "any non-zero" threshold is
     *   therefore both necessary and sufficient.
     *
     * @note Complexity: O(n^3) time, O(n^2) extra memory (one n x 2n scratch
     *       matrix). Preferred over the adjoint method for n >= 4.
     * @note On any error (null data, non-square, singular, allocation failure)
     *       returns an empty matrix Mat(0, 0).
     *
     * @return Mat n x n inverse if A is invertible; empty Mat(0, 0) otherwise.
     */
    Mat Mat::inverse_gje() const
    {
        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] inverse_gje: matrix data pointer is null.\n";
            return Mat(0, 0);
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] inverse_gje: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            return Mat(0, 0);
        }
        if (this->row != this->col)
        {
            std::cerr << "[Error] inverse_gje: requires a square matrix (got "
                      << this->row << "x" << this->col << ").\n";
            return Mat(0, 0);
        }

        const int n = this->row;

        // ---------------------------------------------------------------------
        // 2. Build the augmented matrix M = [A | I]
        // ---------------------------------------------------------------------
        Mat I = Mat::eye(n);
        if (I.data == nullptr)
        {
            std::cerr << "[Error] inverse_gje: failed to allocate identity matrix.\n";
            return Mat(0, 0);
        }

        Mat augmented = Mat::augment(*this, I);
        if (augmented.data == nullptr)
        {
            std::cerr << "[Error] inverse_gje: failed to build augmented matrix.\n";
            return Mat(0, 0);
        }
        // augment() guarantees augmented.col == n + n on success; no extra check needed.

        // ---------------------------------------------------------------------
        // 3. Reduce [A | I] to RREF via REF -> RREF
        // ---------------------------------------------------------------------
        Mat rref = augmented.gaussian_eliminate();
        if (rref.data == nullptr)
        {
            std::cerr << "[Error] inverse_gje: gaussian_eliminate failed.\n";
            return Mat(0, 0);
        }

        rref = rref.row_reduce_from_gaussian();
        if (rref.data == nullptr)
        {
            std::cerr << "[Error] inverse_gje: row_reduce_from_gaussian failed.\n";
            return Mat(0, 0);
        }

        // ---------------------------------------------------------------------
        // 4. Singularity test: left block must equal identity
        //
        //    Because GE/RREF set pivot columns to exact 1.0 / 0.0, a fully
        //    pivoted (i.e. invertible) matrix yields an exact identity on the
        //    left. Any deviation > MIN_POSITIVE_INPUT signals a missing pivot.
        // ---------------------------------------------------------------------
        for (int i = 0; i < n; ++i)
        {
            const float* row_i = rref.data + i * rref.step;
            for (int j = 0; j < n; ++j)
            {
                const float expected = (i == j) ? 1.0f : 0.0f;
                if (fabsf(row_i[j] - expected) > TINY_MATH_MIN_POSITIVE_INPUT_F32)
                {
                    std::cerr << "[Error] inverse_gje: matrix is singular "
                              << "(left block not identity at (" << i << ", " << j
                              << "): expected " << expected << ", got " << row_i[j] << ").\n";
                    return Mat(0, 0);
                }
            }
        }

        // ---------------------------------------------------------------------
        // 5. Extract the right block as A^{-1} (one memcpy per row)
        // ---------------------------------------------------------------------
        Mat result(n, n);
        if (result.data == nullptr)
        {
            std::cerr << "[Error] inverse_gje: failed to allocate result matrix.\n";
            return Mat(0, 0);
        }

        const size_t row_bytes = sizeof(float) * static_cast<size_t>(n);
        for (int i = 0; i < n; ++i)
        {
            std::memcpy(result.data + i * result.step,
                        rref.data + i * rref.step + n,
                        row_bytes);
        }

        return result;
    }

    /**
     * @name Mat::solve
     * @brief Solve the linear system Ax = b using Gaussian elimination.
     *
     * @details
     *   The method builds the augmented matrix [A | b], reduces it to REF with
     *   gaussian_eliminate() (partial pivoting), then performs back-substitution.
     *   This keeps solve() numerically aligned with the shared elimination path
     *   used by inverse_gje().
     *
     * @note Time complexity: O(n^3). On invalid input, singular systems, or
     *       allocation failure, returns Mat(0, 0).
     *
     * @param A Coefficient matrix (N×N, must be square)
     * @param b Result vector (N×1)
     * @return Mat Solution vector (N×1) containing x such that Ax = b, or Mat(0, 0) on error.
     */
    Mat Mat::solve(const Mat &A, const Mat &b) const
    {
        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (A.data == nullptr)
        {
            std::cerr << "[Error] solve: matrix A data pointer is null.\n";
            return Mat(0, 0);
        }
        if (b.data == nullptr)
        {
            std::cerr << "[Error] solve: vector b data pointer is null.\n";
            return Mat(0, 0);
        }
        if (A.row <= 0 || A.col <= 0)
        {
            std::cerr << "[Error] solve: invalid matrix A dimensions: "
                      << A.row << "x" << A.col << ".\n";
            return Mat(0, 0);
        }
        if (b.row <= 0 || b.col <= 0)
        {
            std::cerr << "[Error] solve: invalid vector b dimensions: "
                      << b.row << "x" << b.col << ".\n";
            return Mat(0, 0);
        }
        if (A.row != A.col)
        {
            std::cerr << "[Error] solve: matrix A must be square (got "
                      << A.row << "x" << A.col << ").\n";
            return Mat(0, 0);
        }
        if (A.row != b.row || b.col != 1)
        {
            std::cerr << "[Error] solve: dimensions do not match (A: "
                      << A.row << "x" << A.col << ", b: " << b.row << "x" << b.col
                      << ", expected b: " << A.row << "x1).\n";
            return Mat(0, 0);
        }

        const int n = A.row;

        // ---------------------------------------------------------------------
        // 2. Build [A | b] and reduce it to REF
        // ---------------------------------------------------------------------
        Mat augmented = Mat::augment(A, b);
        if (augmented.data == nullptr)
        {
            std::cerr << "[Error] solve: failed to build augmented matrix.\n";
            return Mat(0, 0);
        }

        Mat ref = augmented.gaussian_eliminate();
        if (ref.data == nullptr)
        {
            std::cerr << "[Error] solve: gaussian_eliminate failed.\n";
            return Mat(0, 0);
        }

        // ---------------------------------------------------------------------
        // 3. Back-substitution on the upper triangular REF
        // ---------------------------------------------------------------------
        Mat solution(n, 1);
        if (solution.data == nullptr)
        {
            std::cerr << "[Error] solve: failed to allocate solution vector.\n";
            return Mat(0, 0);
        }

        const int ref_step = ref.step;
        const int rhs_col = n;
        for (int i = n - 1; i >= 0; --i)
        {
            const float* row_i = ref.data + i * ref_step;
            const float pivot = row_i[i];
            if (fabsf(pivot) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
            {
                std::cerr << "[Error] solve: zero or near-zero pivot at (" << i << ", " << i
                          << "), system is singular or rank-deficient.\n";
                return Mat(0, 0);
            }

            float sum = row_i[rhs_col];
            for (int j = i + 1; j < n; ++j)
            {
                sum -= row_i[j] * solution.data[j * solution.step];
            }
            solution.data[i * solution.step] = sum / pivot;
        }

        return solution;
    }

    /**
     * @name Mat::band_solve
     * @brief Solve Ax = b using band-limited Gaussian elimination.
     *
     * @details
     *   This routine assumes A is a square banded matrix whose non-zero entries
     *   lie within k total diagonals centered on the main diagonal. It avoids
     *   touching known-zero regions during elimination and back-substitution.
     *
     *   No pivoting is applied: row swaps generally widen the band and can
     *   destroy the purpose of a banded solver. If a diagonal pivot is tiny,
     *   the method fails fast; use solve() for the more robust partial-pivoting
     *   path.
     *
     * @note Time complexity: O(n * half_band^2) for symmetric narrow bands.
     * @note k is the total band width including the diagonal. For a tridiagonal
     *       matrix, k = 3 and half_band = 1.
     *
     * @param A Coefficient matrix (N×N) - banded matrix (passed by value, will be modified)
     * @param b Result vector (N×1) (passed by value, will be modified)
     * @param k Bandwidth of the matrix (must be >= 1 and odd, typically 3, 5, 7, ...)
     * @return Mat Solution vector (N×1) containing x such that Ax = b, or Mat(0, 0) on error.
     */
    Mat Mat::band_solve(Mat A, Mat b, int k) const
    {
        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (A.data == nullptr)
        {
            std::cerr << "[Error] band_solve: matrix A data pointer is null.\n";
            return Mat(0, 0);
        }
        if (b.data == nullptr)
        {
            std::cerr << "[Error] band_solve: vector b data pointer is null.\n";
            return Mat(0, 0);
        }
        if (A.row <= 0 || A.col <= 0)
        {
            std::cerr << "[Error] band_solve: invalid matrix A dimensions: "
                      << A.row << "x" << A.col << ".\n";
            return Mat(0, 0);
        }
        if (b.row <= 0 || b.col <= 0)
        {
            std::cerr << "[Error] band_solve: invalid vector b dimensions: "
                      << b.row << "x" << b.col << ".\n";
            return Mat(0, 0);
        }
        if (A.row != A.col)
        {
            std::cerr << "[Error] band_solve: matrix A must be square (got "
                      << A.row << "x" << A.col << ").\n";
            return Mat(0, 0);
        }
        if (A.row != b.row || b.col != 1)
        {
            std::cerr << "[Error] band_solve: dimensions do not match (A: "
                      << A.row << "x" << A.col << ", b: " << b.row << "x" << b.col
                      << ", expected b: " << A.row << "x1).\n";
            return Mat(0, 0);
        }
        if (k < 1)
        {
            std::cerr << "[Error] band_solve: bandwidth k must be >= 1 (got " << k << ").\n";
            return Mat(0, 0);
        }

        const int n = A.row;
        if (k > A.row)
        {
            std::cerr << "[Warning] band_solve: bandwidth k=" << k
                      << " is larger than matrix size " << n
                      << "; using general solve may be more efficient.\n";
        }

        const int half_band = (k - 1) / 2;
        const float pivot_tol = TINY_MATH_MIN_POSITIVE_INPUT_F32;

        // ---------------------------------------------------------------------
        // 2. Forward elimination within the band
        // ---------------------------------------------------------------------
        for (int i = 0; i < n; ++i)
        {
            float* row_i = A.data + i * A.step;
            const float pivot = row_i[i];
            if (fabsf(pivot) < pivot_tol)
            {
                std::cerr << "[Error] band_solve: zero or near-zero pivot at ("
                          << i << ", " << i << ") = " << pivot
                          << "; matrix is singular or requires pivoting.\n";
                return Mat(0, 0);
            }

            const float inv_pivot = 1.0f / pivot;
            const int last_row = std::min(n, i + half_band + 1);
            const int last_col = std::min(n, i + half_band + 1);

            for (int j = i + 1; j < last_row; ++j)
            {
                float* row_j = A.data + j * A.step;
                if (fabsf(row_j[i]) < pivot_tol)
                {
                    row_j[i] = 0.0f;
                    continue;
                }

                const float factor = row_j[i] * inv_pivot;
                row_j[i] = 0.0f; // eliminated exactly by construction

                for (int col_idx = i + 1; col_idx < last_col; ++col_idx)
                {
                    row_j[col_idx] -= row_i[col_idx] * factor;
                }
                b.data[j * b.step] -= b.data[i * b.step] * factor;
            }
        }

        // ---------------------------------------------------------------------
        // 3. Back-substitution within the upper band
        // ---------------------------------------------------------------------
        Mat x(n, 1);
        if (x.data == nullptr)
        {
            std::cerr << "[Error] band_solve: failed to allocate solution vector.\n";
            return Mat(0, 0);
        }

        for (int i = n - 1; i >= 0; --i)
        {
            const float* row_i = A.data + i * A.step;
            const float pivot = row_i[i];
            if (fabsf(pivot) < pivot_tol)
            {
                std::cerr << "[Error] band_solve: zero pivot at (" << i << ", " << i
                          << ") during back-substitution.\n";
                return Mat(0, 0);
            }

            float sum = 0.0f;
            const int max_j = std::min(n, i + half_band + 1);
            for (int j = i + 1; j < max_j; ++j)
            {
                sum += row_i[j] * x.data[j * x.step];
            }

            x.data[i * x.step] = (b.data[i * b.step] - sum) / pivot;
        }

        return x;
    }

    /**
     * @name Mat::roots(Mat A, Mat y)
     * @brief Solve the linear system A * x = y.
     *
     * @details
     *   roots() is kept as a compatibility wrapper around solve(). Older code
     *   had a second, independent elimination implementation here; delegating
     *   to solve() avoids duplicated numerical logic and gives roots() the same
     *   partial-pivoting behavior.
     *
     * @note Time complexity: O(n^3), same as solve().
     *
     * @param A Coefficient matrix (N×N, must be square)
     * @param y Result vector (N×1)
     * @return Mat Solution vector (N×1) containing x such that Ax = y, or Mat(0, 0) on error.
     */
    Mat Mat::roots(Mat A, Mat y) const
    {
        return solve(A, y);
    }

    // ============================================================================
    // Matrix Decomposition
    // ============================================================================
    /**
     * @name Mat::LUDecomposition::LUDecomposition()
     * @brief Default constructor for LUDecomposition structure
     */
    Mat::LUDecomposition::LUDecomposition()
        : L(0, 0), U(0, 0), P(0, 0), pivoted(false), status(TINY_OK)
    {
    }

    /**
     * @name Mat::CholeskyDecomposition::CholeskyDecomposition()
     * @brief Default constructor for CholeskyDecomposition structure
     */
    Mat::CholeskyDecomposition::CholeskyDecomposition()
        : L(0, 0), status(TINY_OK)
    {
    }

    /**
     * @name Mat::QRDecomposition::QRDecomposition()
     * @brief Default constructor for QRDecomposition structure
     */
    Mat::QRDecomposition::QRDecomposition()
        : Q(0, 0), R(0, 0), status(TINY_OK)
    {
    }

    /**
     * @name Mat::SVDDecomposition::SVDDecomposition()
     * @brief Default constructor for SVDDecomposition structure
     */
    Mat::SVDDecomposition::SVDDecomposition()
        : U(0, 0), S(0, 0), V(0, 0), rank(0), iterations(0), status(TINY_OK)
    {
    }

    /**
     * @name Mat::is_positive_definite()
     * @brief Check whether a symmetric matrix is positive definite.
     *
     * @details
     *   A symmetric matrix A is positive definite iff its Cholesky decomposition
     *   A = L * L^T exists with strictly positive diagonal entries on L. This is
     *   equivalent to Sylvester's criterion (every leading principal minor is
     *   positive) but **much cheaper to evaluate**: a partial Cholesky factor
     *   built up to depth k passes if and only if the first k leading principal
     *   minors are positive.
     *
     *   Algorithm (partial Cholesky):
     *       for i = 0 .. depth-1:
     *           sum = sum_{k<i} L(i,k)^2
     *           diag = A(i,i) - sum                  // == det of (i+1)x(i+1) leading minor
     *                                                 //    divided by det of i x i leading minor
     *           if diag <= tolerance: not PD, return false
     *           L(i,i) = sqrt(diag)
     *           for j = i+1 .. depth-1:
     *               L(j,i) = (A(j,i) - sum_{k<i} L(j,k)*L(i,k)) / L(i,i)
     *
     *   Complexity: O(depth^3 / 3), versus the previous O(n^4) Sylvester+Laplace
     *   path. Storage: depth^2 floats (one scratch lower-triangular L).
     *
     * @param tolerance Lower bound for each Cholesky pivot diag (must be >= 0).
     *                  A pivot <= tolerance is treated as a failure (i.e. A is
     *                  not strictly PD at that level).
     * @param max_minors_to_check Depth of the leading principal block to test.
     *                            - If < 0: test the whole matrix (depth = n).
     *                            - If == 0: invalid; returns false with an error.
     *                            - If > 0: clamped to [1, n]; only the first
     *                              `max_minors_to_check` leading principal minors
     *                              are tested (early-exit semantics).
     *
     * @return true if every tested leading principal minor is positive.
     */
    bool Mat::is_positive_definite(float tolerance, int max_minors_to_check) const
    {
        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] is_positive_definite: matrix data pointer is null.\n";
            return false;
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] is_positive_definite: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            return false;
        }
        if (tolerance < 0.0f)
        {
            std::cerr << "[Error] is_positive_definite: tolerance must be >= 0 (got "
                      << tolerance << ").\n";
            return false;
        }
        if (this->row != this->col)
        {
            return false;
        }
        if (!this->is_symmetric(tolerance))
        {
            return false;
        }

        const int n = this->row;
        if (n == 0)
        {
            return true; // empty matrix trivially PD
        }

        // ---------------------------------------------------------------------
        // 2. Resolve check depth
        // ---------------------------------------------------------------------
        int depth;
        if (max_minors_to_check < 0)
        {
            depth = n;
        }
        else if (max_minors_to_check == 0)
        {
            std::cerr << "[Error] is_positive_definite: max_minors_to_check must be > 0 or -1 (got 0).\n";
            return false;
        }
        else
        {
            depth = (max_minors_to_check > n) ? n : max_minors_to_check;
        }

        // ---------------------------------------------------------------------
        // 3. Partial Cholesky to depth
        //    L is a (depth x depth) lower-triangular working buffer. We store
        //    only the lower triangle, which is the same memory layout the
        //    Cholesky-factor convention uses.
        // ---------------------------------------------------------------------
        Mat L(depth, depth);
        if (L.data == nullptr)
        {
            std::cerr << "[Error] is_positive_definite: failed to allocate scratch L.\n";
            return false;
        }

        const int Lstep = L.step;
        const int Astep = this->step;

        for (int i = 0; i < depth; ++i)
        {
            float* row_Li = L.data + i * Lstep;
            const float* row_Ai = this->data + i * Astep;

            // Diagonal: L(i,i) = sqrt(A(i,i) - sum_{k<i} L(i,k)^2)
            float diag_sum = 0.0f;
            for (int k = 0; k < i; ++k)
            {
                diag_sum += row_Li[k] * row_Li[k];
            }
            const float diag = row_Ai[i] - diag_sum;
            if (diag <= tolerance || std::isnan(diag) || std::isinf(diag))
            {
                return false; // i-th leading principal minor not PD
            }
            const float pivot = sqrtf(diag);
            row_Li[i] = pivot;
            const float inv_pivot = 1.0f / pivot;

            // Below-diagonal column i: L(j,i) = (A(j,i) - <Lj,Li>) / L(i,i)
            for (int j = i + 1; j < depth; ++j)
            {
                float* row_Lj = L.data + j * Lstep;
                const float* row_Aj = this->data + j * Astep;

                float off_sum = 0.0f;
                for (int k = 0; k < i; ++k)
                {
                    off_sum += row_Lj[k] * row_Li[k];
                }
                row_Lj[i] = (row_Aj[i] - off_sum) * inv_pivot;
            }
        }

        return true;
    }

    /**
     * @name Mat::lu_decompose()
     * @brief Compute LU decomposition: A = L * U (with optional pivoting).
     * @note LU decomposition factors a matrix A into:
     *       - L: Lower triangular matrix (with unit diagonal)
     *       - U: Upper triangular matrix
     *       - P: Permutation matrix (if pivoting used)
     * @note With pivoting: P * A = L * U
     *       Without pivoting: A = L * U
     * @note Algorithm: Modified Gaussian elimination that stores multipliers in L.
     *       Uses partial pivoting for numerical stability.
     * @note Time complexity: O(n³) - efficient for large matrices.
     *       More efficient than Gaussian elimination when solving multiple systems
     *       with the same coefficient matrix.
     * 
     * @param use_pivoting Whether to use partial pivoting (default: true).
     *                     Pivoting improves numerical stability but requires P matrix.
     * @return LUDecomposition containing L, U, P matrices and status
     */
    Mat::LUDecomposition Mat::lu_decompose(bool use_pivoting) const
    {
        LUDecomposition result;

        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] lu_decompose: matrix data pointer is null.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] lu_decompose: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (this->row != this->col)
        {
            std::cerr << "[Error] lu_decompose: requires a square matrix (got "
                      << this->row << "x" << this->col << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        const int n = this->row;
        result.pivoted = use_pivoting;

        if (n == 0)
        {
            result.L = Mat(0, 0);
            result.U = Mat(0, 0);
            if (use_pivoting) result.P = Mat(0, 0);
            result.status = TINY_OK;
            return result;
        }

        // ---------------------------------------------------------------------
        // 2. Allocate working / output buffers
        //    A : working copy that ends as packed L|U after the loop
        //    L : initialized to I, fills lower-triangular multipliers
        //    U : initialized to 0, fills upper-triangular at the end
        //    P : (optional) permutation matrix, init to I
        // ---------------------------------------------------------------------
        Mat A(*this);
        if (A.data == nullptr)
        {
            std::cerr << "[Error] lu_decompose: failed to allocate working copy.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        result.L = Mat::eye(n);
        result.U = Mat(n, n);
        if (result.L.data == nullptr || result.U.data == nullptr)
        {
            std::cerr << "[Error] lu_decompose: failed to allocate L or U.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (use_pivoting)
        {
            result.P = Mat::eye(n);
            if (result.P.data == nullptr)
            {
                std::cerr << "[Error] lu_decompose: failed to allocate P.\n";
                result.status = TINY_ERR_MATH_NULL_POINTER;
                return result;
            }
        }

        const int Astep = A.step;
        const int Lstep = result.L.step;
        float* const Ldata = result.L.data;
        float* const Adata = A.data;

        // ---------------------------------------------------------------------
        // 3. LU with optional partial pivoting (Doolittle: unit-diagonal L)
        // ---------------------------------------------------------------------
        for (int k = 0; k < n; ++k)
        {
            if (use_pivoting)
            {
                // 3.1 Find row with the largest |A(i, k)| for i in [k, n)
                int   max_row = k;
                float max_val = fabsf(Adata[k * Astep + k]);
                for (int i = k + 1; i < n; ++i)
                {
                    const float abs_val = fabsf(Adata[i * Astep + k]);
                    if (abs_val > max_val)
                    {
                        max_val = abs_val;
                        max_row = i;
                    }
                }

                // 3.2 Swap rows in A, in P, and in the already-written part of L
                if (max_row != k)
                {
                    A.swap_rows(k, max_row);
                    result.P.swap_rows(k, max_row);
                    float* row_Lk = Ldata + k       * Lstep;
                    float* row_Lm = Ldata + max_row * Lstep;
                    for (int j = 0; j < k; ++j)
                    {
                        const float t = row_Lk[j];
                        row_Lk[j] = row_Lm[j];
                        row_Lm[j] = t;
                    }
                }
            }

            float* row_Ak = Adata + k * Astep;

            // 3.3 Singularity guard
            const float pivot = row_Ak[k];
            if (fabsf(pivot) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
            {
                std::cerr << "[Error] lu_decompose: matrix is singular or near-singular at column "
                          << k << " (pivot = " << pivot << ").\n";
                result.status = TINY_ERR_MATH_INVALID_PARAM;
                return result;
            }
            const float inv_pivot = 1.0f / pivot;

            // 3.4 Copy U row k from A
            float* row_Uk = result.U.data + k * result.U.step;
            for (int j = k; j < n; ++j)
            {
                row_Uk[j] = row_Ak[j];
            }

            // 3.5 Eliminate below the pivot, store multipliers in L(i, k)
            for (int i = k + 1; i < n; ++i)
            {
                float* row_Ai = Adata + i * Astep;
                const float multiplier = row_Ai[k] * inv_pivot;
                Ldata[i * Lstep + k] = multiplier;
                if (multiplier == 0.0f) continue; // already eliminated
                for (int j = k + 1; j < n; ++j)
                {
                    row_Ai[j] -= multiplier * row_Ak[j];
                }
            }
        }

        result.status = TINY_OK;
        return result;
    }

    /**
     * @name Mat::cholesky_decompose()
     * @brief Compute Cholesky decomposition: A = L * L^T (for symmetric positive definite matrices).
     * @note Cholesky decomposition factors a symmetric positive definite matrix A into:
     *       A = L * L^T
     *       where L is a lower triangular matrix with positive diagonal elements.
     * @note Algorithm: For each row i and column j (j <= i):
     *       - Diagonal (j == i): L[i][i] = sqrt(A[i][i] - sum(L[i][k]^2))
     *       - Off-diagonal (j < i): L[i][j] = (A[i][j] - sum(L[i][k]*L[j][k])) / L[j][j]
     * @note Requirements:
     *       - Matrix must be square
     *       - Matrix must be symmetric: A^T = A
     *       - Matrix must be positive definite: all eigenvalues > 0
     * @note Advantages over LU decomposition:
     *       - Faster: O(n³/3) vs O(n³) for LU
     *       - More stable: no pivoting needed
     *       - Uses less memory: only stores L (not L and U)
     * @note Applications: Structural dynamics, optimization, Kalman filtering, Monte Carlo simulation
     * @note Time complexity: O(n³/3) - about 2x faster than LU decomposition
     * 
     * @return CholeskyDecomposition containing L matrix and status
     */
    Mat::CholeskyDecomposition Mat::cholesky_decompose() const
    {
        CholeskyDecomposition result;

        // Check for null pointer
        if (this->data == nullptr)
        {
            std::cerr << "[Error] cholesky_decompose: matrix data pointer is null\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }

        // Validate matrix dimensions
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] cholesky_decompose: invalid matrix dimensions: rows=" 
                      << this->row << ", cols=" << this->col << "\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        // Validation: must be square matrix
        if (this->row != this->col)
        {
            std::cerr << "[Error] cholesky_decompose: requires square matrix (got " 
                      << this->row << "x" << this->col << ")\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        int n = this->row;

        // Handle empty matrix
        if (n == 0)
        {
            // Empty matrix: L is also empty
            result.L = Mat(0, 0);
            result.status = TINY_OK;
            return result;
        }

        // Check if symmetric (use standard tolerance)
        if (!this->is_symmetric(TINY_MATH_MIN_POSITIVE_INPUT_F32))
        {
            std::cerr << "[Error] cholesky_decompose: requires symmetric matrix\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        result.L = Mat(n, n);
        if (result.L.data == nullptr)
        {
            std::cerr << "[Error] cholesky_decompose: failed to allocate L matrix.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }

        // ---------------------------------------------------------------------
        // Cholesky factorization (column-by-column, packed lower triangular).
        //
        //   For column j running 0 .. n-1:
        //       diag = A(j,j) - <L(j, 0:j), L(j, 0:j)>
        //       if diag <= eps  -> A is not positive definite
        //       L(j,j)  = sqrt(diag)
        //       inv_d  = 1 / L(j,j)
        //       for i in j+1 .. n-1:
        //           L(i,j) = (A(i,j) - <L(i, 0:j), L(j, 0:j)>) * inv_d
        //
        // Strict lower triangle is the only part written; the upper triangle
        // stays zero from Mat(n, n).  Hot inner loops use raw row pointers.
        // ---------------------------------------------------------------------
        const int Lstep = result.L.step;
        const int Astep = this->step;
        float* const Ldata = result.L.data;
        const float* const Adata = this->data;

        for (int j = 0; j < n; ++j)
        {
            float* row_Lj = Ldata + j * Lstep;
            const float* row_Aj = Adata + j * Astep;

            // Diagonal pivot
            float diag_sum = 0.0f;
            for (int k = 0; k < j; ++k)
            {
                diag_sum += row_Lj[k] * row_Lj[k];
            }
            const float diag = row_Aj[j] - diag_sum;
            if (diag <= TINY_MATH_MIN_POSITIVE_INPUT_F32)
            {
                std::cerr << "[Error] cholesky_decompose: matrix is not positive definite "
                          << "(diagonal residual " << diag << " at position ["
                          << j << "][" << j << "]).\n";
                result.status = TINY_ERR_MATH_INVALID_PARAM;
                return result;
            }
            const float pivot = sqrtf(diag);
            row_Lj[j] = pivot;
            const float inv_pivot = 1.0f / pivot;

            // Below-diagonal entries in column j
            for (int i = j + 1; i < n; ++i)
            {
                float* row_Li = Ldata + i * Lstep;
                const float* row_Ai = Adata + i * Astep;

                float off_sum = 0.0f;
                for (int k = 0; k < j; ++k)
                {
                    off_sum += row_Li[k] * row_Lj[k];
                }
                row_Li[j] = (row_Ai[j] - off_sum) * inv_pivot;
            }
        }

        result.status = TINY_OK;
        return result;
    }

    /**
     * @name Mat::qr_decompose()
     * @brief Compute QR decomposition: A = Q * R (Q orthogonal, R upper triangular).
     * @note QR decomposition factors a matrix A into:
     *       A = Q * R
     *       where:
     *       - Q: Orthogonal matrix (Q^T * Q = I, columns are orthonormal)
     *       - R: Upper triangular matrix
     * @note Algorithm: Uses Modified Gram-Schmidt process for orthogonalization.
     *       The Gram-Schmidt process orthogonalizes the columns of A to form Q,
     *       and stores the projection coefficients in R.
     * @note Mathematical relationship:
     *       - Q is m×min(m,n) matrix with orthonormal columns
     *       - R is min(m,n)×n upper triangular matrix
     *       - For m >= n: A = Q * R (full QR)
     *       - For m < n: A = Q * R (reduced QR, Q is m×m, R is m×n)
     * @note Applications:
     *       - Least squares problems: min ||A*x - b|| → solve R*x = Q^T*b
     *       - Solving overdetermined systems
     *       - Eigenvalue computation (QR algorithm)
     *       - Matrix rank estimation
     * @note Time complexity: O(m*n²) - efficient for tall matrices (m >> n)
     * @note Numerical stability: Modified Gram-Schmidt is more stable than classical GS
     * 
     * @return QRDecomposition containing Q and R matrices and status
     */
    Mat::QRDecomposition Mat::qr_decompose() const
    {
        QRDecomposition result;

        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] qr_decompose: matrix data pointer is null.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] qr_decompose: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        const int m = this->row;
        const int n = this->col;
        const int min_dim = (m < n) ? m : n;

        if (m == 0 || n == 0)
        {
            result.Q = Mat(0, 0);
            result.R = Mat(0, 0);
            result.status = TINY_OK;
            return result;
        }

        // ---------------------------------------------------------------------
        // 2. Run Modified Gram-Schmidt (reuses the optimized helper)
        //    R_coeff is already a fully populated n x n upper-triangular block:
        //      - For k <= j: R_coeff(k, j) is the projection r_kj computed by MGS
        //      - For k >  j: zero (kept by gram_schmidt_orthogonalize)
        //    Therefore there is no need to recompute Q^T * A here.
        // ---------------------------------------------------------------------
        Mat Q_ortho, R_coeff;
        if (!Mat::gram_schmidt_orthogonalize(*this, Q_ortho, R_coeff, TINY_MATH_MIN_POSITIVE_INPUT_F32))
        {
            std::cerr << "[Error] qr_decompose: gram_schmidt_orthogonalize failed.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (Q_ortho.data == nullptr || R_coeff.data == nullptr)
        {
            std::cerr << "[Error] qr_decompose: gram_schmidt_orthogonalize returned null buffers.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (Q_ortho.row != m || Q_ortho.col != n ||
            R_coeff.row != n || R_coeff.col != n)
        {
            std::cerr << "[Error] qr_decompose: unexpected dimensions from gram_schmidt_orthogonalize.\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        // ---------------------------------------------------------------------
        // 3. Materialize Q and R in the QR convention
        //    R is sized (m x n) so the bottom (m - min_dim) rows are zero.
        //    The top min_dim rows are exactly R_coeff's upper-triangular block,
        //    plus, for wide matrices (m < n), the extra columns j >= m which
        //    MGS already filled (r_kj = q_k^T * a_j for k < m).
        // ---------------------------------------------------------------------
        result.Q = Q_ortho;
        result.R = Mat(m, n);
        if (result.R.data == nullptr)
        {
            std::cerr << "[Error] qr_decompose: failed to allocate R matrix.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }

        const int Rstep = result.R.step;
        const int Cstep = R_coeff.step;
        for (int j = 0; j < n; ++j)
        {
            const int k_end = (j + 1 < min_dim) ? (j + 1) : min_dim;
            float* col_R = result.R.data + j;
            const float* col_C = R_coeff.data + j;
            for (int k = 0; k < k_end; ++k)
            {
                col_R[k * Rstep] = col_C[k * Cstep];
            }
            // remaining rows in this column stay zero (Mat ctor zero-inits)
        }

        result.status = TINY_OK;
        return result;
    }

    /**
     * @name Mat::svd_decompose()
     * @brief Compute the (thin/economy) Singular Value Decomposition: A = U * S * V^T.
     *
     * Output dimensions:
     *   - U: m x min(m,n)  — orthonormal columns (left singular vectors)
     *   - S: min(m,n) x 1  — singular values, sorted in *descending* order
     *   - V: n x n         — orthogonal matrix; first `rank` columns are the
     *                        right singular vectors corresponding to the non-zero
     *                        singular values (sorted), remaining columns form an
     *                        orthonormal basis of the null space of A.
     *
     * Algorithm (normal-equations / Jacobi):
     *   1. Form M = A^T A  (n x n, symmetric positive-semidefinite)
     *   2. eig(M) = V diag(lambda) V^T  via Jacobi
     *   3. sigma_i = sqrt(lambda_i)     (lambda_i is clamped at 0 to absorb tiny
     *                                    negative roundoff)
     *   4. Sort (sigma_i, V(:, i)) by sigma_i descending
     *   5. U(:, i) = A * V(:, i) / sigma_i, for i with sigma_i > tolerance
     *      (Note: orthogonality of U columns relies on Jacobi's accuracy on
     *       symmetric matrices; small floating-point drift may exist when
     *       several sigma_i are nearly equal. If strict U^T U == I is needed,
     *       apply gram_schmidt_orthogonalize() on the result externally.)
     *
     * Numerical caveat:
     *   Forming A^T A squares the condition number of A, so singular values
     *   smaller than ~sqrt(eps)*sigma_max are unreliable. For accuracy near the
     *   rank cliff, prefer a bidiagonalization-based SVD (not implemented here).
     *
     * Tolerance semantics:
     *   `tolerance` is used both as the Jacobi off-diagonal convergence threshold
     *   and as the *singular-value* zero threshold (sigma <= tolerance is treated
     *   as zero). It is therefore in the *same scale as the entries of A*. This
     *   matches the convention used by `pseudo_inverse(svd, tolerance)`.
     *
     * Properties guaranteed:
     *   - sigma_i >= 0, sorted descending
     *   - rank   = number of sigma_i strictly greater than tolerance
     *   - U columns 0..rank-1 are orthonormal; columns rank..min_dim-1 are zero
     *   - V is fully populated (n columns) and is orthogonal up to floating-point
     *     drift inherited from Jacobi.
     *
     * Time complexity: O(m*n^2 + n^3) — dominated by Jacobi on the n x n matrix.
     *
     * @param max_iter   Maximum Jacobi iterations (must be > 0).
     * @param tolerance  Convergence threshold (Jacobi) and singular-value zero
     *                   threshold (must be >= 0).
     * @return SVDDecomposition with U, S, V, rank, iterations, status.
     */
    Mat::SVDDecomposition Mat::svd_decompose(int max_iter, float tolerance) const
    {
        SVDDecomposition result;

        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] svd_decompose: matrix data pointer is null.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] svd_decompose: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (max_iter <= 0)
        {
            std::cerr << "[Error] svd_decompose: max_iter must be > 0 (got " << max_iter << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (tolerance < 0.0f)
        {
            std::cerr << "[Error] svd_decompose: tolerance must be >= 0 (got " << tolerance << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        const int m = this->row;
        const int n = this->col;
        const int min_dim = (m < n) ? m : n;
        // Note: m == 0 || n == 0 cases are already rejected by the dimension
        // check above (this->row <= 0 || this->col <= 0).

        // ---------------------------------------------------------------------
        // 2. Form M = A^T * A (n x n, symmetric PSD).
        //    Compute the upper triangle, then mirror, so that M is *exactly*
        //    symmetric (Jacobi assumes symmetry).
        // ---------------------------------------------------------------------
        Mat AtA(n, n);
        if (AtA.data == nullptr)
        {
            std::cerr << "[Error] svd_decompose: failed to allocate A^T*A.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }

        const int Astep = this->step;
        const float* const Adata = this->data;
        const int AtAstep = AtA.step;

        for (int i = 0; i < n; ++i)
        {
            float* row_AtA_i = AtA.data + i * AtAstep;
            for (int j = i; j < n; ++j)
            {
                float s = 0.0f;
                for (int k = 0; k < m; ++k)
                {
                    const float* row_Ak = Adata + k * Astep;
                    s += row_Ak[i] * row_Ak[j];
                }
                row_AtA_i[j] = s;
                if (j != i)
                {
                    AtA.data[j * AtAstep + i] = s;
                }
            }
        }

        // ---------------------------------------------------------------------
        // 3. Eigendecomposition of A^T A.
        //    eigenvectors are columns of `eig.eigenvectors`; eigenvalues are
        //    delivered in *unsorted* (natural diagonal) order.
        // ---------------------------------------------------------------------
        Mat::EigenDecomposition eig = AtA.eigendecompose_jacobi(tolerance, max_iter);
        if (eig.status != TINY_OK)
        {
            std::cerr << "[Error] svd_decompose: eigendecomposition failed with status "
                      << eig.status << ".\n";
            result.status = eig.status;
            return result;
        }
        if (eig.eigenvalues.data == nullptr || eig.eigenvectors.data == nullptr)
        {
            std::cerr << "[Error] svd_decompose: eigendecomposition returned null buffers.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (eig.eigenvalues.row != n || eig.eigenvalues.col != 1 ||
            eig.eigenvectors.row != n || eig.eigenvectors.col != n)
        {
            std::cerr << "[Error] svd_decompose: unexpected dimensions from eigendecomposition.\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        // ---------------------------------------------------------------------
        // 4. Build a permutation that sorts eigenvalues in descending order.
        //    Negative roundoff on PSD spectra is clamped to 0 here so that the
        //    sqrt is always well defined.
        // ---------------------------------------------------------------------
        const int evalStep = eig.eigenvalues.step;
        const int EVstep   = eig.eigenvectors.step;

        std::vector<float> lambda(n);
        std::vector<int>   perm(n);
        for (int i = 0; i < n; ++i)
        {
            float lam = eig.eigenvalues.data[i * evalStep];
            if (std::isnan(lam) || lam < 0.0f) lam = 0.0f;   // clamp PSD roundoff
            lambda[i] = lam;
            perm[i] = i;
        }
        // Insertion sort by lambda descending — n is typically small.
        for (int i = 1; i < n; ++i)
        {
            const int   key_idx = perm[i];
            const float key_lam = lambda[i];
            int j = i - 1;
            while (j >= 0 && lambda[j] < key_lam)
            {
                lambda[j + 1] = lambda[j];
                perm[j + 1]   = perm[j];
                --j;
            }
            lambda[j + 1] = key_lam;
            perm[j + 1]   = key_idx;
        }

        // ---------------------------------------------------------------------
        // 5. Allocate outputs and populate V (full n x n) and S (min_dim x 1).
        //    V's columns are *all* eigenvectors of A^T A in sorted order, so V
        //    is an orthogonal matrix (the trailing columns span the null space
        //    of A and have sigma_i = 0).
        //    rank counts only sigma_i > tolerance.
        // ---------------------------------------------------------------------
        result.S = Mat(min_dim, 1);
        result.V = Mat(n, n);
        result.U = Mat(m, min_dim);
        if (result.S.data == nullptr || result.V.data == nullptr || result.U.data == nullptr)
        {
            std::cerr << "[Error] svd_decompose: failed to allocate U/S/V.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }

        const int Vstep = result.V.step;
        const int Sstep = result.S.step;
        const int Ustep = result.U.step;

        // Populate every column of V (n columns total) using the sort permutation.
        for (int col = 0; col < n; ++col)
        {
            const int src = perm[col];
            for (int row_idx = 0; row_idx < n; ++row_idx)
            {
                result.V.data[row_idx * Vstep + col] =
                    eig.eigenvectors.data[row_idx * EVstep + src];
            }
        }

        // Fill S; count rank as sigma_i > tolerance (NOT >= — treat == tolerance
        // as numerically zero, matching pseudo_inverse).
        int rank = 0;
        for (int i = 0; i < min_dim; ++i)
        {
            const float sigma = sqrtf(lambda[i]); // lambda already clamped >= 0
            result.S.data[i * Sstep] = sigma;
            if (sigma > tolerance) ++rank;
        }
        result.rank = rank;

        // ---------------------------------------------------------------------
        // 6. Recover U from A * V = U * S, column by column:
        //        U(:, i) = (A * V(:, i)) / sigma_i,   for sigma_i > tolerance
        //    Columns rank..min_dim-1 are left at 0 (zero-initialized).
        //
        //    In exact arithmetic ||A * V_i|| == sigma_i, so U(:, i) is unit
        //    norm. Across i, U columns are orthogonal because V columns are
        //    (Jacobi guarantees orthonormal eigenvectors of A^T A). Floating-
        //    point drift may slightly violate U^T U == I; downstream consumers
        //    that need strict orthogonality should re-orthogonalize on the
        //    result.
        // ---------------------------------------------------------------------
        for (int i = 0; i < rank; ++i)
        {
            const float sigma = result.S.data[i * Sstep];
            const float inv_sigma = 1.0f / sigma;
            for (int j = 0; j < m; ++j)
            {
                const float* row_Aj = Adata + j * Astep;
                float s = 0.0f;
                for (int k = 0; k < n; ++k)
                {
                    s += row_Aj[k] * result.V.data[k * Vstep + i];
                }
                result.U.data[j * Ustep + i] = s * inv_sigma;
            }
        }

        result.iterations = eig.iterations;
        result.status = TINY_OK;
        return result;
    }

    /**
     * @name Mat::solve_lu()
     * @brief Solve linear system using LU decomposition (more efficient for multiple RHS).
     * @note Solves A * x = b using precomputed LU decomposition.
     *       Algorithm: P * A = L * U, so A * x = b becomes:
     *       1. Apply permutation: P * A * x = P * b → (L * U) * x = P * b
     *       2. Forward substitution: L * y = P * b → solve for y
     *       3. Backward substitution: U * x = y → solve for x
     * @note Advantages over direct solve():
     *       - More efficient when solving multiple systems with same A
     *       - LU decomposition computed once, reused for different b vectors
     *       - Time complexity: O(n²) vs O(n³) for direct solve
     * @note Requirements:
     *       - LU decomposition must be valid (status == TINY_OK)
     *       - Matrix A must be square and non-singular
     *       - Vector b must have same dimension as A
     * 
     * @param lu LU decomposition of coefficient matrix A
     * @param b Right-hand side vector (must be column vector)
     * @return Solution vector x such that A * x = b, or empty matrix on error
     */
    Mat Mat::solve_lu(const LUDecomposition &lu, const Mat &b)
    {
        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (lu.status != TINY_OK)
        {
            std::cerr << "[Error] solve_lu: invalid LU decomposition (status="
                      << lu.status << ").\n";
            return Mat(0, 0);
        }
        if (lu.L.data == nullptr || lu.U.data == nullptr)
        {
            std::cerr << "[Error] solve_lu: LU has null L or U buffer.\n";
            return Mat(0, 0);
        }
        if (b.data == nullptr)
        {
            std::cerr << "[Error] solve_lu: right-hand side b is null.\n";
            return Mat(0, 0);
        }
        if (lu.L.row != lu.L.col || lu.U.row != lu.U.col || lu.L.row != lu.U.row)
        {
            std::cerr << "[Error] solve_lu: L and U must be square and same-sized "
                      << "(got L=" << lu.L.row << "x" << lu.L.col
                      << ", U=" << lu.U.row << "x" << lu.U.col << ").\n";
            return Mat(0, 0);
        }

        const int n = lu.L.row;

        // Legitimate empty system: 0 equations -> trivial 0x1 solution.
        if (n == 0)
        {
            return Mat(0, 1);
        }

        if (b.row != n || b.col != 1)
        {
            std::cerr << "[Error] solve_lu: b must be " << n << "x1 (got "
                      << b.row << "x" << b.col << ").\n";
            return Mat(0, 0);
        }
        if (lu.pivoted && (lu.P.data == nullptr || lu.P.row != n || lu.P.col != n))
        {
            std::cerr << "[Error] solve_lu: pivoting enabled but P is invalid "
                      << "(got " << lu.P.row << "x" << lu.P.col << ").\n";
            return Mat(0, 0);
        }

        // ---------------------------------------------------------------------
        // 2. Build b_perm = P * b (or just a copy of b if no pivoting)
        //    P has exactly one 1.0 per row; we scan each row to find it.
        //    This is O(n^2) but only runs once and avoids mutating b.
        // ---------------------------------------------------------------------
        Mat b_perm(n, 1);
        if (b_perm.data == nullptr)
        {
            std::cerr << "[Error] solve_lu: failed to allocate permuted RHS.\n";
            return Mat(0, 0);
        }
        const int b_step = b.step;
        const int bp_step = b_perm.step;
        if (lu.pivoted)
        {
            const int P_step = lu.P.step;
            for (int i = 0; i < n; ++i)
            {
                const float* row_P = lu.P.data + i * P_step;
                int src = -1;
                for (int j = 0; j < n; ++j)
                {
                    if (fabsf(row_P[j] - 1.0f) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
                    {
                        src = j;
                        break;
                    }
                }
                b_perm.data[i * bp_step] =
                    (src >= 0) ? b.data[src * b_step] : 0.0f;
            }
        }
        else
        {
            for (int i = 0; i < n; ++i)
            {
                b_perm.data[i * bp_step] = b.data[i * b_step];
            }
        }

        // ---------------------------------------------------------------------
        // 3. Forward substitution: L * y = b_perm
        //    L is unit-diagonal (Doolittle), so no divisions are needed.
        //    y is stored in-place by reusing b_perm: y(i) = b_perm(i) - sum_{j<i} L(i,j) y(j)
        // ---------------------------------------------------------------------
        const int L_step = lu.L.step;
        for (int i = 0; i < n; ++i)
        {
            const float* row_L = lu.L.data + i * L_step;
            float s = b_perm.data[i * bp_step];
            for (int j = 0; j < i; ++j)
            {
                s -= row_L[j] * b_perm.data[j * bp_step];
            }
            b_perm.data[i * bp_step] = s; // == y(i)
        }

        // ---------------------------------------------------------------------
        // 4. Backward substitution: U * x = y
        // ---------------------------------------------------------------------
        Mat x(n, 1);
        if (x.data == nullptr)
        {
            std::cerr << "[Error] solve_lu: failed to allocate solution vector.\n";
            return Mat(0, 0);
        }
        const int U_step = lu.U.step;
        const int x_step = x.step;
        for (int i = n - 1; i >= 0; --i)
        {
            const float* row_U = lu.U.data + i * U_step;
            const float pivot = row_U[i];
            if (fabsf(pivot) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
            {
                std::cerr << "[Error] solve_lu: singular U at index " << i
                          << " (pivot=" << pivot << ").\n";
                return Mat(0, 0);
            }
            float s = b_perm.data[i * bp_step]; // y(i)
            for (int j = i + 1; j < n; ++j)
            {
                s -= row_U[j] * x.data[j * x_step];
            }
            x.data[i * x_step] = s / pivot;
        }

        return x;
    }

    /**
     * @name Mat::solve_cholesky()
     * @brief Solve linear system using Cholesky decomposition (for SPD matrices).
     * @note Solves A * x = b using precomputed Cholesky decomposition.
     *       Algorithm: A = L * L^T, so A * x = b becomes:
     *       1. Forward substitution: L * y = b → solve for y
     *       2. Backward substitution: L^T * x = y → solve for x
     * @note Advantages over LU decomposition:
     *       - Faster: O(n²) vs O(n²) but with better numerical stability
     *       - More stable: no pivoting needed for SPD matrices
     *       - Uses less memory: only stores L (not L and U)
     *       - Guaranteed to work for positive definite matrices
     * @note Requirements:
     *       - Cholesky decomposition must be valid (status == TINY_OK)
     *       - Matrix A must be symmetric positive definite (SPD)
     *       - Vector b must have same dimension as A
     * @note Time complexity: O(n²) - efficient for SPD systems
     * 
     * @param chol Cholesky decomposition of coefficient matrix A (A = L * L^T)
     * @param b Right-hand side vector (must be column vector)
     * @return Solution vector x such that A * x = b, or empty matrix on error
     */
    Mat Mat::solve_cholesky(const CholeskyDecomposition &chol, const Mat &b)
    {
        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (chol.status != TINY_OK)
        {
            std::cerr << "[Error] solve_cholesky: invalid decomposition (status="
                      << chol.status << ").\n";
            return Mat(0, 0);
        }
        if (chol.L.data == nullptr)
        {
            std::cerr << "[Error] solve_cholesky: L is null.\n";
            return Mat(0, 0);
        }
        if (b.data == nullptr)
        {
            std::cerr << "[Error] solve_cholesky: right-hand side b is null.\n";
            return Mat(0, 0);
        }
        if (chol.L.row != chol.L.col)
        {
            std::cerr << "[Error] solve_cholesky: L must be square (got "
                      << chol.L.row << "x" << chol.L.col << ").\n";
            return Mat(0, 0);
        }

        const int n = chol.L.row;

        if (n == 0)
        {
            return Mat(0, 1);
        }
        if (b.row != n || b.col != 1)
        {
            std::cerr << "[Error] solve_cholesky: b must be " << n << "x1 (got "
                      << b.row << "x" << b.col << ").\n";
            return Mat(0, 0);
        }

        // ---------------------------------------------------------------------
        // 2. Forward substitution: L * y = b
        //    L lower-triangular with positive diagonal (guaranteed by
        //    cholesky_decompose). We allocate one (n x 1) buffer and reuse it
        //    in-place: it starts as b, becomes y, then becomes x.
        // ---------------------------------------------------------------------
        Mat x(n, 1);
        if (x.data == nullptr)
        {
            std::cerr << "[Error] solve_cholesky: failed to allocate solution.\n";
            return Mat(0, 0);
        }
        const int b_step = b.step;
        const int x_step = x.step;
        for (int i = 0; i < n; ++i)
        {
            x.data[i * x_step] = b.data[i * b_step];
        }

        const int L_step = chol.L.step;
        for (int i = 0; i < n; ++i)
        {
            const float* row_L = chol.L.data + i * L_step;
            const float pivot = row_L[i];
            if (fabsf(pivot) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
            {
                std::cerr << "[Error] solve_cholesky: zero/near-zero L diagonal at "
                          << i << " (= " << pivot << ").\n";
                return Mat(0, 0);
            }
            float s = x.data[i * x_step];
            for (int j = 0; j < i; ++j)
            {
                s -= row_L[j] * x.data[j * x_step];
            }
            x.data[i * x_step] = s / pivot; // == y(i)
        }

        // ---------------------------------------------------------------------
        // 3. Backward substitution: L^T * x = y
        //    L^T(i, j) = L(j, i), so we walk *column* i of L below the diagonal,
        //    which is non-contiguous. We use the row+stride access pattern.
        // ---------------------------------------------------------------------
        for (int i = n - 1; i >= 0; --i)
        {
            const float pivot = chol.L.data[i * L_step + i];
            float s = x.data[i * x_step]; // == y(i)
            for (int j = i + 1; j < n; ++j)
            {
                s -= chol.L.data[j * L_step + i] * x.data[j * x_step];
            }
            x.data[i * x_step] = s / pivot;
        }

        return x;
    }

    /**
     * @name Mat::solve_qr()
     * @brief Solve linear system using QR decomposition (least squares solution).
     * @note Solves A * x = b using precomputed QR decomposition.
     *       Algorithm: A = Q * R, so A * x = b becomes:
     *       1. Compute Q^T * b (project b onto column space of Q)
     *       2. Solve R * x = Q^T * b using backward substitution
     * @note This method provides least squares solution:
     *       - For overdetermined systems (m > n): minimizes ||A * x - b||
     *       - For determined systems (m = n): exact solution
     *       - For underdetermined systems (m < n): minimum norm solution
     * @note Advantages:
     *       - Numerically stable (no pivoting needed)
     *       - Works for rectangular matrices
     *       - Handles rank-deficient matrices gracefully
     * @note Requirements:
     *       - QR decomposition must be valid (status == TINY_OK)
     *       - Vector b must have same number of rows as Q
     * @note Time complexity: O(m*n) - efficient for least squares problems
     * 
     * @param qr QR decomposition of coefficient matrix A (A = Q * R)
     * @param b Right-hand side vector (must be column vector)
     * @return Least squares solution vector x such that ||A * x - b|| is minimized,
     *         or empty matrix on error
     */
    Mat Mat::solve_qr(const QRDecomposition &qr, const Mat &b)
    {
        // ---------------------------------------------------------------------
        // 1. Validate input
        // ---------------------------------------------------------------------
        if (qr.status != TINY_OK)
        {
            std::cerr << "[Error] solve_qr: invalid QR decomposition (status="
                      << qr.status << ").\n";
            return Mat(0, 0);
        }
        if (qr.Q.data == nullptr || qr.R.data == nullptr)
        {
            std::cerr << "[Error] solve_qr: Q or R is null.\n";
            return Mat(0, 0);
        }
        if (b.data == nullptr)
        {
            std::cerr << "[Error] solve_qr: right-hand side b is null.\n";
            return Mat(0, 0);
        }
        if (qr.Q.row <= 0 || qr.Q.col <= 0 || qr.R.row <= 0 || qr.R.col <= 0)
        {
            std::cerr << "[Error] solve_qr: invalid Q/R dimensions.\n";
            return Mat(0, 0);
        }

        const int m = qr.Q.row;
        const int n = qr.R.col;
        const int min_dim = (m < n) ? m : n;

        // Q is expected to be m x n and R is m x n (qr_decompose convention).
        // We only require enough capacity to address Q(:, 0..min_dim-1) and
        // R(0..min_dim-1, :), so check that conservatively.
        if (qr.Q.col < min_dim || qr.R.row < min_dim)
        {
            std::cerr << "[Error] solve_qr: Q/R shape inconsistent (Q="
                      << qr.Q.row << "x" << qr.Q.col
                      << ", R=" << qr.R.row << "x" << qr.R.col << ").\n";
            return Mat(0, 0);
        }

        // Legitimate empty system: 0 equations or 0 unknowns.
        if (m == 0 || n == 0)
        {
            return Mat(n, 1);
        }

        if (b.row != m || b.col != 1)
        {
            std::cerr << "[Error] solve_qr: b must be " << m << "x1 (got "
                      << b.row << "x" << b.col << ").\n";
            return Mat(0, 0);
        }

        // ---------------------------------------------------------------------
        // 2. Compute c = Q^T * b
        //    Q is m x n; only the first min_dim columns are guaranteed to be
        //    a valid orthonormal basis for col(A). The remaining components
        //    are 0 (left at the value set by the zero-initialized constructor).
        //
        //    Inner-product accumulation walks Q column-wise, which is the
        //    cache-unfriendly direction for row-major storage. We still hoist
        //    the row pointers to avoid re-multiplying the stride per element.
        // ---------------------------------------------------------------------
        Mat c(n, 1); // x will reuse this buffer after backward substitution
        if (c.data == nullptr)
        {
            std::cerr << "[Error] solve_qr: failed to allocate working vector.\n";
            return Mat(0, 0);
        }
        const int Q_step = qr.Q.step;
        const int b_step = b.step;
        const int c_step = c.step;
        for (int i = 0; i < min_dim; ++i)
        {
            float s = 0.0f;
            for (int j = 0; j < m; ++j)
            {
                s += qr.Q.data[j * Q_step + i] * b.data[j * b_step];
            }
            c.data[i * c_step] = s;
        }
        // c[min_dim .. n-1] left at 0 by Mat(n,1) zero-init.

        // ---------------------------------------------------------------------
        // 3. Backward substitution: R x = c
        //    R is upper-triangular (m >= n) or upper-trapezoidal (m < n).
        //    We only solve the first min_dim rows; for rank-deficient or
        //    under-determined cases (R(i,i) ~ 0), set x(i)=0 (minimum-norm-ish).
        //    Final x[min_dim..n-1] are 0 (zero-init), giving the basic least
        //    squares solution that lives in the span of the leading min_dim cols.
        // ---------------------------------------------------------------------
        Mat x(n, 1);
        if (x.data == nullptr)
        {
            std::cerr << "[Error] solve_qr: failed to allocate solution vector.\n";
            return Mat(0, 0);
        }
        const int R_step = qr.R.step;
        const int x_step = x.step;
        for (int i = min_dim - 1; i >= 0; --i)
        {
            const float* row_R = qr.R.data + i * R_step;
            const float pivot = row_R[i];
            if (fabsf(pivot) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
            {
                // Rank deficient: take 0 as the i-th unknown (already zero).
                continue;
            }
            float s = c.data[i * c_step];
            for (int j = i + 1; j < n; ++j)
            {
                s -= row_R[j] * x.data[j * x_step];
            }
            x.data[i * x_step] = s / pivot;
        }

        return x;
    }

    /**
     * @name Mat::pseudo_inverse()
     * @brief Compute pseudo-inverse using SVD: A^+ = V * S^+ * U^T.
     * @note Pseudo-inverse (Moore-Penrose inverse) is the generalization of matrix inverse
     *       for rectangular or singular matrices.
     * @note Algorithm: A = U * S * V^T, so A^+ = V * S^+ * U^T
     *       where S^+ is the pseudo-inverse of S (diagonal matrix):
     *       - If σ_i > tolerance: S^+[i][i] = 1/σ_i
     *       - If σ_i <= tolerance: S^+[i][i] = 0 (treat as zero)
     * @note Mathematical properties:
     *       - A * A^+ * A = A
     *       - A^+ * A * A^+ = A^+
     *       - (A * A^+)^T = A * A^+
     *       - (A^+ * A)^T = A^+ * A
     * @note Applications:
     *       - Solving least squares problems: x = A^+ * b
     *       - Solving underdetermined systems (minimum norm solution)
     *       - Solving overdetermined systems (least squares solution)
     *       - Rank-deficient matrix inversion
     * @note Time complexity: O(m*n*rank) - dominated by matrix multiplications
     * 
     * @param svd SVD decomposition of matrix A (A = U * S * V^T)
     * @param tolerance Tolerance for singular values (must be >= 0).
     *                  Singular values <= tolerance are treated as zero.
     * @return Pseudo-inverse matrix A^+ (n×m matrix), or empty matrix on error
     */
    Mat Mat::pseudo_inverse(const SVDDecomposition &svd, float tolerance)
    {
        if (svd.status != TINY_OK)
        {
            std::cerr << "[Error] pseudo_inverse: invalid SVD decomposition (status: "
                      << svd.status << ")\n";
            return Mat(0, 0);
        }
        if (svd.U.data == nullptr || svd.V.data == nullptr || svd.S.data == nullptr)
        {
            std::cerr << "[Error] pseudo_inverse: SVD decomposition matrices have null pointers.\n";
            return Mat(0, 0);
        }
        if (tolerance < 0.0f)
        {
            std::cerr << "[Error] pseudo_inverse: tolerance must be >= 0 (got "
                      << tolerance << ").\n";
            return Mat(0, 0);
        }
        if (svd.U.row <= 0 || svd.U.col <= 0 ||
            svd.V.row <= 0 || svd.V.col <= 0 ||
            svd.S.row <= 0 || svd.S.col != 1)
        {
            std::cerr << "[Error] pseudo_inverse: invalid SVD decomposition dimensions.\n";
            return Mat(0, 0);
        }

        const int m = svd.U.row;       // A has m rows
        const int n = svd.V.row;       // A has n columns
        const int min_dim = svd.S.row; // S is min(m,n) x 1

        // Validate SVD shape consistency with svd_decompose() contract.
        if (min_dim != ((m < n) ? m : n) || svd.U.col < min_dim || svd.V.col < min_dim)
        {
            std::cerr << "[Error] pseudo_inverse: inconsistent SVD dimensions.\n";
            return Mat(0, 0);
        }
        if (svd.rank < 0 || svd.rank > min_dim)
        {
            std::cerr << "[Error] pseudo_inverse: invalid rank " << svd.rank
                      << " (expected 0 to " << min_dim << ").\n";
            return Mat(0, 0);
        }

        // Precompute active reciprocal singular values based on this call's tolerance.
        std::vector<int> active_idx;
        std::vector<float> active_inv_sigma;
        active_idx.reserve(min_dim);
        active_inv_sigma.reserve(min_dim);
        for (int k = 0; k < min_dim; ++k)
        {
            const float sigma = svd.S(k, 0);
            if (!std::isfinite(sigma))
            {
                std::cerr << "[Error] pseudo_inverse: non-finite singular value at index "
                          << k << ".\n";
                return Mat(0, 0);
            }
            if (sigma > tolerance)
            {
                active_idx.push_back(k);
                active_inv_sigma.push_back(1.0f / sigma);
            }
        }

        Mat A_plus(n, m);
        if (A_plus.data == nullptr)
        {
            std::cerr << "[Error] pseudo_inverse: failed to allocate result matrix.\n";
            return Mat(0, 0);
        }

        const int AplusStep = A_plus.step;
        const int Ustep = svd.U.step;
        const int Vstep = svd.V.step;

        // Explicitly zero-initialize in case Mat constructor does not.
        for (int i = 0; i < n; ++i)
        {
            float *row_out = A_plus.data + i * AplusStep;
            for (int j = 0; j < m; ++j) row_out[j] = 0.0f;
        }

        // A^+ = sum_k V(:,k) * (1/sigma_k) * U(:,k)^T, over sigma_k > tolerance.
        // Loop order (k -> i -> j) improves reuse of V(i,k)/sigma_k.
        for (size_t t = 0; t < active_idx.size(); ++t)
        {
            const int k = active_idx[t];
            const float inv_sigma = active_inv_sigma[t];
            for (int i = 0; i < n; ++i)
            {
                float *row_out = A_plus.data + i * AplusStep;
                const float vik_scaled = svd.V.data[i * Vstep + k] * inv_sigma;
                const float *u_col_base = svd.U.data + k; // U(j,k) = u_col_base[j*Ustep]
                for (int j = 0; j < m; ++j)
                {
                    row_out[j] += vik_scaled * u_col_base[j * Ustep];
                }
            }
        }

        return A_plus;
    }

    // ============================================================================
    // Eigenvalue & Eigenvector Decomposition
    // ============================================================================
    /**
     * @name Mat::EigenPair::EigenPair()
     * @brief Default constructor for EigenPair structure
     */
    Mat::EigenPair::EigenPair() : eigenvalue(0.0f), iterations(0), status(TINY_OK)
    {
    }

    /**
     * @name Mat::EigenDecomposition::EigenDecomposition()
     * @brief Default constructor for EigenDecomposition structure
     */
    Mat::EigenDecomposition::EigenDecomposition() : iterations(0), status(TINY_OK)
    {
    }

    /**
     * @name Mat::is_symmetric()
     * @brief Check whether A^T = A within a tolerance.
     *
     * @note Semantics:
     *       - Non-square            -> false (silent; not a bug, just not symmetric)
     *       - 0x0                   -> true  (vacuously symmetric)
     *       - Negative dimensions   -> false (logged: invalid input)
     *       - Null data on a sized
     *         matrix                -> false (logged: invalid input)
     *       - NaN element           -> false (NaN never compares <= tolerance,
     *                                  so it falls through naturally)
     * @note Hot loop hoists the row pointer; the (j, i) probe is column-strided
     *       (unavoidable without an explicit transpose).
     * @note Time complexity: O(n^2) (upper triangle only).
     *
     * @param tolerance Max allowed |A(i,j) - A(j,i)|; must be >= 0.
     * @return true if symmetric within tolerance.
     */
    bool Mat::is_symmetric(float tolerance) const
    {
        if (this->data == nullptr && (this->row > 0 && this->col > 0))
        {
            std::cerr << "[Error] is_symmetric: matrix data pointer is null.\n";
            return false;
        }
        if (this->row < 0 || this->col < 0)
        {
            std::cerr << "[Error] is_symmetric: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            return false;
        }
        if (tolerance < 0.0f)
        {
            std::cerr << "[Error] is_symmetric: tolerance must be >= 0 (got "
                      << tolerance << ").\n";
            return false;
        }
        if (this->row != this->col)
        {
            return false;
        }

        const int n = this->row;
        if (n == 0)
        {
            return true; // 0x0 is trivially symmetric
        }

        // Hot loop: hoist row pointer for row i; column-direction access for
        // (j, i) is strided but unavoidable without a transpose.
        // Using `!(diff <= tolerance)` causes NaN to fall through to false.
        const int   stride = this->step;
        const float* const base = this->data;
        for (int i = 0; i < n; ++i)
        {
            const float* row_i = base + i * stride;
            for (int j = i + 1; j < n; ++j)
            {
                const float diff = fabsf(row_i[j] - base[j * stride + i]);
                if (!(diff <= tolerance))
                {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * @name Mat::power_iteration()
     * @brief Compute the dominant (largest magnitude) eigenvalue and eigenvector using power iteration.
     * @note Power iteration finds the eigenvalue with the largest absolute value and its corresponding eigenvector.
     *       Algorithm: v_{k+1} = A * v_k / ||A * v_k||, converges to dominant eigenvector.
     * @note Fast method suitable for real-time SHM applications to quickly identify primary frequency.
     * @note Time complexity: O(n² * iterations) - efficient for sparse or large matrices
     * @note Convergence: Requires that the dominant eigenvalue is unique and has larger magnitude than others.
     * 
     * @param max_iter Maximum number of iterations (must be > 0)
     * @param tolerance Convergence tolerance (must be >= 0). Convergence when |λ_k - λ_{k-1}| < tolerance * |λ_k|
     * @return EigenPair containing the dominant eigenvalue, eigenvector, and status
     */
    Mat::EigenPair Mat::power_iteration(int max_iter, float tolerance) const
    {
        EigenPair result;

        // -----------------------------------------------------------------
        // 1. Validate input
        // -----------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] power_iteration: matrix data pointer is null.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] power_iteration: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (this->row != this->col)
        {
            std::cerr << "[Error] power_iteration: requires square matrix (got "
                      << this->row << "x" << this->col << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (max_iter <= 0)
        {
            std::cerr << "[Error] power_iteration: max_iter must be > 0 (got "
                      << max_iter << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (tolerance < 0.0f)
        {
            std::cerr << "[Error] power_iteration: tolerance must be >= 0 (got "
                      << tolerance << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        const int n = this->row;
        if (n == 0)
        {
            result.eigenvalue  = 0.0f;
            result.eigenvector = Mat(0, 1);
            result.iterations  = 0;
            result.status      = TINY_OK;
            return result;
        }

        // -----------------------------------------------------------------
        // 2. Allocate working buffers (eigenvector + scratch).
        // -----------------------------------------------------------------
        result.eigenvector = Mat(n, 1);
        Mat temp_vec(n, 1);
        if (result.eigenvector.data == nullptr || temp_vec.data == nullptr)
        {
            std::cerr << "[Error] power_iteration: failed to allocate buffers.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }

        // -----------------------------------------------------------------
        // 3. Initialize the seed vector.
        //    We bias by the column-1-norms so directions with more "mass"
        //    in A start with a larger projection along that axis. The +1.0f
        //    floor guarantees a positive result, so the well-known fallback
        //    branch is unreachable; a uniform [1..1] start is fine for any
        //    realistic matrix.
        // -----------------------------------------------------------------
        const int Astep = this->step;
        const int Vstep = result.eigenvector.step;
        const int Tstep = temp_vec.step;
        const float* const Adata = this->data;
        float* const Vdata = result.eigenvector.data;
        float* const Tdata = temp_vec.data;

        {
            float norm_sq = 0.0f;
            for (int i = 0; i < n; ++i)
            {
                float col_sum = 0.0f;
                for (int j = 0; j < n; ++j)
                    col_sum += fabsf(Adata[j * Astep + i]);
                const float v = col_sum + 1.0f;
                Vdata[i * Vstep] = v;
                norm_sq += v * v;
            }
            const float sqrt_norm = sqrtf(norm_sq);
            if (!(sqrt_norm > TINY_MATH_MIN_POSITIVE_INPUT_F32))
            {
                std::cerr << "[Error] power_iteration: invalid initial eigenvector norm.\n";
                result.status = TINY_ERR_MATH_INVALID_PARAM;
                return result;
            }
            const float inv_norm = 1.0f / sqrt_norm;
            for (int i = 0; i < n; ++i)
                Vdata[i * Vstep] *= inv_norm;
        }

        // -----------------------------------------------------------------
        // 4. Power iteration loop.
        //    Invariant: at the top of each iteration, ||v||_2 == 1, so the
        //    Rayleigh quotient denominator (v^T v) is exactly 1 and we skip
        //    computing it. This saves n FLOPs per iteration.
        //    Order each iteration:
        //       temp = A * v
        //       lambda = v^T * temp        (denominator = 1)
        //       v <- temp / ||temp||       (re-establish unit-norm invariant)
        // -----------------------------------------------------------------
        float prev_eigenvalue = 0.0f;
        for (int iter = 0; iter < max_iter; ++iter)
        {
            // 4.1 temp = A * v
            for (int i = 0; i < n; ++i)
            {
                const float* row_Ai = Adata + i * Astep;
                float sum = 0.0f;
                for (int j = 0; j < n; ++j)
                    sum += row_Ai[j] * Vdata[j * Vstep];
                Tdata[i * Tstep] = sum;
            }

            // 4.2 lambda = v^T * temp  (||v|| == 1 at this point)
            float lambda = 0.0f;
            float new_norm_sq = 0.0f;
            for (int i = 0; i < n; ++i)
            {
                const float vi = Vdata[i * Vstep];
                const float ti = Tdata[i * Tstep];
                lambda      += vi * ti;
                new_norm_sq += ti * ti;
            }

            if (!(new_norm_sq > TINY_MATH_MIN_POSITIVE_INPUT_F32))
            {
                std::cerr << "[Error] power_iteration: matrix-vector product collapsed to zero.\n";
                result.status = TINY_ERR_MATH_INVALID_PARAM;
                return result;
            }
            result.eigenvalue = lambda;

            // 4.3 v = temp / ||temp||
            const float inv_norm = 1.0f / sqrtf(new_norm_sq);
            for (int i = 0; i < n; ++i)
                Vdata[i * Vstep] = Tdata[i * Tstep] * inv_norm;

            // 4.4 Convergence check.
            //     Use absolute change for tiny eigenvalues, relative otherwise.
            if (iter > 0)
            {
                const float change = fabsf(result.eigenvalue - prev_eigenvalue);
                const float abs_l  = fabsf(result.eigenvalue);
                const float thresh = (abs_l < TINY_MATH_MIN_POSITIVE_INPUT_F32)
                                     ? tolerance
                                     : tolerance * abs_l;
                if (change < thresh)
                {
                    result.iterations = iter + 1;
                    result.status     = TINY_OK;
                    return result;
                }
            }
            prev_eigenvalue = result.eigenvalue;
        }

        result.iterations = max_iter;
        result.status     = TINY_ERR_NOT_FINISHED;
        std::cerr << "[Warning] power_iteration: did not converge within "
                  << max_iter << " iterations.\n";
        return result;
    }

    /**
     * @name Mat::inverse_power_iteration()
     * @brief Compute the smallest (minimum magnitude) eigenvalue and eigenvector using inverse power iteration.
     * @note Inverse power iteration finds the eigenvalue with the smallest absolute value and its eigenvector.
     *       Algorithm: v_{k+1} = A^(-1) * v_k / ||A^(-1) * v_k||, converges to smallest eigenvector.
     * @note Critical for system identification - finds fundamental frequency/lowest mode in structural dynamics.
     *       This method is essential for SHM applications where the smallest eigenvalue corresponds to the
     *       fundamental frequency of the system.
     * @note Time complexity: O(n³ * iterations) - each iteration requires solving a linear system
     * @note The matrix must be invertible (non-singular) for this method to work.
     *       If the matrix is singular or near-singular, the method will fail gracefully.
     * 
     * @param max_iter Maximum number of iterations (must be > 0)
     * @param tolerance Convergence tolerance (must be >= 0). Convergence when |λ_k - λ_{k-1}| < tolerance * max(|λ_k|, 1)
     * @return EigenPair containing the smallest eigenvalue, eigenvector, and status
     */
    Mat::EigenPair Mat::inverse_power_iteration(int max_iter, float tolerance) const
    {
        EigenPair result;

        // -----------------------------------------------------------------
        // 1. Validate input
        // -----------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] inverse_power_iteration: matrix data pointer is null.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] inverse_power_iteration: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (this->row != this->col)
        {
            std::cerr << "[Error] inverse_power_iteration: requires square matrix (got "
                      << this->row << "x" << this->col << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (max_iter <= 0)
        {
            std::cerr << "[Error] inverse_power_iteration: max_iter must be > 0 (got "
                      << max_iter << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (tolerance < 0.0f)
        {
            std::cerr << "[Error] inverse_power_iteration: tolerance must be >= 0 (got "
                      << tolerance << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        const int n = this->row;
        if (n == 0)
        {
            result.eigenvalue  = 0.0f;
            result.eigenvector = Mat(0, 1);
            result.iterations  = 0;
            result.status      = TINY_OK;
            return result;
        }

        // -----------------------------------------------------------------
        // 2. Factorise A once (PA = LU). Each iteration then solves the
        //    linear system in O(n^2) instead of O(n^3) -- this is the whole
        //    point of inverse power iteration in practice.
        // -----------------------------------------------------------------
        LUDecomposition lu = this->lu_decompose(/*use_pivoting=*/true);
        if (lu.status != TINY_OK)
        {
            std::cerr << "[Error] inverse_power_iteration: matrix is singular or near-singular "
                      << "(LU status=" << lu.status << ").\n";
            result.status = TINY_ERR_MATH_INVALID_PARAM;
            return result;
        }

        // -----------------------------------------------------------------
        // 3. Allocate eigenvector buffer and seed it with an alternating
        //    +/-1 pattern to break alignment with the dominant eigenvector
        //    direction; small perturbation keeps it generic.
        // -----------------------------------------------------------------
        result.eigenvector = Mat(n, 1);
        if (result.eigenvector.data == nullptr)
        {
            std::cerr << "[Error] inverse_power_iteration: failed to allocate eigenvector.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }

        const int Vstep = result.eigenvector.step;
        float* const Vdata = result.eigenvector.data;
        {
            float norm_sq = 0.0f;
            for (int i = 0; i < n; ++i)
            {
                const float v = ((i & 1) == 0 ? 1.0f : -1.0f)
                                + 0.1f * static_cast<float>(i) / static_cast<float>(n);
                Vdata[i * Vstep] = v;
                norm_sq += v * v;
            }
            const float sqrt_norm = sqrtf(norm_sq);
            if (!(sqrt_norm > TINY_MATH_MIN_POSITIVE_INPUT_F32))
            {
                std::cerr << "[Error] inverse_power_iteration: invalid initial eigenvector norm.\n";
                result.status = TINY_ERR_MATH_INVALID_PARAM;
                return result;
            }
            const float inv_norm = 1.0f / sqrt_norm;
            for (int i = 0; i < n; ++i)
                Vdata[i * Vstep] *= inv_norm;
        }

        // -----------------------------------------------------------------
        // 4. Iterate.
        //
        //    Per iteration we want the smallest-magnitude eigenvalue of A,
        //    which is the largest-magnitude eigenvalue of A^{-1}. The
        //    classical recurrence is:
        //        y      = A^{-1} v        (here: solve_lu(lu, v))
        //        mu     = v^T y           (Rayleigh quotient on A^{-1};
        //                                  v^T v == 1 by invariant)
        //        v_next = y / ||y||       (re-normalise)
        //        lambda = 1 / mu          (eigenvalue of A)
        //
        //    This avoids the explicit second matvec A * v that the previous
        //    implementation used, saving ~n^2 FLOPs per iteration.
        // -----------------------------------------------------------------
        float prev_eigenvalue = 0.0f;
        for (int iter = 0; iter < max_iter; ++iter)
        {
            // 4.1 y = A^{-1} v  via reusable LU.
            Mat y = Mat::solve_lu(lu, result.eigenvector);
            if (y.data == nullptr || y.row != n || y.col != 1)
            {
                std::cerr << "[Error] inverse_power_iteration: solve_lu failed at iter "
                          << iter << ".\n";
                result.status = TINY_ERR_MATH_INVALID_PARAM;
                return result;
            }

            // 4.2 Rayleigh quotient on A^{-1} and ||y||^2 in one sweep.
            const int Ystep = y.step;
            const float* const Ydata = y.data;

            float mu = 0.0f;          // v^T y
            float norm_sq = 0.0f;     // y^T y
            for (int i = 0; i < n; ++i)
            {
                const float vi = Vdata[i * Vstep];
                const float yi = Ydata[i * Ystep];
                mu      += vi * yi;
                norm_sq += yi * yi;
            }

            if (!(norm_sq > TINY_MATH_MIN_POSITIVE_INPUT_F32))
            {
                std::cerr << "[Error] inverse_power_iteration: solve produced a zero vector.\n";
                result.status = TINY_ERR_MATH_INVALID_PARAM;
                return result;
            }
            if (fabsf(mu) < TINY_MATH_MIN_POSITIVE_INPUT_F32)
            {
                // mu = v^T A^{-1} v ~ 0 => smallest |eigenvalue| of A is huge
                // (or v happens to be orthogonal to the smallest mode). In
                // either case, 1/mu would overflow; abort gracefully.
                std::cerr << "[Error] inverse_power_iteration: Rayleigh quotient on A^{-1} "
                          << "underflowed (eigenvalue would diverge).\n";
                result.status = TINY_ERR_MATH_INVALID_PARAM;
                return result;
            }
            result.eigenvalue = 1.0f / mu;

            // 4.3 v <- y / ||y||
            const float inv_norm = 1.0f / sqrtf(norm_sq);
            for (int i = 0; i < n; ++i)
                Vdata[i * Vstep] = Ydata[i * Ystep] * inv_norm;

            // 4.4 Convergence: relative change with a unit floor for tiny
            //     eigenvalues. Matches the previous semantics.
            if (iter > 0)
            {
                const float change   = fabsf(result.eigenvalue - prev_eigenvalue);
                const float rel_tol  = tolerance * fmaxf(fabsf(result.eigenvalue), 1.0f);
                if (change < rel_tol)
                {
                    result.iterations = iter + 1;
                    result.status     = TINY_OK;
                    return result;
                }
            }
            prev_eigenvalue = result.eigenvalue;
        }

        result.iterations = max_iter;
        result.status     = TINY_ERR_NOT_FINISHED;
        std::cerr << "[Warning] inverse_power_iteration: did not converge within "
                  << max_iter << " iterations.\n";
        return result;
    }

    /**
     * @name Mat::eigendecompose_jacobi()
     * @brief Eigendecomposition of a symmetric matrix via classical Jacobi rotations.
     *
     * Algorithm:
     *   1. Symmetrize input: A := (A + A^T) / 2  (defends against tiny FP asymmetry)
     *   2. Iterate:
     *        - Find largest off-diagonal |A(p,q)|.
     *        - Build a Givens rotation that zeros A(p,q) (and A(q,p) by symmetry).
     *        - Apply the rotation to A from both sides; accumulate it into V.
     *      Stop when max |A(p,q)| < tolerance, or after `max_iter` sweeps.
     *
     * @note Best for small/medium symmetric matrices (n < ~100). For SHM
     *       structural matrices this is ideal.
     * @note Convergence is tested with an absolute threshold; callers that
     *       need scale-invariance should pre-scale `tolerance` accordingly.
     * @note Time complexity: O(n^3 * iterations); convergence is quadratic
     *       near the solution, so iteration count is typically O(n^2).
     *
     * @param tolerance  Off-diagonal magnitude below which we stop (>= 0).
     * @param max_iter   Maximum sweeps (must be > 0).
     * @return EigenDecomposition with eigenvalues (n x 1), eigenvectors
     *         (n x n, columns are unit eigenvectors), iteration count, status.
     */
    Mat::EigenDecomposition Mat::eigendecompose_jacobi(float tolerance, int max_iter) const
    {
        EigenDecomposition result;

        // -----------------------------------------------------------------
        // 1. Validate input
        // -----------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] eigendecompose_jacobi: matrix data pointer is null.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] eigendecompose_jacobi: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (this->row != this->col)
        {
            std::cerr << "[Error] eigendecompose_jacobi: requires square matrix (got "
                      << this->row << "x" << this->col << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (tolerance < 0.0f)
        {
            std::cerr << "[Error] eigendecompose_jacobi: tolerance must be >= 0 (got "
                      << tolerance << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (max_iter <= 0)
        {
            std::cerr << "[Error] eigendecompose_jacobi: max_iter must be > 0 (got "
                      << max_iter << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        const int n = this->row;

        // -----------------------------------------------------------------
        // 2. Allocate outputs once
        // -----------------------------------------------------------------
        result.eigenvalues  = Mat(n, 1);
        result.eigenvectors = Mat::eye(n);
        if (n > 0 && (result.eigenvalues.data == nullptr ||
                      result.eigenvectors.data == nullptr))
        {
            std::cerr << "[Error] eigendecompose_jacobi: failed to allocate outputs.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (n == 0)
        {
            result.iterations = 0;
            result.status = TINY_OK;
            return result;
        }

        // -----------------------------------------------------------------
        // 3. Working copy with explicit symmetrization: A := (T + T^T) / 2.
        //    Doing this here lets callers pass slightly non-symmetric inputs
        //    safely, and removes the need for a separate is_symmetric() probe
        //    whose tolerance is hard to set scale-invariantly.
        // -----------------------------------------------------------------
        Mat A(n, n);
        if (A.data == nullptr)
        {
            std::cerr << "[Error] eigendecompose_jacobi: failed to allocate working copy.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        {
            const int Astep_init = A.step;
            const int Tstep      = this->step;
            float* const Adata_init = A.data;
            const float* const Tdata = this->data;
            for (int i = 0; i < n; ++i)
            {
                float* row_Ai = Adata_init + i * Astep_init;
                const float* row_Ti = Tdata + i * Tstep;
                row_Ai[i] = row_Ti[i];
                for (int j = i + 1; j < n; ++j)
                {
                    const float v = 0.5f * (row_Ti[j] + Tdata[j * Tstep + i]);
                    row_Ai[j] = v;
                    Adata_init[j * Astep_init + i] = v;
                }
            }
        }

        // -----------------------------------------------------------------
        // 4. Jacobi sweeps (classical: pick the largest off-diagonal entry).
        //    Hot loops use hoisted row pointers to skip operator() overhead.
        // -----------------------------------------------------------------
        const int Astep = A.step;
        const int Vstep = result.eigenvectors.step;
        float* const Adata = A.data;
        float* const Vdata = result.eigenvectors.data;

        bool converged  = false;
        int  iter_count = max_iter;

        for (int iter = 0; iter < max_iter; ++iter)
        {
            // 4.1 Locate the largest |A(p,q)| with p < q.
            float max_off = 0.0f;
            int   p = 0, q = 1;
            for (int i = 0; i < n - 1; ++i)
            {
                const float* row_Ai = Adata + i * Astep;
                for (int j = i + 1; j < n; ++j)
                {
                    const float a = fabsf(row_Ai[j]);
                    if (a > max_off)
                    {
                        max_off = a;
                        p = i;
                        q = j;
                    }
                }
            }

            // 4.2 Convergence check.
            if (max_off < tolerance)
            {
                converged  = true;
                iter_count = iter + 1;
                break;
            }

            // 4.3 Compute rotation (c, s) that zeros A(p,q).
            //     The branch on tau picks the smaller root of the quadratic
            //     and so keeps |t| <= 1, avoiding cancellation.
            float* const row_Ap = Adata + p * Astep;
            float* const row_Aq = Adata + q * Astep;
            const float app = row_Ap[p];
            const float aqq = row_Aq[q];
            const float apq = row_Ap[q];

            const float tau = (aqq - app) / (2.0f * apq);
            const float t = (tau >= 0.0f)
                            ?  1.0f / ( tau + sqrtf(1.0f + tau * tau))
                            : -1.0f / (-tau + sqrtf(1.0f + tau * tau));
            const float c = 1.0f / sqrtf(1.0f + t * t);
            const float s = t * c;

            // 4.4 Apply rotation to rows/cols (j, p) and (j, q), j != p, q.
            for (int j = 0; j < n; ++j)
            {
                if (j == p || j == q) continue;
                float* const row_Aj = Adata + j * Astep;
                const float apj = row_Ap[j];
                const float aqj = row_Aq[j];
                const float new_apj = c * apj - s * aqj;
                const float new_aqj = s * apj + c * aqj;
                row_Ap[j] = new_apj;
                row_Aq[j] = new_aqj;
                row_Aj[p] = new_apj; // maintain symmetry
                row_Aj[q] = new_aqj;
            }

            // 4.5 Update the (p,p), (q,q), (p,q), (q,p) block.
            row_Ap[p] = c * c * app - 2.0f * c * s * apq + s * s * aqq;
            row_Aq[q] = s * s * app + 2.0f * c * s * apq + c * c * aqq;
            row_Ap[q] = 0.0f;
            row_Aq[p] = 0.0f;

            // 4.6 Accumulate the rotation into V: V := V * G.
            for (int i = 0; i < n; ++i)
            {
                float* const row_Vi = Vdata + i * Vstep;
                const float vip = row_Vi[p];
                const float viq = row_Vi[q];
                row_Vi[p] = c * vip - s * viq;
                row_Vi[q] = s * vip + c * viq;
            }
        }

        // -----------------------------------------------------------------
        // 5. Read eigenvalues off the diagonal (single allocation, single pass).
        // -----------------------------------------------------------------
        const int evStep = result.eigenvalues.step;
        for (int i = 0; i < n; ++i)
        {
            result.eigenvalues.data[i * evStep] = Adata[i * Astep + i];
        }

        result.iterations = iter_count;
        if (converged)
        {
            result.status = TINY_OK;
        }
        else
        {
            result.status = TINY_ERR_NOT_FINISHED;
            std::cerr << "[Warning] eigendecompose_jacobi: did not converge within "
                      << max_iter << " iterations.\n";
        }
        return result;
    }

    /**
     * @name Mat::eigendecompose_qr()
     * @brief Unshifted QR algorithm for general (non-symmetric) real matrices.
     *
     * Algorithm (per iteration):
     *   1. A_k = Q_k * R_k         (via Modified Gram-Schmidt; R is full upper)
     *   2. A_{k+1} = R_k * Q_k     (similar transform; preserves spectrum)
     *   3. V_{k+1} = V_k * Q_k     (accumulates eigenvectors)
     *
     * Convergence:
     *   Stop when every sub-diagonal entry is below `tolerance` scaled by the
     *   neighbouring diagonal magnitudes (with a 1.0 floor for tiny matrices).
     *
     * Caveats:
     *   - No shifts and no Hessenberg reduction. Convergence is linear in the
     *     general case; for fast convergence on real-world matrices add a
     *     Wilkinson shift in a future revision.
     *   - Pairs of complex eigenvalues will leave 2x2 blocks on the diagonal
     *     and will *not* converge. Only the real part of the diagonal is
     *     reported, with `status = TINY_ERR_NOT_FINISHED`.
     *
     * Performance:
     *   - Two-buffer ping-pong avoids the per-iteration n*n allocations of
     *     A_new / V_new that the previous implementation did.
     *   - The R*Q product exploits R's upper-triangular structure
     *     (k starts at i), halving its FLOPs on average.
     *
     * @param max_iter  Max QR sweeps (must be > 0).
     * @param tolerance Sub-diagonal convergence threshold (must be >= 0).
     * @return EigenDecomposition with diagonal eigenvalues, accumulated V,
     *         iteration count, and status.
     */
    Mat::EigenDecomposition Mat::eigendecompose_qr(int max_iter, float tolerance) const
    {
        EigenDecomposition result;

        // -----------------------------------------------------------------
        // 1. Validate input
        // -----------------------------------------------------------------
        if (this->data == nullptr)
        {
            std::cerr << "[Error] eigendecompose_qr: matrix data pointer is null.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (this->row <= 0 || this->col <= 0)
        {
            std::cerr << "[Error] eigendecompose_qr: invalid matrix dimensions: "
                      << this->row << "x" << this->col << ".\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (this->row != this->col)
        {
            std::cerr << "[Error] eigendecompose_qr: requires square matrix (got "
                      << this->row << "x" << this->col << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (max_iter <= 0)
        {
            std::cerr << "[Error] eigendecompose_qr: max_iter must be > 0 (got "
                      << max_iter << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (tolerance < 0.0f)
        {
            std::cerr << "[Error] eigendecompose_qr: tolerance must be >= 0 (got "
                      << tolerance << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        const int n = this->row;

        // -----------------------------------------------------------------
        // 2. Allocate eigenvalues output (filled at the end). Empty matrix
        //    short-circuits with valid-empty results.
        // -----------------------------------------------------------------
        result.eigenvalues = Mat(n, 1);
        if (n > 0 && result.eigenvalues.data == nullptr)
        {
            std::cerr << "[Error] eigendecompose_qr: failed to allocate eigenvalues.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (n == 0)
        {
            result.eigenvectors = Mat(0, 0);
            result.iterations   = 0;
            result.status       = TINY_OK;
            return result;
        }

        // -----------------------------------------------------------------
        // 3. Two-buffer ping-pong for A and V, so each iteration only swaps
        //    pointers instead of allocating fresh n*n matrices.
        // -----------------------------------------------------------------
        Mat A_a(*this);          // working copy, drifts toward upper triangular
        Mat A_b(n, n);
        Mat V_a = Mat::eye(n);   // accumulated eigenvectors
        Mat V_b(n, n);
        if (A_a.data == nullptr || A_b.data == nullptr ||
            V_a.data == nullptr || V_b.data == nullptr)
        {
            std::cerr << "[Error] eigendecompose_qr: failed to allocate working buffers.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        Mat *A_cur = &A_a, *A_nxt = &A_b;
        Mat *V_cur = &V_a, *V_nxt = &V_b;

        // gram_schmidt_orthogonalize() reallocates these on every call, so
        // declare them once and let the inner function manage memory.
        Mat Q, R;

        bool converged  = false;
        int  iter_count = max_iter;

        for (int iter = 0; iter < max_iter; ++iter)
        {
            // 3.1 Convergence: every sub-diagonal entry below the (relative)
            //     tolerance. Bail as soon as one entry fails the test.
            {
                bool not_converged = false;
                const int Astep = A_cur->step;
                const float* const Adata = A_cur->data;
                for (int i = 1; i < n && !not_converged; ++i)
                {
                    const float* row_Ai = Adata + i * Astep;
                    for (int j = 0; j < i; ++j)
                    {
                        const float abs_val  = fabsf(row_Ai[j]);
                        const float diag_scl = fmaxf(fabsf(Adata[i * Astep + i]),
                                                     fabsf(Adata[j * Astep + j]));
                        const float rel_tol  = tolerance * fmaxf(1.0f, diag_scl);
                        if (abs_val > rel_tol)
                        {
                            not_converged = true;
                            break;
                        }
                    }
                }
                if (!not_converged)
                {
                    converged  = true;
                    iter_count = iter;
                    break;
                }
            }

            // 3.2 A = Q * R via Modified Gram-Schmidt.
            //     R is delivered as a *full* upper-triangular matrix, so we
            //     can use it directly with no patch-up step.
            if (!Mat::gram_schmidt_orthogonalize(*A_cur, Q, R,
                                                 TINY_MATH_MIN_POSITIVE_INPUT_F32))
            {
                std::cerr << "[Error] eigendecompose_qr: Gram-Schmidt failed.\n";
                result.status = TINY_ERR_MATH_NULL_POINTER;
                return result;
            }

            // 3.3 A_nxt = R * Q  (R is upper-triangular, so k starts at i)
            {
                const int Rstep  = R.step;
                const int Qstep  = Q.step;
                const int Anstep = A_nxt->step;
                const float* const Rdata = R.data;
                const float* const Qdata = Q.data;
                float* const       Andata = A_nxt->data;
                for (int i = 0; i < n; ++i)
                {
                    const float* row_Ri = Rdata + i * Rstep;
                    float* row_Ani = Andata + i * Anstep;
                    for (int j = 0; j < n; ++j)
                    {
                        float sum = 0.0f;
                        for (int k = i; k < n; ++k) // R(i,k) = 0 for k < i
                            sum += row_Ri[k] * Qdata[k * Qstep + j];
                        row_Ani[j] = sum;
                    }
                }
            }

            // 3.4 V_nxt = V_cur * Q
            {
                const int Vcstep = V_cur->step;
                const int Vnstep = V_nxt->step;
                const int Qstep  = Q.step;
                const float* const Vcdata = V_cur->data;
                const float* const Qdata  = Q.data;
                float* const       Vndata = V_nxt->data;
                for (int i = 0; i < n; ++i)
                {
                    const float* row_Vi  = Vcdata + i * Vcstep;
                    float*       row_Vni = Vndata + i * Vnstep;
                    for (int j = 0; j < n; ++j)
                    {
                        float sum = 0.0f;
                        for (int k = 0; k < n; ++k)
                            sum += row_Vi[k] * Qdata[k * Qstep + j];
                        row_Vni[j] = sum;
                    }
                }
            }

            // 3.5 Swap the ping-pong pointers (zero copy).
            {
                Mat *tmp = A_cur; A_cur = A_nxt; A_nxt = tmp;
                tmp = V_cur; V_cur = V_nxt; V_nxt = tmp;
            }
            iter_count = iter + 1;
        }

        // -----------------------------------------------------------------
        // 4. Read eigenvalues off the (converged or last) diagonal and copy
        //    out the accumulated eigenvectors.
        // -----------------------------------------------------------------
        {
            const int Astep        = A_cur->step;
            const float* const Adata = A_cur->data;
            const int evStep       = result.eigenvalues.step;
            for (int i = 0; i < n; ++i)
                result.eigenvalues.data[i * evStep] = Adata[i * Astep + i];
        }
        result.eigenvectors = *V_cur;
        result.iterations   = iter_count;
        if (converged)
        {
            result.status = TINY_OK;
        }
        else
        {
            result.status = TINY_ERR_NOT_FINISHED;
            std::cerr << "[Warning] eigendecompose_qr: did not converge within "
                      << max_iter << " iterations.\n";
        }
        return result;
    }

    /**
     * @name Mat::eigendecompose()
     * @brief Automatic eigenvalue decomposition with method selection.
     * @note Convenient interface for edge computing: automatically selects the best method.
     *       - Symmetric matrices: Uses Jacobi method (more efficient and stable)
     *       - General matrices: Uses QR algorithm
     * @note Time complexity: Depends on selected method
     *       - Jacobi: O(n³ * iterations) for symmetric matrices
     *       - QR: O(n³ * iterations) for general matrices
     * 
     * @param tolerance Convergence tolerance (must be >= 0)
     * @param max_iter Maximum number of iterations (must be > 0, default = 100)
     * @return EigenDecomposition containing eigenvalues, eigenvectors, and status
     */
    Mat::EigenDecomposition Mat::eigendecompose(float tolerance, int max_iter) const
    {
        if (this->data == nullptr)
        {
            EigenDecomposition result;
            std::cerr << "[Error] eigendecompose: matrix data pointer is null.\n";
            result.status = TINY_ERR_MATH_NULL_POINTER;
            return result;
        }
        if (tolerance < 0.0f)
        {
            EigenDecomposition result;
            std::cerr << "[Error] eigendecompose: tolerance must be >= 0 (got "
                      << tolerance << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }
        if (max_iter <= 0)
        {
            EigenDecomposition result;
            std::cerr << "[Error] eigendecompose: max_iter must be > 0 (got "
                      << max_iter << ").\n";
            result.status = TINY_ERR_INVALID_ARG;
            return result;
        }

        // Symmetry-test margin: treat the matrix as symmetric when off-diag
        // mismatches stay within `kSymmetryToleranceFactor * tolerance`. This
        // is intentionally loose so matrices built from differences of
        // symmetric ops (which carry small FP asymmetry) still pick the
        // faster, more-stable Jacobi path.
        constexpr float kSymmetryToleranceFactor = 10.0f;
        const float sym_tol = kSymmetryToleranceFactor * tolerance;
        if (this->is_symmetric(sym_tol))
        {
            return this->eigendecompose_jacobi(tolerance, max_iter);
        }
        return this->eigendecompose_qr(max_iter, tolerance);
    }

    // ============================================================================
    // Stream Operators
    // ============================================================================
    /**
     * @name operator<<
     * @brief Stream insertion operator for printing matrix to the output stream (e.g., std::cout).
     *
     * This function allows printing the contents of a matrix to an output stream.
     * It prints each row of the matrix on a new line, with elements separated by spaces.
     *
     * @param os Output stream where the matrix will be printed (e.g., std::cout)
     * @param m Matrix to be printed
     *
     * @return os The output stream after printing the matrix
     */
    std::ostream &operator<<(std::ostream &os, const Mat &m)
    {
        // A 0x0 / 0xN / Nx0 matrix is a legal empty value (see Mat(int,int));
        // print nothing for it. A truly bad pointer (sized but null) is the
        // only real error case.
        if (m.row == 0 || m.col == 0)
        {
            return os;
        }
        if (m.data == nullptr)
        {
            os << "[Error] Cannot print matrix: data pointer is null.\n";
            return os;
        }

        for (int i = 0; i < m.row; ++i)
        {
            os << m(i, 0);
            for (int j = 1; j < m.col; ++j)
            {
                os << " " << m(i, j);
            }
            os << std::endl;
        }
        return os;
    }

    /**
     * @name operator<<
     * @brief Stream insertion operator for printing the Rectangular ROI structure to the output stream.
     *
     * This function prints the details of the ROI (Region of Interest) including the start row and column,
     * and the width and height of the rectangular region.
     *
     * @param os Output stream where the ROI will be printed (e.g., std::cout)
     * @param roi The ROI structure to be printed
     *
     * @return os The output stream after printing the ROI details
     */
    std::ostream &operator<<(std::ostream &os, const Mat::ROI &roi)
    {
        os << "row start " << roi.pos_y << std::endl;
        os << "col start " << roi.pos_x << std::endl;
        os << "row count " << roi.height << std::endl;
        os << "col count " << roi.width << std::endl;

        return os;
    }

    /**
     * @name operator>>
     * @brief Stream extraction operator for reading matrix from the input stream (e.g., std::cin).
     *
     * This function reads the contents of a matrix from an input stream.
     * The matrix elements are read row by row, with elements separated by spaces or newlines.
     *
     * @param is Input stream from which the matrix will be read (e.g., std::cin)
     * @param m Matrix to store the read data
     *
     * @return is The input stream after reading the matrix
     */
    std::istream &operator>>(std::istream &is, Mat &m)
    {
        if (m.row == 0 || m.col == 0)
        {
            return is; // nothing to read into an empty matrix
        }
        if (m.data == nullptr)
        {
            std::cerr << "[Error] operator>>: target matrix has null data buffer.\n";
            is.setstate(std::ios::failbit);
            return is;
        }

        for (int i = 0; i < m.row; ++i)
        {
            for (int j = 0; j < m.col; ++j)
            {
                is >> m(i, j);
            }
        }
        return is;
    }

    // ============================================================================
    // Global Arithmetic Operators
    // ============================================================================
    /**
     * + operator, sum of two matrices
     * The operator use DSP optimized implementation of multiplication.
     *
     * @param[in] A: Input matrix A
     * @param[in] B: Input matrix B
     *
     * @return
     *     - result matrix A+B
     */
    Mat operator+(const Mat &m1, const Mat &m2)
    {
        if (m1.data == nullptr || m2.data == nullptr)
        {
            std::cerr << "[Error] operator+: null matrix data pointer.\n";
            return Mat(0, 0);
        }
        if ((m1.row != m2.row) || (m1.col != m2.col))
        {
            std::cerr << "[Error] operator+: matrices do not have equal dimensions ("
                      << m1.row << "x" << m1.col << " vs "
                      << m2.row << "x" << m2.col << ").\n";
            return Mat(0, 0);
        }

        if (m1.sub_matrix || m2.sub_matrix)
        {
            Mat temp(m1.row, m1.col);
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_add_f32(m1.data, m2.data, temp.data, m1.row, m1.col, m1.pad, m2.pad, temp.pad, 1, 1, 1);
#else
            tiny_mat_add_f32(m1.data, m2.data, temp.data, m1.row, m1.col, m1.pad, m2.pad, temp.pad, 1, 1, 1);
#endif
            return temp;
        }
        else
        {
            Mat temp(m1);
            return (temp += m2);
        }
    }

    /**
     * + operator, sum of matrix with constant
     * The operator use DSP optimized implementation of multiplication.
     *
     * @param[in] A: Input matrix A
     * @param[in] C: Input constant
     *
     * @return
     *     - result matrix A+C
     */
    Mat operator+(const Mat &m, float C)
    {
        if (m.sub_matrix)
        {
            Mat temp(m.row, m.col);
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_addc_f32(m.data, temp.data, C, m.row, m.col, m.pad, temp.pad, 1, 1);
#else
            tiny_mat_addc_f32(m.data, temp.data, C, m.row, m.col, m.pad, temp.pad, 1, 1);
#endif
            return temp;
        }
        else
        {
            Mat temp(m);
            return (temp += C);
        }
    }

    /**
     * - operator, subtraction of two matrices
     * The operator use DSP optimized implementation of multiplication.
     *
     * @param[in] A: Input matrix A
     * @param[in] B: Input matrix B
     *
     * @return
     *     - result matrix A-B
     */
    Mat operator-(const Mat &m1, const Mat &m2)
    {
        if (m1.data == nullptr || m2.data == nullptr)
        {
            std::cerr << "[Error] operator-: null matrix data pointer.\n";
            return Mat(0, 0);
        }
        if ((m1.row != m2.row) || (m1.col != m2.col))
        {
            std::cerr << "[Error] operator-: matrices do not have equal dimensions ("
                      << m1.row << "x" << m1.col << " vs "
                      << m2.row << "x" << m2.col << ").\n";
            return Mat(0, 0);
        }

        if (m1.sub_matrix || m2.sub_matrix)
        {
            Mat temp(m1.row, m1.col);
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_sub_f32(m1.data, m2.data, temp.data, m1.row, m1.col, m1.pad, m2.pad, temp.pad, 1, 1, 1);
#else
            tiny_mat_sub_f32(m1.data, m2.data, temp.data, m1.row, m1.col, m1.pad, m2.pad, temp.pad, 1, 1, 1);
#endif
            return temp;
        }
        else
        {
            Mat temp(m1);
            return (temp -= m2);
        }
    }


    /**
     * - operator, subtraction of matrix with constant
     * The operator use DSP optimized implementation of multiplication.
     *
     * @param[in] A: Input matrix A
     * @param[in] C: Input constant
     *
     * @return
     *     - result matrix A-C
     */
    Mat operator-(const Mat &m, float C)
    {
        if (m.sub_matrix)
        {
            Mat temp(m.row, m.col);
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_addc_f32(m.data, temp.data, -C, m.row, m.col, m.pad, temp.pad, 1, 1);
#else
            tiny_mat_addc_f32(m.data, temp.data, -C, m.row, m.col, m.pad, temp.pad, 1, 1);
#endif
            return temp;
        }
        else
        {
            Mat temp(m);
            return (temp -= C);
        }
    }


    /**
     * * operator, multiplication of two matrices.
     * The operator use DSP optimized implementation of multiplication.
     *
     * @param[in] A: Input matrix A
     * @param[in] B: Input matrix B
     *
     * @return
     *     - result matrix A*B
     */
    Mat operator*(const Mat &m1, const Mat &m2)
    {
        if (m1.data == nullptr || m2.data == nullptr)
        {
            std::cerr << "[Error] operator*: null matrix data pointer.\n";
            return Mat(0, 0);
        }
        if (m1.col != m2.row)
        {
            std::cerr << "[Error] operator*: incompatible inner dimensions ("
                      << m1.row << "x" << m1.col << " * "
                      << m2.row << "x" << m2.col << ").\n";
            return Mat(0, 0);
        }
        Mat temp(m1.row, m2.col);

        if (m1.sub_matrix || m2.sub_matrix)
        {
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_mult_ex_f32(m1.data, m2.data, temp.data, m1.row, m1.col, m2.col, m1.pad, m2.pad, temp.pad);
#else
            tiny_mat_mult_ex_f32(m1.data, m2.data, temp.data, m1.row, m1.col, m2.col, m1.pad, m2.pad, temp.pad);
#endif
        }
        else
        {
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_mult_f32(m1.data, m2.data, temp.data, m1.row, m1.col, m2.col);
#else
            tiny_mat_mult_f32(m1.data, m2.data, temp.data, m1.row, m1.col, m2.col);
#endif
        }

        return temp;
    }

    /**
     * * operator, multiplication of matrix with constant
     * The operator use DSP optimized implementation of multiplication.
     *
     * @param[in] A: Input matrix A
     * @param[in] C: floating point value
     *
     * @return
     *     - result matrix A*B
     */
    Mat operator*(const Mat &m, float num)
    {
        if (m.sub_matrix)
        {
            Mat temp(m.row, m.col);
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_mulc_f32(m.data, temp.data, num, m.row, m.col, m.pad, temp.pad, 1, 1);
#else
            tiny_mat_multc_f32(m.data, temp.data, num, m.row, m.col, m.pad, temp.pad, 1, 1);
#endif
            return temp;
        }
        else
        {
            Mat temp(m);
            return (temp *= num);
        }
    }

    /**
     * * operator, multiplication of matrix with constant
     * The operator use DSP optimized implementation of multiplication.
     *
     * @param[in] C: floating point value
     * @param[in] A: Input matrix A
     *
     * @return
     *     - result matrix C*A
     */
    Mat operator*(float num, const Mat &m)
    {
        return (m * num);
    }

    /**
     * / operator, divide of matrix by constant
     * The operator use DSP optimized implementation of multiplication.
     *
     * @param[in] A: Input matrix A
     * @param[in] C: floating point value
     *
     * @return
     *     - result matrix A/C
     */
    Mat operator/(const Mat &m, float num)
    {
        if (m.data == nullptr)
        {
            std::cerr << "[Error] operator/: null matrix data pointer.\n";
            return Mat(0, 0);
        }
        if (!(fabsf(num) > TINY_MATH_MIN_POSITIVE_INPUT_F32))
        {
            std::cerr << "[Error] operator/: division by zero (num=" << num << ").\n";
            return Mat(0, 0);
        }

        if (m.sub_matrix)
        {
            Mat temp(m.row, m.col);
            float inv_num = 1.0f / num;
#if MCU_PLATFORM_SELECTED == MCU_PLATFORM_ESP32
            dspm_mulc_f32(m.data, temp.data, inv_num, m.row, m.col, m.pad, temp.pad, 1, 1);
#else
            tiny_mat_multc_f32(m.data, temp.data, inv_num, m.row, m.col, m.pad, temp.pad, 1, 1);
#endif
            return temp;
        }
        else
        {
            Mat temp(m);
            return (temp /= num);
        }
    }


    /**
     * / operator, divide matrix A by matrix B (element-wise)
     *
     * @param[in] A: Input matrix A
     * @param[in] B: Input matrix B
     *
     * @return
     *     - result matrix C, where C[i,j] = A[i,j]/B[i,j]
     */
    Mat operator/(const Mat &A, const Mat &B)
    {
        if (A.data == nullptr || B.data == nullptr)
        {
            std::cerr << "[Error] operator/: null matrix data pointer.\n";
            return Mat(0, 0);
        }
        if (A.row <= 0 || A.col <= 0 || B.row <= 0 || B.col <= 0)
        {
            std::cerr << "[Error] operator/: invalid matrix dimensions.\n";
            return Mat(0, 0);
        }
        if ((A.row != B.row) || (A.col != B.col))
        {
            std::cerr << "[Error] operator/: matrices do not have equal dimensions ("
                      << A.row << "x" << A.col << " vs "
                      << B.row << "x" << B.col << ").\n";
            return Mat(0, 0);
        }

        Mat temp(A.row, A.col);
        if (temp.data == nullptr)
        {
            std::cerr << "[Error] operator/: failed to allocate result.\n";
            return Mat(0, 0);
        }

        const int Astep = A.step;
        const int Bstep = B.step;
        const int Tstep = temp.step;
        const float* const Adata = A.data;
        const float* const Bdata = B.data;
        float* const       Tdata = temp.data;
        const int rows = A.row;
        const int cols = A.col;

        for (int r = 0; r < rows; ++r)
        {
            const float* row_A = Adata + r * Astep;
            const float* row_B = Bdata + r * Bstep;
            float*       row_T = Tdata + r * Tstep;
            for (int c = 0; c < cols; ++c)
            {
                if (!(fabsf(row_B[c]) > TINY_MATH_MIN_POSITIVE_INPUT_F32))
                {
                    std::cerr << "[Error] operator/: division by zero at ("
                              << r << ", " << c << ").\n";
                    return Mat(0, 0);
                }
                row_T[c] = row_A[c] / row_B[c];
            }
        }
        return temp;
    }


    /**
     * == operator, compare two matrices
     *
     * @param[in] A: Input matrix A
     * @param[in] B: Input matrix B
     *
     * @return
     *      - true if matrices are the same
     *      - false if matrices are different
     */
    bool operator==(const Mat &m1, const Mat &m2)
    {
        // Shape mismatch -> not equal.
        if ((m1.col != m2.col) || (m1.row != m2.row))
        {
            return false;
        }
        // Two empty matrices of the same shape are equal.
        if (m1.row == 0 || m1.col == 0)
        {
            return true;
        }
        // Pointer hazards: a sized matrix with a null buffer cannot be
        // compared meaningfully. Treat as not-equal rather than UB.
        if (m1.data == nullptr || m2.data == nullptr)
        {
            return false;
        }

        // Use `!(diff <= epsilon)` so that NaN on either side falls through
        // to "not equal" (NaN compares unordered against everything).
        // This is the IEEE-correct semantics that `diff > epsilon` misses.
        constexpr float epsilon = 1e-5f;
        const int s1 = m1.step;
        const int s2 = m2.step;
        const float* const d1 = m1.data;
        const float* const d2 = m2.data;
        const int rows = m1.row;
        const int cols = m1.col;

        for (int r = 0; r < rows; ++r)
        {
            const float* row1 = d1 + r * s1;
            const float* row2 = d2 + r * s2;
            for (int c = 0; c < cols; ++c)
            {
                const float diff = fabsf(row1[c] - row2[c]);
                if (!(diff <= epsilon))
                {
                    return false;
                }
            }
        }
        return true;
    }
}