stdlib_intrinsics_dot_product.fypp Source File


Source Code

#:include "common.fypp"
#:set I_KINDS_TYPES = list(zip(INT_KINDS, INT_TYPES, INT_KINDS))
#:set R_KINDS_TYPES = list(zip(REAL_KINDS, REAL_TYPES, REAL_SUFFIX))
#:set C_KINDS_TYPES = list(zip(CMPLX_KINDS, CMPLX_TYPES, CMPLX_SUFFIX))

#:def cnjg(type,expression)
#:if 'complex' in type
conjg(${expression}$)
#:else
${expression}$
#:endif
#:enddef

submodule(stdlib_intrinsics) stdlib_intrinsics_dot_product
    !!Replacement for certain Fortran intrinsic functions offering either faster and/or more accurate implementations.
    !! ([Specification](../page/specs/stdlib_intrinsics.html))
    use stdlib_kinds
    use stdlib_constants
    implicit none

    integer, parameter :: ilp = int64
    
contains
! This implementation is based on https://github.com/jalvesz/fast_math
#:for k, t, s in I_KINDS_TYPES + R_KINDS_TYPES + C_KINDS_TYPES
pure module function stdlib_dot_product_${s}$(a,b) result(p)
    integer(ilp), parameter :: chunk = 64
    ${t}$, intent(in) :: a(:)
    ${t}$, intent(in) :: b(:)
    ${t}$ :: p
    ${t}$ :: abatch(chunk)
    integer(ilp) :: i, n, r
    ! -----------------------------
    n = size(a,kind=ilp)
    r = mod(n,chunk)

    abatch(1:r)       = a(1:r)*${cnjg(t,'b(1:r)')}$
    abatch(r+1:chunk) = zero_${s}$
    do i = r+1, n-r, chunk
        abatch(1:chunk) = abatch(1:chunk) + a(i:i+chunk-1)*${cnjg(t,'b(i:i+chunk-1)')}$
    end do

    p = zero_${s}$
    do i = 1, chunk/2
        p = p + abatch(i)+abatch(chunk/2+i)
    end do
end function
#:endfor

#:for k, t, s in R_KINDS_TYPES + C_KINDS_TYPES
pure module function stdlib_dot_product_kahan_${s}$(a,b) result(p)
    integer(ilp), parameter :: chunk = 64
    ${t}$, intent(in) :: a(:)
    ${t}$, intent(in) :: b(:)
    ${t}$ :: p
    ${t}$ :: abatch(chunk)
    ${t}$ :: cbatch(chunk)
    integer(ilp) :: i, n, r
    ! -----------------------------
    n = size(a,kind=ilp)
    r = mod(n,chunk)

    abatch(1:r)       = a(1:r)*${cnjg(t,'b(1:r)')}$
    abatch(r+1:chunk) = zero_${s}$
    cbatch = zero_${s}$
    do i = r+1, n-r, chunk
        call kahan_kernel( a(i:i+chunk-1)*${cnjg(t,'b(i:i+chunk-1)')}$ , abatch(1:chunk) , cbatch(1:chunk) )
    end do     

    p = zero_${s}$
    do i = 1, chunk
        call kahan_kernel( abatch(i) , p , cbatch(i) )
    end do      
end function
#:endfor

end submodule stdlib_intrinsics_dot_product