stdlib_lapack_eigv_svd_drivers2.fypp Source File


Source Code

#:include "common.fypp" 
submodule(stdlib_lapack_eig_svd_lsq) stdlib_lapack_eigv_svd_drivers2
  implicit none


  contains
#:for ik,it,ii in LINALG_INT_KINDS_TYPES

     module subroutine stdlib${ii}$_sgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, iwork, info )
     !! SGESDD computes the singular value decomposition (SVD) of a real
     !! M-by-N matrix A, optionally computing the left and right singular
     !! vectors.  If singular vectors are desired, it uses a
     !! divide-and-conquer algorithm.
     !! The SVD is written
     !! A = U * SIGMA * transpose(V)
     !! where SIGMA is an M-by-N matrix which is zero except for its
     !! min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
     !! V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
     !! are the singular values of A; they are real and non-negative, and
     !! are returned in descending order.  The first min(m,n) columns of
     !! U and V are the left and right singular vectors of A.
     !! Note that the routine returns VT = V**T, not V.
     !! The divide and conquer algorithm makes very mild assumptions about
     !! floating point arithmetic. It will work on machines with a guard
     !! digit in add/subtract, or on those binary machines without guard
     !! digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
     !! Cray-2. It could conceivably fail on hexadecimal or decimal machines
     !! without guard digits, but we know of none.
               
        ! -- lapack driver routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_sp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobz
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lda, ldu, ldvt, lwork, m, n
           ! Array Arguments 
           integer(${ik}$), intent(out) :: iwork(*)
           real(sp), intent(inout) :: a(lda,*)
           real(sp), intent(out) :: s(*), u(ldu,*), vt(ldvt,*), work(*)
        ! =====================================================================
           
           ! Local Scalars 
           logical(lk) :: lquery, wntqa, wntqas, wntqn, wntqo, wntqs
           integer(${ik}$) :: bdspac, blk, chunk, i, ie, ierr, il, ir, iscl, itau, itaup, itauq, iu, &
                     ivt, ldwkvt, ldwrkl, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, nwork, wrkbl
           integer(${ik}$) :: lwork_sgebrd_mn, lwork_sgebrd_mm, lwork_sgebrd_nn, lwork_sgelqf_mn, &
           lwork_sgeqrf_mn, lwork_sorgbr_p_mm, lwork_sorgbr_q_nn, lwork_sorglq_mn, &
           lwork_sorglq_nn, lwork_sorgqr_mm, lwork_sorgqr_mn, lwork_sormbr_prt_mm, &
           lwork_sormbr_qln_mm, lwork_sormbr_prt_mn, lwork_sormbr_qln_mn, lwork_sormbr_prt_nn, &
                     lwork_sormbr_qln_nn
           real(sp) :: anrm, bignum, eps, smlnum
           ! Local Arrays 
           integer(${ik}$) :: idum(1_${ik}$)
           real(sp) :: dum(1_${ik}$)
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test the input arguments
           info   = 0_${ik}$
           minmn  = min( m, n )
           wntqa  = stdlib_lsame( jobz, 'A' )
           wntqs  = stdlib_lsame( jobz, 'S' )
           wntqas = wntqa .or. wntqs
           wntqo  = stdlib_lsame( jobz, 'O' )
           wntqn  = stdlib_lsame( jobz, 'N' )
           lquery = ( lwork==-1_${ik}$ )
           if( .not.( wntqa .or. wntqs .or. wntqo .or. wntqn ) ) then
              info = -1_${ik}$
           else if( m<0_${ik}$ ) then
              info = -2_${ik}$
           else if( n<0_${ik}$ ) then
              info = -3_${ik}$
           else if( lda<max( 1_${ik}$, m ) ) then
              info = -5_${ik}$
           else if( ldu<1_${ik}$ .or. ( wntqas .and. ldu<m ) .or.( wntqo .and. m<n .and. ldu<m ) ) &
                     then
              info = -8_${ik}$
           else if( ldvt<1_${ik}$ .or. ( wntqa .and. ldvt<n ) .or.( wntqs .and. ldvt<minmn ) .or.( wntqo &
                     .and. m>=n .and. ldvt<n ) ) then
              info = -10_${ik}$
           end if
           ! compute workspace
             ! note: comments in the code beginning "workspace:" describe the
             ! minimal amount of workspace allocated at that point in the code,
             ! as well as the preferred amount for good performance.
             ! nb refers to the optimal block size for the immediately
             ! following subroutine, as returned by stdlib${ii}$_ilaenv.
           if( info==0_${ik}$ ) then
              minwrk = 1_${ik}$
              maxwrk = 1_${ik}$
              bdspac = 0_${ik}$
              mnthr  = int( minmn*11.0_sp / 6.0_sp,KIND=${ik}$)
              if( m>=n .and. minmn>0_${ik}$ ) then
                 ! compute space needed for stdlib${ii}$_sbdsdc
                 if( wntqn ) then
                    ! stdlib${ii}$_sbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_sp)
                    ! keep 7*n for backwards compatibility.
                    bdspac = 7_${ik}$*n
                 else
                    bdspac = 3_${ik}$*n*n + 4_${ik}$*n
                 end if
                 ! compute space preferred for each routine
                 call stdlib${ii}$_sgebrd( m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, &
                           ierr )
                 lwork_sgebrd_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sgebrd( n, n, dum(1_${ik}$), n, dum(1_${ik}$), dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, &
                           ierr )
                 lwork_sgebrd_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sgeqrf( m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_sgeqrf_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sorgbr( 'Q', n, n, n, dum(1_${ik}$), n, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$,ierr )
                 lwork_sorgbr_q_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sorgqr( m, m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_sorgqr_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sorgqr( m, n, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_sorgqr_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sormbr( 'P', 'R', 'T', n, n, n, dum(1_${ik}$), n,dum(1_${ik}$), dum(1_${ik}$), n, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_sormbr_prt_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sormbr( 'Q', 'L', 'N', n, n, n, dum(1_${ik}$), n,dum(1_${ik}$), dum(1_${ik}$), n, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_sormbr_qln_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, n, n, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_sormbr_qln_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, m, n, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_sormbr_qln_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 if( m>=mnthr ) then
                    if( wntqn ) then
                       ! path 1 (m >> n, jobz='n')
                       wrkbl = n + lwork_sgeqrf_mn
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sgebrd_nn )
                       maxwrk = max( wrkbl, bdspac + n )
                       minwrk = bdspac + n
                    else if( wntqo ) then
                       ! path 2 (m >> n, jobz='o')
                       wrkbl = n + lwork_sgeqrf_mn
                       wrkbl = max( wrkbl,   n + lwork_sorgqr_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sgebrd_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_qln_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + 2_${ik}$*n*n
                       minwrk = bdspac + 2_${ik}$*n*n + 3_${ik}$*n
                    else if( wntqs ) then
                       ! path 3 (m >> n, jobz='s')
                       wrkbl = n + lwork_sgeqrf_mn
                       wrkbl = max( wrkbl,   n + lwork_sorgqr_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sgebrd_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_qln_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + n*n
                       minwrk = bdspac + n*n + 3_${ik}$*n
                    else if( wntqa ) then
                       ! path 4 (m >> n, jobz='a')
                       wrkbl = n + lwork_sgeqrf_mn
                       wrkbl = max( wrkbl,   n + lwork_sorgqr_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sgebrd_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_qln_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + n*n
                       minwrk = n*n + max( 3_${ik}$*n + bdspac, n + m )
                    end if
                 else
                    ! path 5 (m >= n, but not much larger)
                    wrkbl = 3_${ik}$*n + lwork_sgebrd_mn
                    if( wntqn ) then
                       ! path 5n (m >= n, jobz='n')
                       maxwrk = max( wrkbl, 3_${ik}$*n + bdspac )
                       minwrk = 3_${ik}$*n + max( m, bdspac )
                    else if( wntqo ) then
                       ! path 5o (m >= n, jobz='o')
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_qln_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + m*n
                       minwrk = 3_${ik}$*n + max( m, n*n + bdspac )
                    else if( wntqs ) then
                       ! path 5s (m >= n, jobz='s')
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_qln_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_prt_nn )
                       maxwrk = max( wrkbl, 3_${ik}$*n + bdspac )
                       minwrk = 3_${ik}$*n + max( m, bdspac )
                    else if( wntqa ) then
                       ! path 5a (m >= n, jobz='a')
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_sormbr_prt_nn )
                       maxwrk = max( wrkbl, 3_${ik}$*n + bdspac )
                       minwrk = 3_${ik}$*n + max( m, bdspac )
                    end if
                 end if
              else if( minmn>0_${ik}$ ) then
                 ! compute space needed for stdlib${ii}$_sbdsdc
                 if( wntqn ) then
                    ! stdlib${ii}$_sbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_sp)
                    ! keep 7*n for backwards compatibility.
                    bdspac = 7_${ik}$*m
                 else
                    bdspac = 3_${ik}$*m*m + 4_${ik}$*m
                 end if
                 ! compute space preferred for each routine
                 call stdlib${ii}$_sgebrd( m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, &
                           ierr )
                 lwork_sgebrd_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sgebrd( m, m, a, m, s, dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                           
                 lwork_sgebrd_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sgelqf( m, n, a, m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_sgelqf_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sorglq( n, n, m, dum(1_${ik}$), n, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_sorglq_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sorglq( m, n, m, a, m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_sorglq_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sorgbr( 'P', m, m, m, a, n, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_sorgbr_p_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sormbr( 'P', 'R', 'T', m, m, m, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_sormbr_prt_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sormbr( 'P', 'R', 'T', m, n, m, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_sormbr_prt_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sormbr( 'P', 'R', 'T', n, n, m, dum(1_${ik}$), n,dum(1_${ik}$), dum(1_${ik}$), n, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_sormbr_prt_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, m, m, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_sormbr_qln_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 if( n>=mnthr ) then
                    if( wntqn ) then
                       ! path 1t (n >> m, jobz='n')
                       wrkbl = m + lwork_sgelqf_mn
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sgebrd_mm )
                       maxwrk = max( wrkbl, bdspac + m )
                       minwrk = bdspac + m
                    else if( wntqo ) then
                       ! path 2t (n >> m, jobz='o')
                       wrkbl = m + lwork_sgelqf_mn
                       wrkbl = max( wrkbl,   m + lwork_sorglq_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sgebrd_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_prt_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + 2_${ik}$*m*m
                       minwrk = bdspac + 2_${ik}$*m*m + 3_${ik}$*m
                    else if( wntqs ) then
                       ! path 3t (n >> m, jobz='s')
                       wrkbl = m + lwork_sgelqf_mn
                       wrkbl = max( wrkbl,   m + lwork_sorglq_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sgebrd_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_prt_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + m*m
                       minwrk = bdspac + m*m + 3_${ik}$*m
                    else if( wntqa ) then
                       ! path 4t (n >> m, jobz='a')
                       wrkbl = m + lwork_sgelqf_mn
                       wrkbl = max( wrkbl,   m + lwork_sorglq_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sgebrd_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_prt_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + m*m
                       minwrk = m*m + max( 3_${ik}$*m + bdspac, m + n )
                    end if
                 else
                    ! path 5t (n > m, but not much larger)
                    wrkbl = 3_${ik}$*m + lwork_sgebrd_mn
                    if( wntqn ) then
                       ! path 5tn (n > m, jobz='n')
                       maxwrk = max( wrkbl, 3_${ik}$*m + bdspac )
                       minwrk = 3_${ik}$*m + max( n, bdspac )
                    else if( wntqo ) then
                       ! path 5to (n > m, jobz='o')
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_prt_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + m*n
                       minwrk = 3_${ik}$*m + max( n, m*m + bdspac )
                    else if( wntqs ) then
                       ! path 5ts (n > m, jobz='s')
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_prt_mn )
                       maxwrk = max( wrkbl, 3_${ik}$*m + bdspac )
                       minwrk = 3_${ik}$*m + max( n, bdspac )
                    else if( wntqa ) then
                       ! path 5ta (n > m, jobz='a')
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_sormbr_prt_nn )
                       maxwrk = max( wrkbl, 3_${ik}$*m + bdspac )
                       minwrk = 3_${ik}$*m + max( n, bdspac )
                    end if
                 end if
              end if
              maxwrk = max( maxwrk, minwrk )
              work( 1_${ik}$ ) = stdlib${ii}$_sroundup_lwork( maxwrk )
              if( lwork<minwrk .and. .not.lquery ) then
                 info = -12_${ik}$
              end if
           end if
           if( info/=0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'SGESDD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! quick return if possible
           if( m==0_${ik}$ .or. n==0_${ik}$ ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_slamch( 'P' )
           smlnum = sqrt( stdlib${ii}$_slamch( 'S' ) ) / eps
           bignum = one / smlnum
           ! scale a if max element outside range [smlnum,bignum]
           anrm = stdlib${ii}$_slange( 'M', m, n, a, lda, dum )
           if( stdlib${ii}$_sisnan( anrm ) ) then
               info = -4_${ik}$
               return
           end if
           iscl = 0_${ik}$
           if( anrm>zero .and. anrm<smlnum ) then
              iscl = 1_${ik}$
              call stdlib${ii}$_slascl( 'G', 0_${ik}$, 0_${ik}$, anrm, smlnum, m, n, a, lda, ierr )
           else if( anrm>bignum ) then
              iscl = 1_${ik}$
              call stdlib${ii}$_slascl( 'G', 0_${ik}$, 0_${ik}$, anrm, bignum, m, n, a, lda, ierr )
           end if
           if( m>=n ) then
              ! a has at least as many rows as columns. if a has sufficiently
              ! more rows than columns, first reduce using the qr
              ! decomposition (if sufficient workspace available)
              if( m>=mnthr ) then
                 if( wntqn ) then
                    ! path 1 (m >> n, jobz='n')
                    ! no singular vectors to be computed
                    itau = 1_${ik}$
                    nwork = itau + n
                    ! compute a=q*r
                    ! workspace: need   n [tau] + n    [work]
                    ! workspace: prefer n [tau] + n*nb [work]
                    call stdlib${ii}$_sgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! zero out below r
                    if (n>1_${ik}$) call stdlib${ii}$_slaset( 'L', n-1, n-1, zero, zero, a( 2_${ik}$, 1_${ik}$ ), lda )
                    ie = 1_${ik}$
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in a
                    ! workspace: need   3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_sgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                              work( nwork ), lwork-nwork+1,ierr )
                    nwork = ie + n
                    ! perform bidiagonal svd, computing singular values only
                    ! workspace: need   n [e] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'N', n, s, work( ie ), dum, 1_${ik}$, dum, 1_${ik}$,dum, idum, &
                              work( nwork ), iwork, info )
                 else if( wntqo ) then
                    ! path 2 (m >> n, jobz = 'o')
                    ! n left singular vectors to be overwritten on a and
                    ! n right singular vectors to be computed in vt
                    ir = 1_${ik}$
                    ! work(ir) is ldwrkr by n
                    if( lwork >= lda*n + n*n + 3_${ik}$*n + bdspac ) then
                       ldwrkr = lda
                    else
                       ldwrkr = ( lwork - n*n - 3_${ik}$*n - bdspac ) / n
                    end if
                    itau = ir + ldwrkr*n
                    nwork = itau + n
                    ! compute a=q*r
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_sgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! copy r to work(ir), zeroing out below it
                    call stdlib${ii}$_slacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
                    call stdlib${ii}$_slaset( 'L', n - 1_${ik}$, n - 1_${ik}$, zero, zero, work(ir+1),ldwrkr )
                    ! generate q in a
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_sorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ie = itau
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in work(ir)
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_sgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
                              work( itaup ), work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! work(iu) is n by n
                    iu = nwork
                    nwork = iu + n*n
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in work(iu) and computing right
                    ! singular vectors of bidiagonal matrix in vt
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
                              idum, work( nwork ), iwork,info )
                    ! overwrite work(iu) by left singular vectors of r
                    ! and vt by right singular vectors of r
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n    [work]
                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
                              work( iu ), n, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
                              vt, ldvt, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! multiply q in a by left singular vectors of r in
                    ! work(iu), storing result in work(ir) and copying to a
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u]
                    ! workspace: prefer m*n [r] + 3*n [e, tauq, taup] + n*n [u]
                    do i = 1, m, ldwrkr
                       chunk = min( m - i + 1_${ik}$, ldwrkr )
                       call stdlib${ii}$_sgemm( 'N', 'N', chunk, n, n, one, a( i, 1_${ik}$ ),lda, work( iu ), &
                                 n, zero, work( ir ),ldwrkr )
                       call stdlib${ii}$_slacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1_${ik}$ ), lda )
                                 
                    end do
                 else if( wntqs ) then
                    ! path 3 (m >> n, jobz='s')
                    ! n left singular vectors to be computed in u and
                    ! n right singular vectors to be computed in vt
                    ir = 1_${ik}$
                    ! work(ir) is n by n
                    ldwrkr = n
                    itau = ir + ldwrkr*n
                    nwork = itau + n
                    ! compute a=q*r
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_sgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! copy r to work(ir), zeroing out below it
                    call stdlib${ii}$_slacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
                    call stdlib${ii}$_slaset( 'L', n - 1_${ik}$, n - 1_${ik}$, zero, zero, work(ir+1),ldwrkr )
                    ! generate q in a
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_sorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ie = itau
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in work(ir)
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_sgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
                              work( itaup ), work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagoal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! overwrite u by left singular vectors of r and vt
                    ! by right singular vectors of r
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n    [work]
                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
                              u, ldu, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
                              vt, ldvt, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! multiply q in a by left singular vectors of r in
                    ! work(ir), storing result in u
                    ! workspace: need   n*n [r]
                    call stdlib${ii}$_slacpy( 'F', n, n, u, ldu, work( ir ), ldwrkr )
                    call stdlib${ii}$_sgemm( 'N', 'N', m, n, n, one, a, lda, work( ir ),ldwrkr, zero, u,&
                               ldu )
                 else if( wntqa ) then
                    ! path 4 (m >> n, jobz='a')
                    ! m left singular vectors to be computed in u and
                    ! n right singular vectors to be computed in vt
                    iu = 1_${ik}$
                    ! work(iu) is n by n
                    ldwrku = n
                    itau = iu + ldwrku*n
                    nwork = itau + n
                    ! compute a=q*r, copying result to u
                    ! workspace: need   n*n [u] + n [tau] + n    [work]
                    ! workspace: prefer n*n [u] + n [tau] + n*nb [work]
                    call stdlib${ii}$_sgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    call stdlib${ii}$_slacpy( 'L', m, n, a, lda, u, ldu )
                    ! generate q in u
                    ! workspace: need   n*n [u] + n [tau] + m    [work]
                    ! workspace: prefer n*n [u] + n [tau] + m*nb [work]
                    call stdlib${ii}$_sorgqr( m, m, n, u, ldu, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ! produce r in a, zeroing out other entries
                    if (n>1_${ik}$) call stdlib${ii}$_slaset( 'L', n-1, n-1, zero, zero, a( 2_${ik}$, 1_${ik}$ ), lda )
                    ie = itau
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in a
                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_sgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                              work( nwork ), lwork-nwork+1,ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in work(iu) and computing right
                    ! singular vectors of bidiagonal matrix in vt
                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
                              idum, work( nwork ), iwork,info )
                    ! overwrite work(iu) by left singular vectors of r and vt
                    ! by right singular vectors of r
                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n    [work]
                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + n*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', n, n, n, a, lda,work( itauq ), work( iu ), &
                              ldwrku,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! multiply q in u by left singular vectors of r in
                    ! work(iu), storing result in a
                    ! workspace: need   n*n [u]
                    call stdlib${ii}$_sgemm( 'N', 'N', m, n, n, one, u, ldu, work( iu ),ldwrku, zero, a,&
                               lda )
                    ! copy left singular vectors of a from a to u
                    call stdlib${ii}$_slacpy( 'F', m, n, a, lda, u, ldu )
                 end if
              else
                 ! m < mnthr
                 ! path 5 (m >= n, but not much larger)
                 ! reduce to bidiagonal form without qr decomposition
                 ie = 1_${ik}$
                 itauq = ie + n
                 itaup = itauq + n
                 nwork = itaup + n
                 ! bidiagonalize a
                 ! workspace: need   3*n [e, tauq, taup] + m        [work]
                 ! workspace: prefer 3*n [e, tauq, taup] + (m+n)*nb [work]
                 call stdlib${ii}$_sgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                           work( nwork ), lwork-nwork+1,ierr )
                 if( wntqn ) then
                    ! path 5n (m >= n, jobz='n')
                    ! perform bidiagonal svd, only computing singular values
                    ! workspace: need   3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'N', n, s, work( ie ), dum, 1_${ik}$, dum, 1_${ik}$,dum, idum, &
                              work( nwork ), iwork, info )
                 else if( wntqo ) then
                    ! path 5o (m >= n, jobz='o')
                    iu = nwork
                    if( lwork >= m*n + 3_${ik}$*n + bdspac ) then
                       ! work( iu ) is m by n
                       ldwrku = m
                       nwork = iu + ldwrku*n
                       call stdlib${ii}$_slaset( 'F', m, n, zero, zero, work( iu ),ldwrku )
                       ! ir is unused; silence compile warnings
                       ir = -1_${ik}$
                    else
                       ! work( iu ) is n by n
                       ldwrku = n
                       nwork = iu + ldwrku*n
                       ! work(ir) is ldwrkr by n
                       ir = nwork
                       ldwrkr = ( lwork - n*n - 3_${ik}$*n ) / n
                    end if
                    nwork = iu + ldwrku*n
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in work(iu) and computing right
                    ! singular vectors of bidiagonal matrix in vt
                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'I', n, s, work( ie ), work( iu ),ldwrku, vt, ldvt, &
                              dum, idum, work( nwork ),iwork, info )
                    ! overwrite vt by right singular vectors of a
                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
                    ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    if( lwork >= m*n + 3_${ik}$*n + bdspac ) then
                       ! path 5o-fast
                       ! overwrite work(iu) by left singular vectors of a
                       ! workspace: need   3*n [e, tauq, taup] + m*n [u] + n    [work]
                       ! workspace: prefer 3*n [e, tauq, taup] + m*n [u] + n*nb [work]
                       call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), work( iu &
                                 ), ldwrku,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                       ! copy left singular vectors of a from work(iu) to a
                       call stdlib${ii}$_slacpy( 'F', m, n, work( iu ), ldwrku, a, lda )
                    else
                       ! path 5o-slow
                       ! generate q in a
                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
                       call stdlib${ii}$_sorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), &
                                 lwork - nwork + 1_${ik}$, ierr )
                       ! multiply q in a by left singular vectors of
                       ! bidiagonal matrix in work(iu), storing result in
                       ! work(ir) and copying to a
                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + nb*n [r]
                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + m*n  [r]
                       do i = 1, m, ldwrkr
                          chunk = min( m - i + 1_${ik}$, ldwrkr )
                          call stdlib${ii}$_sgemm( 'N', 'N', chunk, n, n, one, a( i, 1_${ik}$ ),lda, work( iu )&
                                    , ldwrku, zero,work( ir ), ldwrkr )
                          call stdlib${ii}$_slacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1_${ik}$ ), lda )
                                    
                       end do
                    end if
                 else if( wntqs ) then
                    ! path 5s (m >= n, jobz='s')
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_slaset( 'F', m, n, zero, zero, u, ldu )
                    call stdlib${ii}$_sbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! overwrite u by left singular vectors of a and vt
                    ! by right singular vectors of a
                    ! workspace: need   3*n [e, tauq, taup] + n    [work]
                    ! workspace: prefer 3*n [e, tauq, taup] + n*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                 else if( wntqa ) then
                    ! path 5a (m >= n, jobz='a')
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_slaset( 'F', m, m, zero, zero, u, ldu )
                    call stdlib${ii}$_sbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! set the right corner of u to identity matrix
                    if( m>n ) then
                       call stdlib${ii}$_slaset( 'F', m - n, m - n, zero, one, u(n+1,n+1),ldu )
                    end if
                    ! overwrite u by left singular vectors of a and vt
                    ! by right singular vectors of a
                    ! workspace: need   3*n [e, tauq, taup] + m    [work]
                    ! workspace: prefer 3*n [e, tauq, taup] + m*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                 end if
              end if
           else
              ! a has more columns than rows. if a has sufficiently more
              ! columns than rows, first reduce using the lq decomposition (if
              ! sufficient workspace available)
              if( n>=mnthr ) then
                 if( wntqn ) then
                    ! path 1t (n >> m, jobz='n')
                    ! no singular vectors to be computed
                    itau = 1_${ik}$
                    nwork = itau + m
                    ! compute a=l*q
                    ! workspace: need   m [tau] + m [work]
                    ! workspace: prefer m [tau] + m*nb [work]
                    call stdlib${ii}$_sgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! zero out above l
                    if (m>1_${ik}$) call stdlib${ii}$_slaset( 'U', m-1, m-1, zero, zero, a( 1_${ik}$, 2_${ik}$ ), lda )
                    ie = 1_${ik}$
                    itauq = ie + m
                    itaup = itauq + m
                    nwork = itaup + m
                    ! bidiagonalize l in a
                    ! workspace: need   3*m [e, tauq, taup] + m      [work]
                    ! workspace: prefer 3*m [e, tauq, taup] + 2*m*nb [work]
                    call stdlib${ii}$_sgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                              work( nwork ), lwork-nwork+1,ierr )
                    nwork = ie + m
                    ! perform bidiagonal svd, computing singular values only
                    ! workspace: need   m [e] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'N', m, s, work( ie ), dum, 1_${ik}$, dum, 1_${ik}$,dum, idum, &
                              work( nwork ), iwork, info )
                 else if( wntqo ) then
                    ! path 2t (n >> m, jobz='o')
                    ! m right singular vectors to be overwritten on a and
                    ! m left singular vectors to be computed in u
                    ivt = 1_${ik}$
                    ! work(ivt) is m by m
                    ! work(il)  is m by m; it is later resized to m by chunk for gemm
                    il = ivt + m*m
                    if( lwork >= m*n + m*m + 3_${ik}$*m + bdspac ) then
                       ldwrkl = m
                       chunk = n
                    else
                       ldwrkl = m
                       chunk = ( lwork - m*m ) / m
                    end if
                    itau = il + ldwrkl*m
                    nwork = itau + m
                    ! compute a=l*q
                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
                    call stdlib${ii}$_sgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! copy l to work(il), zeroing about above it
                    call stdlib${ii}$_slacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
                    call stdlib${ii}$_slaset( 'U', m - 1_${ik}$, m - 1_${ik}$, zero, zero,work( il + ldwrkl ), ldwrkl &
                              )
                    ! generate q in a
                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
                    call stdlib${ii}$_sorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ie = itau
                    itauq = ie + m
                    itaup = itauq + m
                    nwork = itaup + m
                    ! bidiagonalize l in work(il)
                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m      [work]
                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
                    call stdlib${ii}$_sgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
                              work( itaup ), work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u, and computing right singular
                    ! vectors of bidiagonal matrix in work(ivt)
                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), m, dum, &
                              idum, work( nwork ),iwork, info )
                    ! overwrite u by left singular vectors of l and work(ivt)
                    ! by right singular vectors of l
                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m    [work]
                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
                              u, ldu, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
                              work( ivt ), m,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                    ! multiply right singular vectors of l in work(ivt) by q
                    ! in a, storing result in work(il) and copying to a
                    ! workspace: need   m*m [vt] + m*m [l]
                    ! workspace: prefer m*m [vt] + m*n [l]
                    ! at this point, l is resized as m by chunk.
                    do i = 1, n, chunk
                       blk = min( n - i + 1_${ik}$, chunk )
                       call stdlib${ii}$_sgemm( 'N', 'N', m, blk, m, one, work( ivt ), m,a( 1_${ik}$, i ), lda,&
                                  zero, work( il ), ldwrkl )
                       call stdlib${ii}$_slacpy( 'F', m, blk, work( il ), ldwrkl,a( 1_${ik}$, i ), lda )
                                 
                    end do
                 else if( wntqs ) then
                    ! path 3t (n >> m, jobz='s')
                    ! m right singular vectors to be computed in vt and
                    ! m left singular vectors to be computed in u
                    il = 1_${ik}$
                    ! work(il) is m by m
                    ldwrkl = m
                    itau = il + ldwrkl*m
                    nwork = itau + m
                    ! compute a=l*q
                    ! workspace: need   m*m [l] + m [tau] + m    [work]
                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
                    call stdlib${ii}$_sgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! copy l to work(il), zeroing out above it
                    call stdlib${ii}$_slacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
                    call stdlib${ii}$_slaset( 'U', m - 1_${ik}$, m - 1_${ik}$, zero, zero,work( il + ldwrkl ), ldwrkl &
                              )
                    ! generate q in a
                    ! workspace: need   m*m [l] + m [tau] + m    [work]
                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
                    call stdlib${ii}$_sorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ie = itau
                    itauq = ie + m
                    itaup = itauq + m
                    nwork = itaup + m
                    ! bidiagonalize l in work(iu).
                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m      [work]
                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
                    call stdlib${ii}$_sgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
                              work( itaup ), work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! overwrite u by left singular vectors of l and vt
                    ! by right singular vectors of l
                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m    [work]
                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
                              u, ldu, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
                              vt, ldvt, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! multiply right singular vectors of l in work(il) by
                    ! q in a, storing result in vt
                    ! workspace: need   m*m [l]
                    call stdlib${ii}$_slacpy( 'F', m, m, vt, ldvt, work( il ), ldwrkl )
                    call stdlib${ii}$_sgemm( 'N', 'N', m, n, m, one, work( il ), ldwrkl,a, lda, zero, &
                              vt, ldvt )
                 else if( wntqa ) then
                    ! path 4t (n >> m, jobz='a')
                    ! n right singular vectors to be computed in vt and
                    ! m left singular vectors to be computed in u
                    ivt = 1_${ik}$
                    ! work(ivt) is m by m
                    ldwkvt = m
                    itau = ivt + ldwkvt*m
                    nwork = itau + m
                    ! compute a=l*q, copying result to vt
                    ! workspace: need   m*m [vt] + m [tau] + m    [work]
                    ! workspace: prefer m*m [vt] + m [tau] + m*nb [work]
                    call stdlib${ii}$_sgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    call stdlib${ii}$_slacpy( 'U', m, n, a, lda, vt, ldvt )
                    ! generate q in vt
                    ! workspace: need   m*m [vt] + m [tau] + n    [work]
                    ! workspace: prefer m*m [vt] + m [tau] + n*nb [work]
                    call stdlib${ii}$_sorglq( n, n, m, vt, ldvt, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ! produce l in a, zeroing out other entries
                    if (m>1_${ik}$) call stdlib${ii}$_slaset( 'U', m-1, m-1, zero, zero, a( 1_${ik}$, 2_${ik}$ ), lda )
                    ie = itau
                    itauq = ie + m
                    itaup = itauq + m
                    nwork = itaup + m
                    ! bidiagonalize l in a
                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + m      [work]
                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup] + 2*m*nb [work]
                    call stdlib${ii}$_sgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                              work( nwork ), lwork-nwork+1,ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in work(ivt)
                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_sbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
                              dum, idum,work( nwork ), iwork, info )
                    ! overwrite u by left singular vectors of l and work(ivt)
                    ! by right singular vectors of l
                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup]+ m    [work]
                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup]+ m*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, m, m, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', m, m, m, a, lda,work( itaup ), work( ivt ),&
                               ldwkvt,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                    ! multiply right singular vectors of l in work(ivt) by
                    ! q in vt, storing result in a
                    ! workspace: need   m*m [vt]
                    call stdlib${ii}$_sgemm( 'N', 'N', m, n, m, one, work( ivt ), ldwkvt,vt, ldvt, zero,&
                               a, lda )
                    ! copy right singular vectors of a from a to vt
                    call stdlib${ii}$_slacpy( 'F', m, n, a, lda, vt, ldvt )
                 end if
              else
                 ! n < mnthr
                 ! path 5t (n > m, but not much larger)
                 ! reduce to bidiagonal form without lq decomposition
                 ie = 1_${ik}$
                 itauq = ie + m
                 itaup = itauq + m
                 nwork = itaup + m
                 ! bidiagonalize a
                 ! workspace: need   3*m [e, tauq, taup] + n        [work]
                 ! workspace: prefer 3*m [e, tauq, taup] + (m+n)*nb [work]
                 call stdlib${ii}$_sgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                           work( nwork ), lwork-nwork+1,ierr )
                 if( wntqn ) then
                    ! path 5tn (n > m, jobz='n')
                    ! perform bidiagonal svd, only computing singular values
                    ! workspace: need   3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_sbdsdc( 'L', 'N', m, s, work( ie ), dum, 1_${ik}$, dum, 1_${ik}$,dum, idum, &
                              work( nwork ), iwork, info )
                 else if( wntqo ) then
                    ! path 5to (n > m, jobz='o')
                    ldwkvt = m
                    ivt = nwork
                    if( lwork >= m*n + 3_${ik}$*m + bdspac ) then
                       ! work( ivt ) is m by n
                       call stdlib${ii}$_slaset( 'F', m, n, zero, zero, work( ivt ),ldwkvt )
                       nwork = ivt + ldwkvt*n
                       ! il is unused; silence compile warnings
                       il = -1_${ik}$
                    else
                       ! work( ivt ) is m by m
                       nwork = ivt + ldwkvt*m
                       il = nwork
                       ! work(il) is m by chunk
                       chunk = ( lwork - m*m - 3_${ik}$*m ) / m
                    end if
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in work(ivt)
                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + bdspac
                    call stdlib${ii}$_sbdsdc( 'L', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
                              dum, idum,work( nwork ), iwork, info )
                    ! overwrite u by left singular vectors of a
                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
                    ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    if( lwork >= m*n + 3_${ik}$*m + bdspac ) then
                       ! path 5to-fast
                       ! overwrite work(ivt) by left singular vectors of a
                       ! workspace: need   3*m [e, tauq, taup] + m*n [vt] + m    [work]
                       ! workspace: prefer 3*m [e, tauq, taup] + m*n [vt] + m*nb [work]
                       call stdlib${ii}$_sormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), work( &
                                 ivt ), ldwkvt,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                       ! copy right singular vectors of a from work(ivt) to a
                       call stdlib${ii}$_slacpy( 'F', m, n, work( ivt ), ldwkvt, a, lda )
                    else
                       ! path 5to-slow
                       ! generate p**t in a
                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
                       call stdlib${ii}$_sorgbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), &
                                 lwork - nwork + 1_${ik}$, ierr )
                       ! multiply q in a by right singular vectors of
                       ! bidiagonal matrix in work(ivt), storing result in
                       ! work(il) and copying to a
                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m*nb [l]
                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*n  [l]
                       do i = 1, n, chunk
                          blk = min( n - i + 1_${ik}$, chunk )
                          call stdlib${ii}$_sgemm( 'N', 'N', m, blk, m, one, work( ivt ),ldwkvt, a( 1_${ik}$, &
                                    i ), lda, zero,work( il ), m )
                          call stdlib${ii}$_slacpy( 'F', m, blk, work( il ), m, a( 1_${ik}$, i ),lda )
                       end do
                    end if
                 else if( wntqs ) then
                    ! path 5ts (n > m, jobz='s')
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_slaset( 'F', m, n, zero, zero, vt, ldvt )
                    call stdlib${ii}$_sbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! overwrite u by left singular vectors of a and vt
                    ! by right singular vectors of a
                    ! workspace: need   3*m [e, tauq, taup] + m    [work]
                    ! workspace: prefer 3*m [e, tauq, taup] + m*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                 else if( wntqa ) then
                    ! path 5ta (n > m, jobz='a')
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_slaset( 'F', n, n, zero, zero, vt, ldvt )
                    call stdlib${ii}$_sbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! set the right corner of vt to identity matrix
                    if( n>m ) then
                       call stdlib${ii}$_slaset( 'F', n-m, n-m, zero, one, vt(m+1,m+1),ldvt )
                    end if
                    ! overwrite u by left singular vectors of a and vt
                    ! by right singular vectors of a
                    ! workspace: need   3*m [e, tauq, taup] + n    [work]
                    ! workspace: prefer 3*m [e, tauq, taup] + n*nb [work]
                    call stdlib${ii}$_sormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_sormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                 end if
              end if
           end if
           ! undo scaling if necessary
           if( iscl==1_${ik}$ ) then
              if( anrm>bignum )call stdlib${ii}$_slascl( 'G', 0_${ik}$, 0_${ik}$, bignum, anrm, minmn, 1_${ik}$, s, minmn,&
                        ierr )
              if( anrm<smlnum )call stdlib${ii}$_slascl( 'G', 0_${ik}$, 0_${ik}$, smlnum, anrm, minmn, 1_${ik}$, s, minmn,&
                        ierr )
           end if
           ! return optimal workspace in work(1)
           work( 1_${ik}$ ) = stdlib${ii}$_sroundup_lwork( maxwrk )
           return
     end subroutine stdlib${ii}$_sgesdd

     module subroutine stdlib${ii}$_dgesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, iwork, info )
     !! DGESDD computes the singular value decomposition (SVD) of a real
     !! M-by-N matrix A, optionally computing the left and right singular
     !! vectors.  If singular vectors are desired, it uses a
     !! divide-and-conquer algorithm.
     !! The SVD is written
     !! A = U * SIGMA * transpose(V)
     !! where SIGMA is an M-by-N matrix which is zero except for its
     !! min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
     !! V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
     !! are the singular values of A; they are real and non-negative, and
     !! are returned in descending order.  The first min(m,n) columns of
     !! U and V are the left and right singular vectors of A.
     !! Note that the routine returns VT = V**T, not V.
     !! The divide and conquer algorithm makes very mild assumptions about
     !! floating point arithmetic. It will work on machines with a guard
     !! digit in add/subtract, or on those binary machines without guard
     !! digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
     !! Cray-2. It could conceivably fail on hexadecimal or decimal machines
     !! without guard digits, but we know of none.
               
        ! -- lapack driver routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_dp, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobz
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lda, ldu, ldvt, lwork, m, n
           ! Array Arguments 
           integer(${ik}$), intent(out) :: iwork(*)
           real(dp), intent(inout) :: a(lda,*)
           real(dp), intent(out) :: s(*), u(ldu,*), vt(ldvt,*), work(*)
        ! =====================================================================
           
           ! Local Scalars 
           logical(lk) :: lquery, wntqa, wntqas, wntqn, wntqo, wntqs
           integer(${ik}$) :: bdspac, blk, chunk, i, ie, ierr, il, ir, iscl, itau, itaup, itauq, iu, &
                     ivt, ldwkvt, ldwrkl, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, nwork, wrkbl
           integer(${ik}$) :: lwork_dgebrd_mn, lwork_dgebrd_mm, lwork_dgebrd_nn, lwork_dgelqf_mn, &
           lwork_dgeqrf_mn, lwork_dorgbr_p_mm, lwork_dorgbr_q_nn, lwork_dorglq_mn, &
           lwork_dorglq_nn, lwork_dorgqr_mm, lwork_dorgqr_mn, lwork_dormbr_prt_mm, &
           lwork_dormbr_qln_mm, lwork_dormbr_prt_mn, lwork_dormbr_qln_mn, lwork_dormbr_prt_nn, &
                     lwork_dormbr_qln_nn
           real(dp) :: anrm, bignum, eps, smlnum
           ! Local Arrays 
           integer(${ik}$) :: idum(1_${ik}$)
           real(dp) :: dum(1_${ik}$)
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test the input arguments
           info   = 0_${ik}$
           minmn  = min( m, n )
           wntqa  = stdlib_lsame( jobz, 'A' )
           wntqs  = stdlib_lsame( jobz, 'S' )
           wntqas = wntqa .or. wntqs
           wntqo  = stdlib_lsame( jobz, 'O' )
           wntqn  = stdlib_lsame( jobz, 'N' )
           lquery = ( lwork==-1_${ik}$ )
           if( .not.( wntqa .or. wntqs .or. wntqo .or. wntqn ) ) then
              info = -1_${ik}$
           else if( m<0_${ik}$ ) then
              info = -2_${ik}$
           else if( n<0_${ik}$ ) then
              info = -3_${ik}$
           else if( lda<max( 1_${ik}$, m ) ) then
              info = -5_${ik}$
           else if( ldu<1_${ik}$ .or. ( wntqas .and. ldu<m ) .or.( wntqo .and. m<n .and. ldu<m ) ) &
                     then
              info = -8_${ik}$
           else if( ldvt<1_${ik}$ .or. ( wntqa .and. ldvt<n ) .or.( wntqs .and. ldvt<minmn ) .or.( wntqo &
                     .and. m>=n .and. ldvt<n ) ) then
              info = -10_${ik}$
           end if
           ! compute workspace
             ! note: comments in the code beginning "workspace:" describe the
             ! minimal amount of workspace allocated at that point in the code,
             ! as well as the preferred amount for good performance.
             ! nb refers to the optimal block size for the immediately
             ! following subroutine, as returned by stdlib${ii}$_ilaenv.
           if( info==0_${ik}$ ) then
              minwrk = 1_${ik}$
              maxwrk = 1_${ik}$
              bdspac = 0_${ik}$
              mnthr  = int( minmn*11.0_dp / 6.0_dp,KIND=${ik}$)
              if( m>=n .and. minmn>0_${ik}$ ) then
                 ! compute space needed for stdlib${ii}$_dbdsdc
                 if( wntqn ) then
                    ! stdlib${ii}$_dbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_dp)
                    ! keep 7*n for backwards compatibility.
                    bdspac = 7_${ik}$*n
                 else
                    bdspac = 3_${ik}$*n*n + 4_${ik}$*n
                 end if
                 ! compute space preferred for each routine
                 call stdlib${ii}$_dgebrd( m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, &
                           ierr )
                 lwork_dgebrd_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dgebrd( n, n, dum(1_${ik}$), n, dum(1_${ik}$), dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, &
                           ierr )
                 lwork_dgebrd_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dgeqrf( m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_dgeqrf_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dorgbr( 'Q', n, n, n, dum(1_${ik}$), n, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$,ierr )
                 lwork_dorgbr_q_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dorgqr( m, m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_dorgqr_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dorgqr( m, n, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_dorgqr_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dormbr( 'P', 'R', 'T', n, n, n, dum(1_${ik}$), n,dum(1_${ik}$), dum(1_${ik}$), n, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_dormbr_prt_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dormbr( 'Q', 'L', 'N', n, n, n, dum(1_${ik}$), n,dum(1_${ik}$), dum(1_${ik}$), n, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_dormbr_qln_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, n, n, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_dormbr_qln_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, m, n, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_dormbr_qln_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 if( m>=mnthr ) then
                    if( wntqn ) then
                       ! path 1 (m >> n, jobz='n')
                       wrkbl = n + lwork_dgeqrf_mn
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dgebrd_nn )
                       maxwrk = max( wrkbl, bdspac + n )
                       minwrk = bdspac + n
                    else if( wntqo ) then
                       ! path 2 (m >> n, jobz='o')
                       wrkbl = n + lwork_dgeqrf_mn
                       wrkbl = max( wrkbl,   n + lwork_dorgqr_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dgebrd_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_qln_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + 2_${ik}$*n*n
                       minwrk = bdspac + 2_${ik}$*n*n + 3_${ik}$*n
                    else if( wntqs ) then
                       ! path 3 (m >> n, jobz='s')
                       wrkbl = n + lwork_dgeqrf_mn
                       wrkbl = max( wrkbl,   n + lwork_dorgqr_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dgebrd_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_qln_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + n*n
                       minwrk = bdspac + n*n + 3_${ik}$*n
                    else if( wntqa ) then
                       ! path 4 (m >> n, jobz='a')
                       wrkbl = n + lwork_dgeqrf_mn
                       wrkbl = max( wrkbl,   n + lwork_dorgqr_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dgebrd_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_qln_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + n*n
                       minwrk = n*n + max( 3_${ik}$*n + bdspac, n + m )
                    end if
                 else
                    ! path 5 (m >= n, but not much larger)
                    wrkbl = 3_${ik}$*n + lwork_dgebrd_mn
                    if( wntqn ) then
                       ! path 5n (m >= n, jobz='n')
                       maxwrk = max( wrkbl, 3_${ik}$*n + bdspac )
                       minwrk = 3_${ik}$*n + max( m, bdspac )
                    else if( wntqo ) then
                       ! path 5o (m >= n, jobz='o')
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_qln_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + m*n
                       minwrk = 3_${ik}$*n + max( m, n*n + bdspac )
                    else if( wntqs ) then
                       ! path 5s (m >= n, jobz='s')
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_qln_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_prt_nn )
                       maxwrk = max( wrkbl, 3_${ik}$*n + bdspac )
                       minwrk = 3_${ik}$*n + max( m, bdspac )
                    else if( wntqa ) then
                       ! path 5a (m >= n, jobz='a')
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_dormbr_prt_nn )
                       maxwrk = max( wrkbl, 3_${ik}$*n + bdspac )
                       minwrk = 3_${ik}$*n + max( m, bdspac )
                    end if
                 end if
              else if( minmn>0_${ik}$ ) then
                 ! compute space needed for stdlib${ii}$_dbdsdc
                 if( wntqn ) then
                    ! stdlib${ii}$_dbdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_dp)
                    ! keep 7*n for backwards compatibility.
                    bdspac = 7_${ik}$*m
                 else
                    bdspac = 3_${ik}$*m*m + 4_${ik}$*m
                 end if
                 ! compute space preferred for each routine
                 call stdlib${ii}$_dgebrd( m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, &
                           ierr )
                 lwork_dgebrd_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dgebrd( m, m, a, m, s, dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                           
                 lwork_dgebrd_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dgelqf( m, n, a, m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_dgelqf_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dorglq( n, n, m, dum(1_${ik}$), n, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_dorglq_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dorglq( m, n, m, a, m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_dorglq_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dorgbr( 'P', m, m, m, a, n, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_dorgbr_p_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dormbr( 'P', 'R', 'T', m, m, m, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_dormbr_prt_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dormbr( 'P', 'R', 'T', m, n, m, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_dormbr_prt_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dormbr( 'P', 'R', 'T', n, n, m, dum(1_${ik}$), n,dum(1_${ik}$), dum(1_${ik}$), n, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_dormbr_prt_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, m, m, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_dormbr_qln_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 if( n>=mnthr ) then
                    if( wntqn ) then
                       ! path 1t (n >> m, jobz='n')
                       wrkbl = m + lwork_dgelqf_mn
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dgebrd_mm )
                       maxwrk = max( wrkbl, bdspac + m )
                       minwrk = bdspac + m
                    else if( wntqo ) then
                       ! path 2t (n >> m, jobz='o')
                       wrkbl = m + lwork_dgelqf_mn
                       wrkbl = max( wrkbl,   m + lwork_dorglq_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dgebrd_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_prt_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + 2_${ik}$*m*m
                       minwrk = bdspac + 2_${ik}$*m*m + 3_${ik}$*m
                    else if( wntqs ) then
                       ! path 3t (n >> m, jobz='s')
                       wrkbl = m + lwork_dgelqf_mn
                       wrkbl = max( wrkbl,   m + lwork_dorglq_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dgebrd_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_prt_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + m*m
                       minwrk = bdspac + m*m + 3_${ik}$*m
                    else if( wntqa ) then
                       ! path 4t (n >> m, jobz='a')
                       wrkbl = m + lwork_dgelqf_mn
                       wrkbl = max( wrkbl,   m + lwork_dorglq_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dgebrd_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_prt_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + m*m
                       minwrk = m*m + max( 3_${ik}$*m + bdspac, m + n )
                    end if
                 else
                    ! path 5t (n > m, but not much larger)
                    wrkbl = 3_${ik}$*m + lwork_dgebrd_mn
                    if( wntqn ) then
                       ! path 5tn (n > m, jobz='n')
                       maxwrk = max( wrkbl, 3_${ik}$*m + bdspac )
                       minwrk = 3_${ik}$*m + max( n, bdspac )
                    else if( wntqo ) then
                       ! path 5to (n > m, jobz='o')
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_prt_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + m*n
                       minwrk = 3_${ik}$*m + max( n, m*m + bdspac )
                    else if( wntqs ) then
                       ! path 5ts (n > m, jobz='s')
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_prt_mn )
                       maxwrk = max( wrkbl, 3_${ik}$*m + bdspac )
                       minwrk = 3_${ik}$*m + max( n, bdspac )
                    else if( wntqa ) then
                       ! path 5ta (n > m, jobz='a')
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_dormbr_prt_nn )
                       maxwrk = max( wrkbl, 3_${ik}$*m + bdspac )
                       minwrk = 3_${ik}$*m + max( n, bdspac )
                    end if
                 end if
              end if
              maxwrk = max( maxwrk, minwrk )
              work( 1_${ik}$ ) = stdlib${ii}$_droundup_lwork( maxwrk )
              if( lwork<minwrk .and. .not.lquery ) then
                 info = -12_${ik}$
              end if
           end if
           if( info/=0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DGESDD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! quick return if possible
           if( m==0_${ik}$ .or. n==0_${ik}$ ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_dlamch( 'P' )
           smlnum = sqrt( stdlib${ii}$_dlamch( 'S' ) ) / eps
           bignum = one / smlnum
           ! scale a if max element outside range [smlnum,bignum]
           anrm = stdlib${ii}$_dlange( 'M', m, n, a, lda, dum )
           if( stdlib${ii}$_disnan( anrm ) ) then
               info = -4_${ik}$
               return
           end if
           iscl = 0_${ik}$
           if( anrm>zero .and. anrm<smlnum ) then
              iscl = 1_${ik}$
              call stdlib${ii}$_dlascl( 'G', 0_${ik}$, 0_${ik}$, anrm, smlnum, m, n, a, lda, ierr )
           else if( anrm>bignum ) then
              iscl = 1_${ik}$
              call stdlib${ii}$_dlascl( 'G', 0_${ik}$, 0_${ik}$, anrm, bignum, m, n, a, lda, ierr )
           end if
           if( m>=n ) then
              ! a has at least as many rows as columns. if a has sufficiently
              ! more rows than columns, first reduce using the qr
              ! decomposition (if sufficient workspace available)
              if( m>=mnthr ) then
                 if( wntqn ) then
                    ! path 1 (m >> n, jobz='n')
                    ! no singular vectors to be computed
                    itau = 1_${ik}$
                    nwork = itau + n
                    ! compute a=q*r
                    ! workspace: need   n [tau] + n    [work]
                    ! workspace: prefer n [tau] + n*nb [work]
                    call stdlib${ii}$_dgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! zero out below r
                    if (n>1_${ik}$) call stdlib${ii}$_dlaset( 'L', n-1, n-1, zero, zero, a( 2_${ik}$, 1_${ik}$ ), lda )
                    ie = 1_${ik}$
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in a
                    ! workspace: need   3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_dgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                              work( nwork ), lwork-nwork+1,ierr )
                    nwork = ie + n
                    ! perform bidiagonal svd, computing singular values only
                    ! workspace: need   n [e] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'N', n, s, work( ie ), dum, 1_${ik}$, dum, 1_${ik}$,dum, idum, &
                              work( nwork ), iwork, info )
                 else if( wntqo ) then
                    ! path 2 (m >> n, jobz = 'o')
                    ! n left singular vectors to be overwritten on a and
                    ! n right singular vectors to be computed in vt
                    ir = 1_${ik}$
                    ! work(ir) is ldwrkr by n
                    if( lwork >= lda*n + n*n + 3_${ik}$*n + bdspac ) then
                       ldwrkr = lda
                    else
                       ldwrkr = ( lwork - n*n - 3_${ik}$*n - bdspac ) / n
                    end if
                    itau = ir + ldwrkr*n
                    nwork = itau + n
                    ! compute a=q*r
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_dgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! copy r to work(ir), zeroing out below it
                    call stdlib${ii}$_dlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
                    call stdlib${ii}$_dlaset( 'L', n - 1_${ik}$, n - 1_${ik}$, zero, zero, work(ir+1),ldwrkr )
                    ! generate q in a
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_dorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ie = itau
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in work(ir)
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_dgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
                              work( itaup ), work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! work(iu) is n by n
                    iu = nwork
                    nwork = iu + n*n
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in work(iu) and computing right
                    ! singular vectors of bidiagonal matrix in vt
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
                              idum, work( nwork ), iwork,info )
                    ! overwrite work(iu) by left singular vectors of r
                    ! and vt by right singular vectors of r
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n    [work]
                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
                              work( iu ), n, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
                              vt, ldvt, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! multiply q in a by left singular vectors of r in
                    ! work(iu), storing result in work(ir) and copying to a
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u]
                    ! workspace: prefer m*n [r] + 3*n [e, tauq, taup] + n*n [u]
                    do i = 1, m, ldwrkr
                       chunk = min( m - i + 1_${ik}$, ldwrkr )
                       call stdlib${ii}$_dgemm( 'N', 'N', chunk, n, n, one, a( i, 1_${ik}$ ),lda, work( iu ), &
                                 n, zero, work( ir ),ldwrkr )
                       call stdlib${ii}$_dlacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1_${ik}$ ), lda )
                                 
                    end do
                 else if( wntqs ) then
                    ! path 3 (m >> n, jobz='s')
                    ! n left singular vectors to be computed in u and
                    ! n right singular vectors to be computed in vt
                    ir = 1_${ik}$
                    ! work(ir) is n by n
                    ldwrkr = n
                    itau = ir + ldwrkr*n
                    nwork = itau + n
                    ! compute a=q*r
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_dgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! copy r to work(ir), zeroing out below it
                    call stdlib${ii}$_dlacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
                    call stdlib${ii}$_dlaset( 'L', n - 1_${ik}$, n - 1_${ik}$, zero, zero, work(ir+1),ldwrkr )
                    ! generate q in a
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_dorgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ie = itau
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in work(ir)
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_dgebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
                              work( itaup ), work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagoal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! overwrite u by left singular vectors of r and vt
                    ! by right singular vectors of r
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n    [work]
                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + n*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', n, n, n, work( ir ), ldwrkr,work( itauq ), &
                              u, ldu, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', n, n, n, work( ir ), ldwrkr,work( itaup ), &
                              vt, ldvt, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! multiply q in a by left singular vectors of r in
                    ! work(ir), storing result in u
                    ! workspace: need   n*n [r]
                    call stdlib${ii}$_dlacpy( 'F', n, n, u, ldu, work( ir ), ldwrkr )
                    call stdlib${ii}$_dgemm( 'N', 'N', m, n, n, one, a, lda, work( ir ),ldwrkr, zero, u,&
                               ldu )
                 else if( wntqa ) then
                    ! path 4 (m >> n, jobz='a')
                    ! m left singular vectors to be computed in u and
                    ! n right singular vectors to be computed in vt
                    iu = 1_${ik}$
                    ! work(iu) is n by n
                    ldwrku = n
                    itau = iu + ldwrku*n
                    nwork = itau + n
                    ! compute a=q*r, copying result to u
                    ! workspace: need   n*n [u] + n [tau] + n    [work]
                    ! workspace: prefer n*n [u] + n [tau] + n*nb [work]
                    call stdlib${ii}$_dgeqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    call stdlib${ii}$_dlacpy( 'L', m, n, a, lda, u, ldu )
                    ! generate q in u
                    ! workspace: need   n*n [u] + n [tau] + m    [work]
                    ! workspace: prefer n*n [u] + n [tau] + m*nb [work]
                    call stdlib${ii}$_dorgqr( m, m, n, u, ldu, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ! produce r in a, zeroing out other entries
                    if (n>1_${ik}$) call stdlib${ii}$_dlaset( 'L', n-1, n-1, zero, zero, a( 2_${ik}$, 1_${ik}$ ), lda )
                    ie = itau
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in a
                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_dgebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                              work( nwork ), lwork-nwork+1,ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in work(iu) and computing right
                    ! singular vectors of bidiagonal matrix in vt
                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
                              idum, work( nwork ), iwork,info )
                    ! overwrite work(iu) by left singular vectors of r and vt
                    ! by right singular vectors of r
                    ! workspace: need   n*n [u] + 3*n [e, tauq, taup] + n    [work]
                    ! workspace: prefer n*n [u] + 3*n [e, tauq, taup] + n*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', n, n, n, a, lda,work( itauq ), work( iu ), &
                              ldwrku,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! multiply q in u by left singular vectors of r in
                    ! work(iu), storing result in a
                    ! workspace: need   n*n [u]
                    call stdlib${ii}$_dgemm( 'N', 'N', m, n, n, one, u, ldu, work( iu ),ldwrku, zero, a,&
                               lda )
                    ! copy left singular vectors of a from a to u
                    call stdlib${ii}$_dlacpy( 'F', m, n, a, lda, u, ldu )
                 end if
              else
                 ! m < mnthr
                 ! path 5 (m >= n, but not much larger)
                 ! reduce to bidiagonal form without qr decomposition
                 ie = 1_${ik}$
                 itauq = ie + n
                 itaup = itauq + n
                 nwork = itaup + n
                 ! bidiagonalize a
                 ! workspace: need   3*n [e, tauq, taup] + m        [work]
                 ! workspace: prefer 3*n [e, tauq, taup] + (m+n)*nb [work]
                 call stdlib${ii}$_dgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                           work( nwork ), lwork-nwork+1,ierr )
                 if( wntqn ) then
                    ! path 5n (m >= n, jobz='n')
                    ! perform bidiagonal svd, only computing singular values
                    ! workspace: need   3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'N', n, s, work( ie ), dum, 1_${ik}$, dum, 1_${ik}$,dum, idum, &
                              work( nwork ), iwork, info )
                 else if( wntqo ) then
                    ! path 5o (m >= n, jobz='o')
                    iu = nwork
                    if( lwork >= m*n + 3_${ik}$*n + bdspac ) then
                       ! work( iu ) is m by n
                       ldwrku = m
                       nwork = iu + ldwrku*n
                       call stdlib${ii}$_dlaset( 'F', m, n, zero, zero, work( iu ),ldwrku )
                       ! ir is unused; silence compile warnings
                       ir = -1_${ik}$
                    else
                       ! work( iu ) is n by n
                       ldwrku = n
                       nwork = iu + ldwrku*n
                       ! work(ir) is ldwrkr by n
                       ir = nwork
                       ldwrkr = ( lwork - n*n - 3_${ik}$*n ) / n
                    end if
                    nwork = iu + ldwrku*n
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in work(iu) and computing right
                    ! singular vectors of bidiagonal matrix in vt
                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'I', n, s, work( ie ), work( iu ),ldwrku, vt, ldvt, &
                              dum, idum, work( nwork ),iwork, info )
                    ! overwrite vt by right singular vectors of a
                    ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
                    ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    if( lwork >= m*n + 3_${ik}$*n + bdspac ) then
                       ! path 5o-fast
                       ! overwrite work(iu) by left singular vectors of a
                       ! workspace: need   3*n [e, tauq, taup] + m*n [u] + n    [work]
                       ! workspace: prefer 3*n [e, tauq, taup] + m*n [u] + n*nb [work]
                       call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), work( iu &
                                 ), ldwrku,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                       ! copy left singular vectors of a from work(iu) to a
                       call stdlib${ii}$_dlacpy( 'F', m, n, work( iu ), ldwrku, a, lda )
                    else
                       ! path 5o-slow
                       ! generate q in a
                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + n    [work]
                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + n*nb [work]
                       call stdlib${ii}$_dorgbr( 'Q', m, n, n, a, lda, work( itauq ),work( nwork ), &
                                 lwork - nwork + 1_${ik}$, ierr )
                       ! multiply q in a by left singular vectors of
                       ! bidiagonal matrix in work(iu), storing result in
                       ! work(ir) and copying to a
                       ! workspace: need   3*n [e, tauq, taup] + n*n [u] + nb*n [r]
                       ! workspace: prefer 3*n [e, tauq, taup] + n*n [u] + m*n  [r]
                       do i = 1, m, ldwrkr
                          chunk = min( m - i + 1_${ik}$, ldwrkr )
                          call stdlib${ii}$_dgemm( 'N', 'N', chunk, n, n, one, a( i, 1_${ik}$ ),lda, work( iu )&
                                    , ldwrku, zero,work( ir ), ldwrkr )
                          call stdlib${ii}$_dlacpy( 'F', chunk, n, work( ir ), ldwrkr,a( i, 1_${ik}$ ), lda )
                                    
                       end do
                    end if
                 else if( wntqs ) then
                    ! path 5s (m >= n, jobz='s')
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dlaset( 'F', m, n, zero, zero, u, ldu )
                    call stdlib${ii}$_dbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! overwrite u by left singular vectors of a and vt
                    ! by right singular vectors of a
                    ! workspace: need   3*n [e, tauq, taup] + n    [work]
                    ! workspace: prefer 3*n [e, tauq, taup] + n*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, n, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', n, n, n, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                 else if( wntqa ) then
                    ! path 5a (m >= n, jobz='a')
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   3*n [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dlaset( 'F', m, m, zero, zero, u, ldu )
                    call stdlib${ii}$_dbdsdc( 'U', 'I', n, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! set the right corner of u to identity matrix
                    if( m>n ) then
                       call stdlib${ii}$_dlaset( 'F', m - n, m - n, zero, one, u(n+1,n+1),ldu )
                    end if
                    ! overwrite u by left singular vectors of a and vt
                    ! by right singular vectors of a
                    ! workspace: need   3*n [e, tauq, taup] + m    [work]
                    ! workspace: prefer 3*n [e, tauq, taup] + m*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                 end if
              end if
           else
              ! a has more columns than rows. if a has sufficiently more
              ! columns than rows, first reduce using the lq decomposition (if
              ! sufficient workspace available)
              if( n>=mnthr ) then
                 if( wntqn ) then
                    ! path 1t (n >> m, jobz='n')
                    ! no singular vectors to be computed
                    itau = 1_${ik}$
                    nwork = itau + m
                    ! compute a=l*q
                    ! workspace: need   m [tau] + m [work]
                    ! workspace: prefer m [tau] + m*nb [work]
                    call stdlib${ii}$_dgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! zero out above l
                    if (m>1_${ik}$) call stdlib${ii}$_dlaset( 'U', m-1, m-1, zero, zero, a( 1_${ik}$, 2_${ik}$ ), lda )
                    ie = 1_${ik}$
                    itauq = ie + m
                    itaup = itauq + m
                    nwork = itaup + m
                    ! bidiagonalize l in a
                    ! workspace: need   3*m [e, tauq, taup] + m      [work]
                    ! workspace: prefer 3*m [e, tauq, taup] + 2*m*nb [work]
                    call stdlib${ii}$_dgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                              work( nwork ), lwork-nwork+1,ierr )
                    nwork = ie + m
                    ! perform bidiagonal svd, computing singular values only
                    ! workspace: need   m [e] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'N', m, s, work( ie ), dum, 1_${ik}$, dum, 1_${ik}$,dum, idum, &
                              work( nwork ), iwork, info )
                 else if( wntqo ) then
                    ! path 2t (n >> m, jobz='o')
                    ! m right singular vectors to be overwritten on a and
                    ! m left singular vectors to be computed in u
                    ivt = 1_${ik}$
                    ! work(ivt) is m by m
                    ! work(il)  is m by m; it is later resized to m by chunk for gemm
                    il = ivt + m*m
                    if( lwork >= m*n + m*m + 3_${ik}$*m + bdspac ) then
                       ldwrkl = m
                       chunk = n
                    else
                       ldwrkl = m
                       chunk = ( lwork - m*m ) / m
                    end if
                    itau = il + ldwrkl*m
                    nwork = itau + m
                    ! compute a=l*q
                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
                    call stdlib${ii}$_dgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! copy l to work(il), zeroing about above it
                    call stdlib${ii}$_dlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
                    call stdlib${ii}$_dlaset( 'U', m - 1_${ik}$, m - 1_${ik}$, zero, zero,work( il + ldwrkl ), ldwrkl &
                              )
                    ! generate q in a
                    ! workspace: need   m*m [vt] + m*m [l] + m [tau] + m    [work]
                    ! workspace: prefer m*m [vt] + m*m [l] + m [tau] + m*nb [work]
                    call stdlib${ii}$_dorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ie = itau
                    itauq = ie + m
                    itaup = itauq + m
                    nwork = itaup + m
                    ! bidiagonalize l in work(il)
                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m      [work]
                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
                    call stdlib${ii}$_dgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
                              work( itaup ), work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u, and computing right singular
                    ! vectors of bidiagonal matrix in work(ivt)
                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), m, dum, &
                              idum, work( nwork ),iwork, info )
                    ! overwrite u by left singular vectors of l and work(ivt)
                    ! by right singular vectors of l
                    ! workspace: need   m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m    [work]
                    ! workspace: prefer m*m [vt] + m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
                              u, ldu, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
                              work( ivt ), m,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                    ! multiply right singular vectors of l in work(ivt) by q
                    ! in a, storing result in work(il) and copying to a
                    ! workspace: need   m*m [vt] + m*m [l]
                    ! workspace: prefer m*m [vt] + m*n [l]
                    ! at this point, l is resized as m by chunk.
                    do i = 1, n, chunk
                       blk = min( n - i + 1_${ik}$, chunk )
                       call stdlib${ii}$_dgemm( 'N', 'N', m, blk, m, one, work( ivt ), m,a( 1_${ik}$, i ), lda,&
                                  zero, work( il ), ldwrkl )
                       call stdlib${ii}$_dlacpy( 'F', m, blk, work( il ), ldwrkl,a( 1_${ik}$, i ), lda )
                                 
                    end do
                 else if( wntqs ) then
                    ! path 3t (n >> m, jobz='s')
                    ! m right singular vectors to be computed in vt and
                    ! m left singular vectors to be computed in u
                    il = 1_${ik}$
                    ! work(il) is m by m
                    ldwrkl = m
                    itau = il + ldwrkl*m
                    nwork = itau + m
                    ! compute a=l*q
                    ! workspace: need   m*m [l] + m [tau] + m    [work]
                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
                    call stdlib${ii}$_dgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! copy l to work(il), zeroing out above it
                    call stdlib${ii}$_dlacpy( 'L', m, m, a, lda, work( il ), ldwrkl )
                    call stdlib${ii}$_dlaset( 'U', m - 1_${ik}$, m - 1_${ik}$, zero, zero,work( il + ldwrkl ), ldwrkl &
                              )
                    ! generate q in a
                    ! workspace: need   m*m [l] + m [tau] + m    [work]
                    ! workspace: prefer m*m [l] + m [tau] + m*nb [work]
                    call stdlib${ii}$_dorglq( m, n, m, a, lda, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ie = itau
                    itauq = ie + m
                    itaup = itauq + m
                    nwork = itaup + m
                    ! bidiagonalize l in work(iu).
                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m      [work]
                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + 2*m*nb [work]
                    call stdlib${ii}$_dgebrd( m, m, work( il ), ldwrkl, s, work( ie ),work( itauq ), &
                              work( itaup ), work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! overwrite u by left singular vectors of l and vt
                    ! by right singular vectors of l
                    ! workspace: need   m*m [l] + 3*m [e, tauq, taup] + m    [work]
                    ! workspace: prefer m*m [l] + 3*m [e, tauq, taup] + m*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, m, m, work( il ), ldwrkl,work( itauq ), &
                              u, ldu, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', m, m, m, work( il ), ldwrkl,work( itaup ), &
                              vt, ldvt, work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! multiply right singular vectors of l in work(il) by
                    ! q in a, storing result in vt
                    ! workspace: need   m*m [l]
                    call stdlib${ii}$_dlacpy( 'F', m, m, vt, ldvt, work( il ), ldwrkl )
                    call stdlib${ii}$_dgemm( 'N', 'N', m, n, m, one, work( il ), ldwrkl,a, lda, zero, &
                              vt, ldvt )
                 else if( wntqa ) then
                    ! path 4t (n >> m, jobz='a')
                    ! n right singular vectors to be computed in vt and
                    ! m left singular vectors to be computed in u
                    ivt = 1_${ik}$
                    ! work(ivt) is m by m
                    ldwkvt = m
                    itau = ivt + ldwkvt*m
                    nwork = itau + m
                    ! compute a=l*q, copying result to vt
                    ! workspace: need   m*m [vt] + m [tau] + m    [work]
                    ! workspace: prefer m*m [vt] + m [tau] + m*nb [work]
                    call stdlib${ii}$_dgelqf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    call stdlib${ii}$_dlacpy( 'U', m, n, a, lda, vt, ldvt )
                    ! generate q in vt
                    ! workspace: need   m*m [vt] + m [tau] + n    [work]
                    ! workspace: prefer m*m [vt] + m [tau] + n*nb [work]
                    call stdlib${ii}$_dorglq( n, n, m, vt, ldvt, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ! produce l in a, zeroing out other entries
                    if (m>1_${ik}$) call stdlib${ii}$_dlaset( 'U', m-1, m-1, zero, zero, a( 1_${ik}$, 2_${ik}$ ), lda )
                    ie = itau
                    itauq = ie + m
                    itaup = itauq + m
                    nwork = itaup + m
                    ! bidiagonalize l in a
                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + m      [work]
                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup] + 2*m*nb [work]
                    call stdlib${ii}$_dgebrd( m, m, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                              work( nwork ), lwork-nwork+1,ierr )
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in work(ivt)
                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dbdsdc( 'U', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
                              dum, idum,work( nwork ), iwork, info )
                    ! overwrite u by left singular vectors of l and work(ivt)
                    ! by right singular vectors of l
                    ! workspace: need   m*m [vt] + 3*m [e, tauq, taup]+ m    [work]
                    ! workspace: prefer m*m [vt] + 3*m [e, tauq, taup]+ m*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, m, m, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', m, m, m, a, lda,work( itaup ), work( ivt ),&
                               ldwkvt,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                    ! multiply right singular vectors of l in work(ivt) by
                    ! q in vt, storing result in a
                    ! workspace: need   m*m [vt]
                    call stdlib${ii}$_dgemm( 'N', 'N', m, n, m, one, work( ivt ), ldwkvt,vt, ldvt, zero,&
                               a, lda )
                    ! copy right singular vectors of a from a to vt
                    call stdlib${ii}$_dlacpy( 'F', m, n, a, lda, vt, ldvt )
                 end if
              else
                 ! n < mnthr
                 ! path 5t (n > m, but not much larger)
                 ! reduce to bidiagonal form without lq decomposition
                 ie = 1_${ik}$
                 itauq = ie + m
                 itaup = itauq + m
                 nwork = itaup + m
                 ! bidiagonalize a
                 ! workspace: need   3*m [e, tauq, taup] + n        [work]
                 ! workspace: prefer 3*m [e, tauq, taup] + (m+n)*nb [work]
                 call stdlib${ii}$_dgebrd( m, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                           work( nwork ), lwork-nwork+1,ierr )
                 if( wntqn ) then
                    ! path 5tn (n > m, jobz='n')
                    ! perform bidiagonal svd, only computing singular values
                    ! workspace: need   3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dbdsdc( 'L', 'N', m, s, work( ie ), dum, 1_${ik}$, dum, 1_${ik}$,dum, idum, &
                              work( nwork ), iwork, info )
                 else if( wntqo ) then
                    ! path 5to (n > m, jobz='o')
                    ldwkvt = m
                    ivt = nwork
                    if( lwork >= m*n + 3_${ik}$*m + bdspac ) then
                       ! work( ivt ) is m by n
                       call stdlib${ii}$_dlaset( 'F', m, n, zero, zero, work( ivt ),ldwkvt )
                       nwork = ivt + ldwkvt*n
                       ! il is unused; silence compile warnings
                       il = -1_${ik}$
                    else
                       ! work( ivt ) is m by m
                       nwork = ivt + ldwkvt*m
                       il = nwork
                       ! work(il) is m by chunk
                       chunk = ( lwork - m*m - 3_${ik}$*m ) / m
                    end if
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in work(ivt)
                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + bdspac
                    call stdlib${ii}$_dbdsdc( 'L', 'I', m, s, work( ie ), u, ldu,work( ivt ), ldwkvt, &
                              dum, idum,work( nwork ), iwork, info )
                    ! overwrite u by left singular vectors of a
                    ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
                    ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    if( lwork >= m*n + 3_${ik}$*m + bdspac ) then
                       ! path 5to-fast
                       ! overwrite work(ivt) by left singular vectors of a
                       ! workspace: need   3*m [e, tauq, taup] + m*n [vt] + m    [work]
                       ! workspace: prefer 3*m [e, tauq, taup] + m*n [vt] + m*nb [work]
                       call stdlib${ii}$_dormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), work( &
                                 ivt ), ldwkvt,work( nwork ), lwork - nwork + 1_${ik}$, ierr )
                       ! copy right singular vectors of a from work(ivt) to a
                       call stdlib${ii}$_dlacpy( 'F', m, n, work( ivt ), ldwkvt, a, lda )
                    else
                       ! path 5to-slow
                       ! generate p**t in a
                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m    [work]
                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*nb [work]
                       call stdlib${ii}$_dorgbr( 'P', m, n, m, a, lda, work( itaup ),work( nwork ), &
                                 lwork - nwork + 1_${ik}$, ierr )
                       ! multiply q in a by right singular vectors of
                       ! bidiagonal matrix in work(ivt), storing result in
                       ! work(il) and copying to a
                       ! workspace: need   3*m [e, tauq, taup] + m*m [vt] + m*nb [l]
                       ! workspace: prefer 3*m [e, tauq, taup] + m*m [vt] + m*n  [l]
                       do i = 1, n, chunk
                          blk = min( n - i + 1_${ik}$, chunk )
                          call stdlib${ii}$_dgemm( 'N', 'N', m, blk, m, one, work( ivt ),ldwkvt, a( 1_${ik}$, &
                                    i ), lda, zero,work( il ), m )
                          call stdlib${ii}$_dlacpy( 'F', m, blk, work( il ), m, a( 1_${ik}$, i ),lda )
                       end do
                    end if
                 else if( wntqs ) then
                    ! path 5ts (n > m, jobz='s')
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dlaset( 'F', m, n, zero, zero, vt, ldvt )
                    call stdlib${ii}$_dbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! overwrite u by left singular vectors of a and vt
                    ! by right singular vectors of a
                    ! workspace: need   3*m [e, tauq, taup] + m    [work]
                    ! workspace: prefer 3*m [e, tauq, taup] + m*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', m, n, m, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                 else if( wntqa ) then
                    ! path 5ta (n > m, jobz='a')
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in u and computing right singular
                    ! vectors of bidiagonal matrix in vt
                    ! workspace: need   3*m [e, tauq, taup] + bdspac
                    call stdlib${ii}$_dlaset( 'F', n, n, zero, zero, vt, ldvt )
                    call stdlib${ii}$_dbdsdc( 'L', 'I', m, s, work( ie ), u, ldu, vt,ldvt, dum, idum, &
                              work( nwork ), iwork,info )
                    ! set the right corner of vt to identity matrix
                    if( n>m ) then
                       call stdlib${ii}$_dlaset( 'F', n-m, n-m, zero, one, vt(m+1,m+1),ldvt )
                    end if
                    ! overwrite u by left singular vectors of a and vt
                    ! by right singular vectors of a
                    ! workspace: need   3*m [e, tauq, taup] + n    [work]
                    ! workspace: prefer 3*m [e, tauq, taup] + n*nb [work]
                    call stdlib${ii}$_dormbr( 'Q', 'L', 'N', m, m, n, a, lda,work( itauq ), u, ldu, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    call stdlib${ii}$_dormbr( 'P', 'R', 'T', n, n, m, a, lda,work( itaup ), vt, ldvt, &
                              work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                 end if
              end if
           end if
           ! undo scaling if necessary
           if( iscl==1_${ik}$ ) then
              if( anrm>bignum )call stdlib${ii}$_dlascl( 'G', 0_${ik}$, 0_${ik}$, bignum, anrm, minmn, 1_${ik}$, s, minmn,&
                        ierr )
              if( anrm<smlnum )call stdlib${ii}$_dlascl( 'G', 0_${ik}$, 0_${ik}$, smlnum, anrm, minmn, 1_${ik}$, s, minmn,&
                        ierr )
           end if
           ! return optimal workspace in work(1)
           work( 1_${ik}$ ) = stdlib${ii}$_droundup_lwork( maxwrk )
           return
     end subroutine stdlib${ii}$_dgesdd

#:for rk,rt,ri in REAL_KINDS_TYPES
#:if not rk in ["sp","dp"]
     module subroutine stdlib${ii}$_${ri}$gesdd( jobz, m, n, a, lda, s, u, ldu, vt, ldvt,work, lwork, iwork, info )
     !! DGESDD: computes the singular value decomposition (SVD) of a real
     !! M-by-N matrix A, optionally computing the left and right singular
     !! vectors.  If singular vectors are desired, it uses a
     !! divide-and-conquer algorithm.
     !! The SVD is written
     !! A = U * SIGMA * transpose(V)
     !! where SIGMA is an M-by-N matrix which is zero except for its
     !! min(m,n) diagonal elements, U is an M-by-M orthogonal matrix, and
     !! V is an N-by-N orthogonal matrix.  The diagonal elements of SIGMA
     !! are the singular values of A; they are real and non-negative, and
     !! are returned in descending order.  The first min(m,n) columns of
     !! U and V are the left and right singular vectors of A.
     !! Note that the routine returns VT = V**T, not V.
     !! The divide and conquer algorithm makes very mild assumptions about
     !! floating point arithmetic. It will work on machines with a guard
     !! digit in add/subtract, or on those binary machines without guard
     !! digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
     !! Cray-2. It could conceivably fail on hexadecimal or decimal machines
     !! without guard digits, but we know of none.
               
        ! -- lapack driver routine --
        ! -- lapack is a software package provided by univ. of tennessee,    --
        ! -- univ. of california berkeley, univ. of colorado denver and nag ltd..--
           use stdlib_blas_constants_${rk}$, only: negone, zero, half, one, two, three, four, eight, ten, czero, chalf, cone, cnegone
           ! Scalar Arguments 
           character, intent(in) :: jobz
           integer(${ik}$), intent(out) :: info
           integer(${ik}$), intent(in) :: lda, ldu, ldvt, lwork, m, n
           ! Array Arguments 
           integer(${ik}$), intent(out) :: iwork(*)
           real(${rk}$), intent(inout) :: a(lda,*)
           real(${rk}$), intent(out) :: s(*), u(ldu,*), vt(ldvt,*), work(*)
        ! =====================================================================
           
           ! Local Scalars 
           logical(lk) :: lquery, wntqa, wntqas, wntqn, wntqo, wntqs
           integer(${ik}$) :: bdspac, blk, chunk, i, ie, ierr, il, ir, iscl, itau, itaup, itauq, iu, &
                     ivt, ldwkvt, ldwrkl, ldwrkr, ldwrku, maxwrk, minmn, minwrk, mnthr, nwork, wrkbl
           integer(${ik}$) :: lwork_qgebrd_mn, lwork_qgebrd_mm, lwork_qgebrd_nn, lwork_qgelqf_mn, &
           lwork_qgeqrf_mn, lwork_qorgbr_p_mm, lwork_qorgbr_q_nn, lwork_qorglq_mn, &
           lwork_qorglq_nn, lwork_qorgqr_mm, lwork_qorgqr_mn, lwork_qormbr_prt_mm, &
           lwork_qormbr_qln_mm, lwork_qormbr_prt_mn, lwork_qormbr_qln_mn, lwork_qormbr_prt_nn, &
                     lwork_qormbr_qln_nn
           real(${rk}$) :: anrm, bignum, eps, smlnum
           ! Local Arrays 
           integer(${ik}$) :: idum(1_${ik}$)
           real(${rk}$) :: dum(1_${ik}$)
           ! Intrinsic Functions 
           ! Executable Statements 
           ! test the input arguments
           info   = 0_${ik}$
           minmn  = min( m, n )
           wntqa  = stdlib_lsame( jobz, 'A' )
           wntqs  = stdlib_lsame( jobz, 'S' )
           wntqas = wntqa .or. wntqs
           wntqo  = stdlib_lsame( jobz, 'O' )
           wntqn  = stdlib_lsame( jobz, 'N' )
           lquery = ( lwork==-1_${ik}$ )
           if( .not.( wntqa .or. wntqs .or. wntqo .or. wntqn ) ) then
              info = -1_${ik}$
           else if( m<0_${ik}$ ) then
              info = -2_${ik}$
           else if( n<0_${ik}$ ) then
              info = -3_${ik}$
           else if( lda<max( 1_${ik}$, m ) ) then
              info = -5_${ik}$
           else if( ldu<1_${ik}$ .or. ( wntqas .and. ldu<m ) .or.( wntqo .and. m<n .and. ldu<m ) ) &
                     then
              info = -8_${ik}$
           else if( ldvt<1_${ik}$ .or. ( wntqa .and. ldvt<n ) .or.( wntqs .and. ldvt<minmn ) .or.( wntqo &
                     .and. m>=n .and. ldvt<n ) ) then
              info = -10_${ik}$
           end if
           ! compute workspace
             ! note: comments in the code beginning "workspace:" describe the
             ! minimal amount of workspace allocated at that point in the code,
             ! as well as the preferred amount for good performance.
             ! nb refers to the optimal block size for the immediately
             ! following subroutine, as returned by stdlib${ii}$_ilaenv.
           if( info==0_${ik}$ ) then
              minwrk = 1_${ik}$
              maxwrk = 1_${ik}$
              bdspac = 0_${ik}$
              mnthr  = int( minmn*11.0_${rk}$ / 6.0_${rk}$,KIND=${ik}$)
              if( m>=n .and. minmn>0_${ik}$ ) then
                 ! compute space needed for stdlib${ii}$_${ri}$bdsdc
                 if( wntqn ) then
                    ! stdlib${ii}$_${ri}$bdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_${rk}$)
                    ! keep 7*n for backwards compatibility.
                    bdspac = 7_${ik}$*n
                 else
                    bdspac = 3_${ik}$*n*n + 4_${ik}$*n
                 end if
                 ! compute space preferred for each routine
                 call stdlib${ii}$_${ri}$gebrd( m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, &
                           ierr )
                 lwork_qgebrd_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$gebrd( n, n, dum(1_${ik}$), n, dum(1_${ik}$), dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, &
                           ierr )
                 lwork_qgebrd_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$geqrf( m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_qgeqrf_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$orgbr( 'Q', n, n, n, dum(1_${ik}$), n, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$,ierr )
                 lwork_qorgbr_q_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$orgqr( m, m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_qorgqr_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$orgqr( m, n, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_qorgqr_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$ormbr( 'P', 'R', 'T', n, n, n, dum(1_${ik}$), n,dum(1_${ik}$), dum(1_${ik}$), n, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_qormbr_prt_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$ormbr( 'Q', 'L', 'N', n, n, n, dum(1_${ik}$), n,dum(1_${ik}$), dum(1_${ik}$), n, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_qormbr_qln_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$ormbr( 'Q', 'L', 'N', m, n, n, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_qormbr_qln_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$ormbr( 'Q', 'L', 'N', m, m, n, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_qormbr_qln_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 if( m>=mnthr ) then
                    if( wntqn ) then
                       ! path 1 (m >> n, jobz='n')
                       wrkbl = n + lwork_qgeqrf_mn
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qgebrd_nn )
                       maxwrk = max( wrkbl, bdspac + n )
                       minwrk = bdspac + n
                    else if( wntqo ) then
                       ! path 2 (m >> n, jobz='o')
                       wrkbl = n + lwork_qgeqrf_mn
                       wrkbl = max( wrkbl,   n + lwork_qorgqr_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qgebrd_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_qln_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + 2_${ik}$*n*n
                       minwrk = bdspac + 2_${ik}$*n*n + 3_${ik}$*n
                    else if( wntqs ) then
                       ! path 3 (m >> n, jobz='s')
                       wrkbl = n + lwork_qgeqrf_mn
                       wrkbl = max( wrkbl,   n + lwork_qorgqr_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qgebrd_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_qln_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + n*n
                       minwrk = bdspac + n*n + 3_${ik}$*n
                    else if( wntqa ) then
                       ! path 4 (m >> n, jobz='a')
                       wrkbl = n + lwork_qgeqrf_mn
                       wrkbl = max( wrkbl,   n + lwork_qorgqr_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qgebrd_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_qln_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + n*n
                       minwrk = n*n + max( 3_${ik}$*n + bdspac, n + m )
                    end if
                 else
                    ! path 5 (m >= n, but not much larger)
                    wrkbl = 3_${ik}$*n + lwork_qgebrd_mn
                    if( wntqn ) then
                       ! path 5n (m >= n, jobz='n')
                       maxwrk = max( wrkbl, 3_${ik}$*n + bdspac )
                       minwrk = 3_${ik}$*n + max( m, bdspac )
                    else if( wntqo ) then
                       ! path 5o (m >= n, jobz='o')
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_prt_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_qln_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + bdspac )
                       maxwrk = wrkbl + m*n
                       minwrk = 3_${ik}$*n + max( m, n*n + bdspac )
                    else if( wntqs ) then
                       ! path 5s (m >= n, jobz='s')
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_qln_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_prt_nn )
                       maxwrk = max( wrkbl, 3_${ik}$*n + bdspac )
                       minwrk = 3_${ik}$*n + max( m, bdspac )
                    else if( wntqa ) then
                       ! path 5a (m >= n, jobz='a')
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*n + lwork_qormbr_prt_nn )
                       maxwrk = max( wrkbl, 3_${ik}$*n + bdspac )
                       minwrk = 3_${ik}$*n + max( m, bdspac )
                    end if
                 end if
              else if( minmn>0_${ik}$ ) then
                 ! compute space needed for stdlib${ii}$_${ri}$bdsdc
                 if( wntqn ) then
                    ! stdlib${ii}$_${ri}$bdsdc needs only 4*n (or 6*n for uplo=l for lapack <= 3.6_${rk}$)
                    ! keep 7*n for backwards compatibility.
                    bdspac = 7_${ik}$*m
                 else
                    bdspac = 3_${ik}$*m*m + 4_${ik}$*m
                 end if
                 ! compute space preferred for each routine
                 call stdlib${ii}$_${ri}$gebrd( m, n, dum(1_${ik}$), m, dum(1_${ik}$), dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, &
                           ierr )
                 lwork_qgebrd_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$gebrd( m, m, a, m, s, dum(1_${ik}$), dum(1_${ik}$),dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                           
                 lwork_qgebrd_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$gelqf( m, n, a, m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_qgelqf_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$orglq( n, n, m, dum(1_${ik}$), n, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_qorglq_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$orglq( m, n, m, a, m, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_qorglq_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$orgbr( 'P', m, m, m, a, n, dum(1_${ik}$), dum(1_${ik}$), -1_${ik}$, ierr )
                 lwork_qorgbr_p_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$ormbr( 'P', 'R', 'T', m, m, m, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_qormbr_prt_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$ormbr( 'P', 'R', 'T', m, n, m, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_qormbr_prt_mn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$ormbr( 'P', 'R', 'T', n, n, m, dum(1_${ik}$), n,dum(1_${ik}$), dum(1_${ik}$), n, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_qormbr_prt_nn = int( dum(1_${ik}$),KIND=${ik}$)
                 call stdlib${ii}$_${ri}$ormbr( 'Q', 'L', 'N', m, m, m, dum(1_${ik}$), m,dum(1_${ik}$), dum(1_${ik}$), m, dum(1_${ik}$), &
                           -1_${ik}$, ierr )
                 lwork_qormbr_qln_mm = int( dum(1_${ik}$),KIND=${ik}$)
                 if( n>=mnthr ) then
                    if( wntqn ) then
                       ! path 1t (n >> m, jobz='n')
                       wrkbl = m + lwork_qgelqf_mn
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qgebrd_mm )
                       maxwrk = max( wrkbl, bdspac + m )
                       minwrk = bdspac + m
                    else if( wntqo ) then
                       ! path 2t (n >> m, jobz='o')
                       wrkbl = m + lwork_qgelqf_mn
                       wrkbl = max( wrkbl,   m + lwork_qorglq_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qgebrd_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_prt_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + 2_${ik}$*m*m
                       minwrk = bdspac + 2_${ik}$*m*m + 3_${ik}$*m
                    else if( wntqs ) then
                       ! path 3t (n >> m, jobz='s')
                       wrkbl = m + lwork_qgelqf_mn
                       wrkbl = max( wrkbl,   m + lwork_qorglq_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qgebrd_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_prt_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + m*m
                       minwrk = bdspac + m*m + 3_${ik}$*m
                    else if( wntqa ) then
                       ! path 4t (n >> m, jobz='a')
                       wrkbl = m + lwork_qgelqf_mn
                       wrkbl = max( wrkbl,   m + lwork_qorglq_nn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qgebrd_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_prt_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + m*m
                       minwrk = m*m + max( 3_${ik}$*m + bdspac, m + n )
                    end if
                 else
                    ! path 5t (n > m, but not much larger)
                    wrkbl = 3_${ik}$*m + lwork_qgebrd_mn
                    if( wntqn ) then
                       ! path 5tn (n > m, jobz='n')
                       maxwrk = max( wrkbl, 3_${ik}$*m + bdspac )
                       minwrk = 3_${ik}$*m + max( n, bdspac )
                    else if( wntqo ) then
                       ! path 5to (n > m, jobz='o')
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_prt_mn )
                       wrkbl = max( wrkbl, 3_${ik}$*m + bdspac )
                       maxwrk = wrkbl + m*n
                       minwrk = 3_${ik}$*m + max( n, m*m + bdspac )
                    else if( wntqs ) then
                       ! path 5ts (n > m, jobz='s')
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_prt_mn )
                       maxwrk = max( wrkbl, 3_${ik}$*m + bdspac )
                       minwrk = 3_${ik}$*m + max( n, bdspac )
                    else if( wntqa ) then
                       ! path 5ta (n > m, jobz='a')
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_qln_mm )
                       wrkbl = max( wrkbl, 3_${ik}$*m + lwork_qormbr_prt_nn )
                       maxwrk = max( wrkbl, 3_${ik}$*m + bdspac )
                       minwrk = 3_${ik}$*m + max( n, bdspac )
                    end if
                 end if
              end if
              maxwrk = max( maxwrk, minwrk )
              work( 1_${ik}$ ) = stdlib${ii}$_${ri}$roundup_lwork( maxwrk )
              if( lwork<minwrk .and. .not.lquery ) then
                 info = -12_${ik}$
              end if
           end if
           if( info/=0_${ik}$ ) then
              call stdlib${ii}$_xerbla( 'DGESDD', -info )
              return
           else if( lquery ) then
              return
           end if
           ! quick return if possible
           if( m==0_${ik}$ .or. n==0_${ik}$ ) then
              return
           end if
           ! get machine constants
           eps = stdlib${ii}$_${ri}$lamch( 'P' )
           smlnum = sqrt( stdlib${ii}$_${ri}$lamch( 'S' ) ) / eps
           bignum = one / smlnum
           ! scale a if max element outside range [smlnum,bignum]
           anrm = stdlib${ii}$_${ri}$lange( 'M', m, n, a, lda, dum )
           if( stdlib${ii}$_${ri}$isnan( anrm ) ) then
               info = -4_${ik}$
               return
           end if
           iscl = 0_${ik}$
           if( anrm>zero .and. anrm<smlnum ) then
              iscl = 1_${ik}$
              call stdlib${ii}$_${ri}$lascl( 'G', 0_${ik}$, 0_${ik}$, anrm, smlnum, m, n, a, lda, ierr )
           else if( anrm>bignum ) then
              iscl = 1_${ik}$
              call stdlib${ii}$_${ri}$lascl( 'G', 0_${ik}$, 0_${ik}$, anrm, bignum, m, n, a, lda, ierr )
           end if
           if( m>=n ) then
              ! a has at least as many rows as columns. if a has sufficiently
              ! more rows than columns, first reduce using the qr
              ! decomposition (if sufficient workspace available)
              if( m>=mnthr ) then
                 if( wntqn ) then
                    ! path 1 (m >> n, jobz='n')
                    ! no singular vectors to be computed
                    itau = 1_${ik}$
                    nwork = itau + n
                    ! compute a=q*r
                    ! workspace: need   n [tau] + n    [work]
                    ! workspace: prefer n [tau] + n*nb [work]
                    call stdlib${ii}$_${ri}$geqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! zero out below r
                    if (n>1_${ik}$) call stdlib${ii}$_${ri}$laset( 'L', n-1, n-1, zero, zero, a( 2_${ik}$, 1_${ik}$ ), lda )
                    ie = 1_${ik}$
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in a
                    ! workspace: need   3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_${ri}$gebrd( n, n, a, lda, s, work( ie ), work( itauq ),work( itaup ), &
                              work( nwork ), lwork-nwork+1,ierr )
                    nwork = ie + n
                    ! perform bidiagonal svd, computing singular values only
                    ! workspace: need   n [e] + bdspac
                    call stdlib${ii}$_${ri}$bdsdc( 'U', 'N', n, s, work( ie ), dum, 1_${ik}$, dum, 1_${ik}$,dum, idum, &
                              work( nwork ), iwork, info )
                 else if( wntqo ) then
                    ! path 2 (m >> n, jobz = 'o')
                    ! n left singular vectors to be overwritten on a and
                    ! n right singular vectors to be computed in vt
                    ir = 1_${ik}$
                    ! work(ir) is ldwrkr by n
                    if( lwork >= lda*n + n*n + 3_${ik}$*n + bdspac ) then
                       ldwrkr = lda
                    else
                       ldwrkr = ( lwork - n*n - 3_${ik}$*n - bdspac ) / n
                    end if
                    itau = ir + ldwrkr*n
                    nwork = itau + n
                    ! compute a=q*r
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_${ri}$geqrf( m, n, a, lda, work( itau ), work( nwork ),lwork - nwork + &
                              1_${ik}$, ierr )
                    ! copy r to work(ir), zeroing out below it
                    call stdlib${ii}$_${ri}$lacpy( 'U', n, n, a, lda, work( ir ), ldwrkr )
                    call stdlib${ii}$_${ri}$laset( 'L', n - 1_${ik}$, n - 1_${ik}$, zero, zero, work(ir+1),ldwrkr )
                    ! generate q in a
                    ! workspace: need   n*n [r] + n [tau] + n    [work]
                    ! workspace: prefer n*n [r] + n [tau] + n*nb [work]
                    call stdlib${ii}$_${ri}$orgqr( m, n, n, a, lda, work( itau ),work( nwork ), lwork - &
                              nwork + 1_${ik}$, ierr )
                    ie = itau
                    itauq = ie + n
                    itaup = itauq + n
                    nwork = itaup + n
                    ! bidiagonalize r in work(ir)
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n      [work]
                    ! workspace: prefer n*n [r] + 3*n [e, tauq, taup] + 2*n*nb [work]
                    call stdlib${ii}$_${ri}$gebrd( n, n, work( ir ), ldwrkr, s, work( ie ),work( itauq ), &
                              work( itaup ), work( nwork ),lwork - nwork + 1_${ik}$, ierr )
                    ! work(iu) is n by n
                    iu = nwork
                    nwork = iu + n*n
                    ! perform bidiagonal svd, computing left singular vectors
                    ! of bidiagonal matrix in work(iu) and computing right
                    ! singular vectors of bidiagonal matrix in vt
                    ! workspace: need   n*n [r] + 3*n [e, tauq, taup] + n*n [u] + bdspac
                    call stdlib${ii}$_${ri}$bdsdc( 'U', 'I', n, s, work( ie ), work( iu ), n,vt, ldvt, dum, &
                              idum, work( nwork ), iwork,info )
                    ! overwrite work(iu) by left singular vectors of r
                    ! a